Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
probml
GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/tutorials/pyro_intro.ipynb
1192 views
Kernel: Python 3

Open In Colab

Pyro is a probabilistic programming system built on top of PyTorch. It supports posterior inference based on MCMC and stochastic variational inference; discrete latent variables can be marginalized out exactly using dynamic programmming.

!pip install pyro-ppl
Collecting pyro-ppl Downloading https://files.pythonhosted.org/packages/aa/7a/fbab572fd385154a0c07b0fa138683aa52e14603bb83d37b198e5f9269b1/pyro_ppl-1.6.0-py3-none-any.whl (634kB) |████████████████████████████████| 634kB 5.4MB/s Requirement already satisfied: torch>=1.8.0 in /usr/local/lib/python3.7/dist-packages (from pyro-ppl) (1.8.1+cu101) Collecting pyro-api>=0.1.1 Downloading https://files.pythonhosted.org/packages/fc/81/957ae78e6398460a7230b0eb9b8f1cb954c5e913e868e48d89324c68cec7/pyro_api-0.1.2-py3-none-any.whl Requirement already satisfied: numpy>=1.7 in /usr/local/lib/python3.7/dist-packages (from pyro-ppl) (1.19.5) Requirement already satisfied: tqdm>=4.36 in /usr/local/lib/python3.7/dist-packages (from pyro-ppl) (4.41.1) Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.7/dist-packages (from pyro-ppl) (3.3.0) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch>=1.8.0->pyro-ppl) (3.7.4.3) Installing collected packages: pyro-api, pyro-ppl Successfully installed pyro-api-0.1.2 pyro-ppl-1.6.0
import matplotlib.pyplot as plt import numpy as np import torch import pyro import pyro.infer import pyro.optim import pyro.distributions as dist from torch.distributions import constraints from pyro.infer import MCMC, NUTS, Predictive, HMC from pyro.infer import SVI, Trace_ELBO from pyro.infer import EmpiricalMarginal from pyro.distributions import Beta, Binomial, HalfCauchy, Normal, Pareto, Uniform from pyro.distributions.util import scalar_like from pyro.infer.mcmc.util import initialize_model, summary from pyro.util import ignore_experimental_warning pyro.set_rng_seed(101)

Example: inferring mean of 1d Gaussian .

We use the simple example from the Pyro intro. The goal is to infer the weight θ\theta of an object, given noisy measurements yy. We assume the following model: θN(μ=8.5,τ2=1.0)yN(θ,σ2=0.752) \begin{align} \theta &\sim N(\mu=8.5, \tau^2=1.0)\\ y \sim &N(\theta, \sigma^2=0.75^2) \end{align}

Where μ=8.5\mu=8.5 is the initial guess.

def model(hparams, data=None): prior_mean, prior_sd, obs_sd = hparams theta = pyro.sample("theta", dist.Normal(prior_mean, prior_sd)) y = pyro.sample("y", dist.Normal(theta, obs_sd), obs=data) return y

Exact inference

By Bayes rule for Gaussians, we know that the exact posterior, given a single observation y=9.5y=9.5, is given by

θyN(m,ss)m=σ2μ+τ2yσ2+τ2=0.752×8.5+1×9.50.752+12=9.14s2=σ2τ2σ2+τ2=0.752×120.752+12=0.62\begin{align} \theta|y &\sim N(m, s^s) \\ m &=\frac{\sigma^2 \mu + \tau^2 y}{\sigma^2 + \tau^2} = \frac{0.75^2 \times 8.5 + 1 \times 9.5}{0.75^2 + 1^2} = 9.14 \\ s^2 &= \frac{\sigma^2 \tau^2}{\sigma^2 + \tau^2} = \frac{0.75^2 \times 1^2}{0.75^2 + 1^2}= 0.6^2 \end{align}
mu = 8.5 tau = 1.0 sigma = 0.75 hparams = (mu, tau, sigma) y = 9.5 m = (sigma**2 * mu + tau**2 * y) / (sigma**2 + tau**2) # posterior mean s2 = (sigma**2 * tau**2) / (sigma**2 + tau**2) # posterior variance s = np.sqrt(s2) print(m) print(s)
9.14 0.6

Ancestral sampling

def model2(hparams, data=None): prior_mean, prior_sd, obs_sd = hparams theta = pyro.sample("theta", dist.Normal(prior_mean, prior_sd)) y = pyro.sample("y", dist.Normal(theta, obs_sd), obs=data) return theta, y for i in range(5): theta, y = model2(hparams) print([theta, y])
[tensor(9.1529), tensor(8.7116)] [tensor(8.7306), tensor(9.3978)] [tensor(9.0740), tensor(9.4240)] [tensor(7.3040), tensor(7.8569)] [tensor(7.8939), tensor(8.0257)]
nuts_kernel = NUTS(model) obs = torch.tensor(y) mcmc = MCMC(nuts_kernel, num_samples=1000, warmup_steps=50) mcmc.run(hparams, obs) print(type(mcmc))
Sample: 100%|██████████| 1050/1050 [00:03, 326.67it/s, step size=1.30e+00, acc. prob=0.880]
<class 'pyro.infer.mcmc.api.MCMC'>
samples = mcmc.get_samples() print(type(samples)) print(samples.keys()) print(samples["theta"].shape)
<class 'dict'> dict_keys(['theta']) torch.Size([1000])
mcmc.diagnostics()
{'acceptance rate': {'chain 0': 0.924}, 'divergences': {'chain 0': []}, 'theta': OrderedDict([('n_eff', tensor(500.2368)), ('r_hat', tensor(1.0050))])}
thetas = samples["theta"].numpy() print(np.mean(thetas)) print(np.std(thetas))
9.152181 0.625822

Variational Inference

See the documentation

For the guide (approximate posterior), we use a pytorch.distributions.normal.

# the guide must have the same signature as the model def guide(hparams, data): y = data prior_mean, prior_sd, obs_sd = hparams m = pyro.param("m", torch.tensor(y)) # location s = pyro.param("s", torch.tensor(prior_sd), constraint=constraints.positive) # scale return pyro.sample("theta", dist.Normal(m, s)) # initialize variational parameters pyro.clear_param_store() # set up the optimizer # optimizer = pyro.optim.Adam({"lr": 0.001, "betas": (0.90, 0.999)}) optimizer = pyro.optim.SGD({"lr": 0.001, "momentum": 0.1}) # setup the inference algorithm svi = SVI(model, guide, optimizer, loss=Trace_ELBO()) n_steps = 2000 # do gradient steps obs = torch.tensor(y) loss_history, m_history, s_history = [], [], [] for t in range(num_steps): loss_history.append(svi.step(hparams, obs)) m_history.append(pyro.param("m").item()) s_history.append(pyro.param("s").item()) plt.plot(loss_history) plt.title("ELBO") plt.xlabel("step") plt.ylabel("loss") post_mean = pyro.param("m").item() post_std = pyro.param("s").item() print([post_mean, post_std])
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). """
[9.205821990966797, 0.6176195740699768]
Image in a Jupyter notebook

Example: beta-bernoulli model

Example is from SVI tutorial

The model is θBeta(α,β)xiBer(θ) \begin{align} \theta &\sim \text{Beta}(\alpha, \beta) \\ x_i &\sim \text{Ber}(\theta) \end{align} where α=β=10\alpha=\beta=10.

alpha0 = 10.0 beta0 = 10.0 def model(data): alpha0_tt = torch.tensor(alpha0) beta0_tt = torch.tensor(beta0) f = pyro.sample("theta", dist.Beta(alpha0_tt, beta0_tt)) # loop over the observed data for i in range(len(data)): pyro.sample("obs_{}".format(i), dist.Bernoulli(f), obs=data[i]) def model_binom(data): alpha0_tt = torch.tensor(alpha0) beta0_tt = torch.tensor(beta0) theta = pyro.sample("theta", dist.Beta(alpha0_tt, beta0_tt)) data_np = [x.item() for x in data] N = len(data_np) N1 = np.sum(data_np) N0 = N - N1 pyro.sample("obs", dist.Binomial(N, theta))
# create some data with 6 observed heads and 4 observed tails data = [] for _ in range(6): data.append(torch.tensor(1.0)) for _ in range(4): data.append(torch.tensor(0.0)) data_np = [x.item() for x in data] print(data) print(data_np) N = len(data_np) N1 = np.sum(data_np) N0 = N - N1 print([N1, N0])
[tensor(1.), tensor(1.), tensor(1.), tensor(1.), tensor(1.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(0.)] [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0] [6.0, 4.0]
[tensor(1.), tensor(1.), tensor(1.), tensor(1.), tensor(1.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(0.)] [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]

Exact inference

The posterior is given by θBer(α+N1,β+N0)N1=i=1N[xi=1]N0=i=1N[xi=0] \begin{align} \theta &\sim \text{Ber}(\alpha + N_1, \beta + N_0) \\ N_1 &= \sum_{i=1}^N [x_i=1] \\ N_0 &= \sum_{i=1}^N [x_i=0] \end{align}

alpha1 = alpha0 + N1 beta1 = beta0 + N0 print("exact posterior: alpha={:0.3f}, beta={:0.3f}".format(alpha1, beta1)) post_mean = alpha1 / (alpha1 + beta1) post_var = (post_mean * beta1) / ((alpha1 + beta1) * (alpha1 + beta1 + 1)) post_std = np.sqrt(post_var) print([post_mean, post_std])
exact posterior: alpha=16.000, beta=14.000 [0.5333333333333333, 0.08960286733763294]

MCMC

nuts_kernel = NUTS(model) mcmc = MCMC(nuts_kernel, num_samples=1000, warmup_steps=50) mcmc.run(data) print(mcmc.diagnostics()) samples = mcmc.get_samples() print(samples["theta"].shape)
Sample: 100%|██████████| 1050/1050 [00:09, 111.12it/s, step size=1.50e+00, acc. prob=0.803]
{'theta': OrderedDict([('n_eff', tensor(443.8569)), ('r_hat', tensor(0.9992))]), 'divergences': {'chain 0': []}, 'acceptance rate': {'chain 0': 0.864}} torch.Size([1000])
thetas = samples["theta"].numpy() print(np.mean(thetas)) print(np.std(thetas))
0.5330437 0.09079484
nuts_kernel = NUTS(model_binom) mcmc = MCMC(nuts_kernel, num_samples=1000, warmup_steps=50) mcmc.run(data) print(mcmc.diagnostics()) samples = mcmc.get_samples() print(samples["theta"].shape)
Sample: 100%|██████████| 1050/1050 [00:08, 117.55it/s, step size=9.57e-01, acc. prob=0.919]
{'theta': OrderedDict([('n_eff', tensor(269.4737)), ('r_hat', tensor(0.9990))]), 'divergences': {'chain 0': []}, 'acceptance rate': {'chain 0': 0.951}} torch.Size([1000])
thetas = samples["theta"].numpy() print(np.mean(thetas)) print(np.std(thetas))
0.48617417 0.112258926

Variational inference

def guide(data): alpha_q = pyro.param("alpha_q", torch.tensor(15.0), constraint=constraints.positive) beta_q = pyro.param("beta_q", torch.tensor(15.0), constraint=constraints.positive) pyro.sample("theta", dist.Beta(alpha_q, beta_q))
# optimizer = pyro.optim.Adam({"lr": 0.0005, "betas": (0.90, 0.999)}) optimizer = pyro.optim.SGD({"lr": 0.001, "momentum": 0.1}) svi = SVI(model, guide, optimizer, loss=Trace_ELBO()) n_steps = 2000 loss_history = [] for step in range(n_steps): loss_history.append(svi.step(data)) plt.plot(loss_history) plt.title("ELBO") plt.xlabel("step") plt.ylabel("loss");
Image in a Jupyter notebook
# grab the learned variational parameters alpha_q = pyro.param("alpha_q").item() beta_q = pyro.param("beta_q").item() print("variational posterior: alpha={:0.3f}, beta={:0.3f}".format(alpha_q, beta_q)) post_mean = alpha_q / (alpha_q + beta_q) post_var = (post_mean * beta_q) / ((alpha_q + beta_q) * (alpha_q + beta_q + 1)) post_std = np.sqrt(post_var) print([post_mean, post_std])
variational posterior: alpha=15.414, beta=14.094 [0.5223745147578196, 0.09043264875842827]