GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/tutorials/pyro_intro.ipynb
¹¹⁹² views

Kernel: Python 3

Pyro is a probabilistic programming system built on top of PyTorch. It supports posterior inference based on MCMC and stochastic variational inference; discrete latent variables can be marginalized out exactly using dynamic programmming.

In [1]:

!pip install pyro-ppl

Out[1]:

Collecting pyro-ppl
  Downloading https://files.pythonhosted.org/packages/aa/7a/fbab572fd385154a0c07b0fa138683aa52e14603bb83d37b198e5f9269b1/pyro_ppl-1.6.0-py3-none-any.whl (634kB)
     |████████████████████████████████| 634kB 5.4MB/s 
Requirement already satisfied: torch>=1.8.0 in /usr/local/lib/python3.7/dist-packages (from pyro-ppl) (1.8.1+cu101)
Collecting pyro-api>=0.1.1
  Downloading https://files.pythonhosted.org/packages/fc/81/957ae78e6398460a7230b0eb9b8f1cb954c5e913e868e48d89324c68cec7/pyro_api-0.1.2-py3-none-any.whl
Requirement already satisfied: numpy>=1.7 in /usr/local/lib/python3.7/dist-packages (from pyro-ppl) (1.19.5)
Requirement already satisfied: tqdm>=4.36 in /usr/local/lib/python3.7/dist-packages (from pyro-ppl) (4.41.1)
Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.7/dist-packages (from pyro-ppl) (3.3.0)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch>=1.8.0->pyro-ppl) (3.7.4.3)
Installing collected packages: pyro-api, pyro-ppl
Successfully installed pyro-api-0.1.2 pyro-ppl-1.6.0

In [47]:

import matplotlib.pyplot as plt
import numpy as np
import torch

import pyro
import pyro.infer
import pyro.optim
import pyro.distributions as dist
from torch.distributions import constraints
from pyro.infer import MCMC, NUTS, Predictive, HMC
from pyro.infer import SVI, Trace_ELBO
from pyro.infer import EmpiricalMarginal

from pyro.distributions import Beta, Binomial, HalfCauchy, Normal, Pareto, Uniform
from pyro.distributions.util import scalar_like
from pyro.infer.mcmc.util import initialize_model, summary
from pyro.util import ignore_experimental_warning

pyro.set_rng_seed(101)

Example: inferring mean of 1d Gaussian .

We use the simple example from the Pyro intro. The goal is to infer the weight $\theta$ of an object, given noisy measurements $y$ . We assume the following model: $\begin{align} \theta &\sim N(\mu=8.5, \tau^2=1.0)\\ y \sim &N(\theta, \sigma^2=0.75^2) \end{align}$

Where $\mu=8.5$ is the initial guess.

In [110]:

def model(hparams, data=None):
    prior_mean, prior_sd, obs_sd = hparams
    theta = pyro.sample("theta", dist.Normal(prior_mean, prior_sd))
    y = pyro.sample("y", dist.Normal(theta, obs_sd), obs=data)
    return y

Exact inference

By Bayes rule for Gaussians, we know that the exact posterior, given a single observation $y=9.5$ , is given by

\begin{align} \theta|y &\sim N(m, s^s) \\ m &=\frac{\sigma^2 \mu + \tau^2 y}{\sigma^2 + \tau^2} = \frac{0.75^2 \times 8.5 + 1 \times 9.5}{0.75^2 + 1^2} = 9.14 \\ s^2 &= \frac{\sigma^2 \tau^2}{\sigma^2 + \tau^2} = \frac{0.75^2 \times 1^2}{0.75^2 + 1^2}= 0.6^2 \end{align}

In [93]:

mu = 8.5
tau = 1.0
sigma = 0.75
hparams = (mu, tau, sigma)

y = 9.5
m = (sigma**2 * mu + tau**2 * y) / (sigma**2 + tau**2)  # posterior mean
s2 = (sigma**2 * tau**2) / (sigma**2 + tau**2)  # posterior variance
s = np.sqrt(s2)
print(m)
print(s)

Out[93]:

9.14
0.6

Ancestral sampling

In [68]:

def model2(hparams, data=None):
    prior_mean, prior_sd, obs_sd = hparams
    theta = pyro.sample("theta", dist.Normal(prior_mean, prior_sd))
    y = pyro.sample("y", dist.Normal(theta, obs_sd), obs=data)
    return theta, y


for i in range(5):
    theta, y = model2(hparams)
    print([theta, y])

Out[68]:

[tensor(9.1529), tensor(8.7116)]
[tensor(8.7306), tensor(9.3978)]
[tensor(9.0740), tensor(9.4240)]
[tensor(7.3040), tensor(7.8569)]
[tensor(7.8939), tensor(8.0257)]

MCMC

See the documentation

In [100]:

nuts_kernel = NUTS(model)
obs = torch.tensor(y)
mcmc = MCMC(nuts_kernel, num_samples=1000, warmup_steps=50)
mcmc.run(hparams, obs)
print(type(mcmc))

Out[100]:

Sample: 100%|██████████| 1050/1050 [00:03, 326.67it/s, step size=1.30e+00, acc. prob=0.880]

<class 'pyro.infer.mcmc.api.MCMC'>

In [103]:

samples = mcmc.get_samples()
print(type(samples))
print(samples.keys())
print(samples["theta"].shape)

Out[103]:

<class 'dict'>
dict_keys(['theta'])
torch.Size([1000])

In [104]:

mcmc.diagnostics()

Out[104]:

{'acceptance rate': {'chain 0': 0.924},
 'divergences': {'chain 0': []},
 'theta': OrderedDict([('n_eff', tensor(500.2368)),
              ('r_hat', tensor(1.0050))])}

In [105]:

thetas = samples["theta"].numpy()
print(np.mean(thetas))
print(np.std(thetas))

Out[105]:

9.152181
0.625822

Variational Inference

See the documentation

For the guide (approximate posterior), we use a pytorch.distributions.normal.

In [60]:

# the guide must have the same signature as the model
def guide(hparams, data):
    y = data
    prior_mean, prior_sd, obs_sd = hparams
    m = pyro.param("m", torch.tensor(y))  # location
    s = pyro.param("s", torch.tensor(prior_sd), constraint=constraints.positive)  # scale
    return pyro.sample("theta", dist.Normal(m, s))


# initialize variational parameters
pyro.clear_param_store()

# set up the optimizer
# optimizer = pyro.optim.Adam({"lr": 0.001, "betas": (0.90, 0.999)})
optimizer = pyro.optim.SGD({"lr": 0.001, "momentum": 0.1})

# setup the inference algorithm
svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

n_steps = 2000
# do gradient steps
obs = torch.tensor(y)
loss_history, m_history, s_history = [], [], []
for t in range(num_steps):
    loss_history.append(svi.step(hparams, obs))
    m_history.append(pyro.param("m").item())
    s_history.append(pyro.param("s").item())

plt.plot(loss_history)
plt.title("ELBO")
plt.xlabel("step")
plt.ylabel("loss")

post_mean = pyro.param("m").item()
post_std = pyro.param("s").item()
print([post_mean, post_std])

Out[60]:

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  """

[9.205821990966797, 0.6176195740699768]

Example: beta-bernoulli model

Example is from SVI tutorial

The model is $\begin{align} \theta &\sim \text{Beta}(\alpha, \beta) \\ x_i &\sim \text{Ber}(\theta) \end{align}$ where $\alpha=\beta=10$ .

In [125]:

alpha0 = 10.0
beta0 = 10.0


def model(data):
    alpha0_tt = torch.tensor(alpha0)
    beta0_tt = torch.tensor(beta0)
    f = pyro.sample("theta", dist.Beta(alpha0_tt, beta0_tt))
    # loop over the observed data
    for i in range(len(data)):
        pyro.sample("obs_{}".format(i), dist.Bernoulli(f), obs=data[i])


def model_binom(data):
    alpha0_tt = torch.tensor(alpha0)
    beta0_tt = torch.tensor(beta0)
    theta = pyro.sample("theta", dist.Beta(alpha0_tt, beta0_tt))
    data_np = [x.item() for x in data]
    N = len(data_np)
    N1 = np.sum(data_np)
    N0 = N - N1
    pyro.sample("obs", dist.Binomial(N, theta))

In [124]:

# create some data with 6 observed heads and 4 observed tails
data = []
for _ in range(6):
    data.append(torch.tensor(1.0))
for _ in range(4):
    data.append(torch.tensor(0.0))

data_np = [x.item() for x in data]
print(data)
print(data_np)

N = len(data_np)
N1 = np.sum(data_np)
N0 = N - N1
print([N1, N0])

Out[124]:

[tensor(1.), tensor(1.), tensor(1.), tensor(1.), tensor(1.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(0.)]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
[6.0, 4.0]

In [116]:

Out[116]:

[tensor(1.), tensor(1.), tensor(1.), tensor(1.), tensor(1.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(0.)]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]

Exact inference

The posterior is given by $\begin{align} \theta &\sim \text{Ber}(\alpha + N_1, \beta + N_0) \\ N_1 &= \sum_{i=1}^N [x_i=1] \\ N_0 &= \sum_{i=1}^N [x_i=0] \end{align}$

In [91]:

alpha1 = alpha0 + N1
beta1 = beta0 + N0
print("exact posterior: alpha={:0.3f}, beta={:0.3f}".format(alpha1, beta1))
post_mean = alpha1 / (alpha1 + beta1)
post_var = (post_mean * beta1) / ((alpha1 + beta1) * (alpha1 + beta1 + 1))
post_std = np.sqrt(post_var)
print([post_mean, post_std])

Out[91]:

exact posterior: alpha=16.000, beta=14.000
[0.5333333333333333, 0.08960286733763294]

MCMC

In [114]:

nuts_kernel = NUTS(model)
mcmc = MCMC(nuts_kernel, num_samples=1000, warmup_steps=50)
mcmc.run(data)
print(mcmc.diagnostics())
samples = mcmc.get_samples()
print(samples["theta"].shape)

Out[114]:

Sample: 100%|██████████| 1050/1050 [00:09, 111.12it/s, step size=1.50e+00, acc. prob=0.803]

{'theta': OrderedDict([('n_eff', tensor(443.8569)), ('r_hat', tensor(0.9992))]), 'divergences': {'chain 0': []}, 'acceptance rate': {'chain 0': 0.864}}
torch.Size([1000])

In [115]:

thetas = samples["theta"].numpy()
print(np.mean(thetas))
print(np.std(thetas))

Out[115]:

0.5330437
0.09079484

In [126]:

nuts_kernel = NUTS(model_binom)
mcmc = MCMC(nuts_kernel, num_samples=1000, warmup_steps=50)
mcmc.run(data)
print(mcmc.diagnostics())
samples = mcmc.get_samples()
print(samples["theta"].shape)

Out[126]:

Sample: 100%|██████████| 1050/1050 [00:08, 117.55it/s, step size=9.57e-01, acc. prob=0.919]

{'theta': OrderedDict([('n_eff', tensor(269.4737)), ('r_hat', tensor(0.9990))]), 'divergences': {'chain 0': []}, 'acceptance rate': {'chain 0': 0.951}}
torch.Size([1000])

In [127]:

thetas = samples["theta"].numpy()
print(np.mean(thetas))
print(np.std(thetas))

Out[127]:

0.48617417
0.112258926

Variational inference

In [82]:

def guide(data):
    alpha_q = pyro.param("alpha_q", torch.tensor(15.0), constraint=constraints.positive)
    beta_q = pyro.param("beta_q", torch.tensor(15.0), constraint=constraints.positive)
    pyro.sample("theta", dist.Beta(alpha_q, beta_q))

In [89]:

# optimizer = pyro.optim.Adam({"lr": 0.0005, "betas": (0.90, 0.999)})
optimizer = pyro.optim.SGD({"lr": 0.001, "momentum": 0.1})

svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

n_steps = 2000
loss_history = []
for step in range(n_steps):
    loss_history.append(svi.step(data))

plt.plot(loss_history)
plt.title("ELBO")
plt.xlabel("step")
plt.ylabel("loss");

Out[89]:

In [92]:

# grab the learned variational parameters
alpha_q = pyro.param("alpha_q").item()
beta_q = pyro.param("beta_q").item()
print("variational posterior: alpha={:0.3f}, beta={:0.3f}".format(alpha_q, beta_q))

post_mean = alpha_q / (alpha_q + beta_q)
post_var = (post_mean * beta_q) / ((alpha_q + beta_q) * (alpha_q + beta_q + 1))
post_std = np.sqrt(post_var)
print([post_mean, post_std])

Out[92]:

variational posterior: alpha=15.414, beta=14.094
[0.5223745147578196, 0.09043264875842827]

In [ ]:

Example: inferring mean of 1d Gaussian .

Exact inference

Ancestral sampling

MCMC

Variational Inference

Example: beta-bernoulli model

Exact inference

MCMC

Variational inference

Product

Resources

Company