GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/book1/21/gmm_chooseK_pymc3.ipynb
Kernel: Python [conda env:probml_py3912]

Open In Colab

try:
    import pymc3 as pm
except ModuleNotFoundError:
    %pip install -qq pymc3
    import pymc3 as pm

import numpy as np
import scipy.stats as stats
import pandas as pd

try:
    import theano.tensor as tt
except ModuleNotFoundError:
    %pip install -qq Theano
    import theano.tensor as tt

import matplotlib.pyplot as plt

try:
    import arviz as az
except ModuleNotFoundError:
    %pip install -qq arviz
    import arviz as az
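The next cell fits a K-component Gaussian mixture, for K = 3, 4, 5, 6, to the experimental chemical-shift data. For reference, the generative model implied by the code (flat Dirichlet weights, ordered component means centered on an even grid over the data range, one shared scale) is

$$
p(y \mid \pi, \mu, \sigma) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}(y \mid \mu_k, \sigma), \qquad
\pi \sim \mathrm{Dirichlet}(\mathbf{1}_K), \qquad
\sigma \sim \mathrm{HalfNormal}(10),
$$

with $\mu_k \sim \mathcal{N}(m_k, 10)$, where the $m_k$ are evenly spaced over $[\min(y), \max(y)]$ and the means are constrained so that $\mu_1 < \dots < \mu_K$; the ordering transform breaks the label-switching symmetry of the mixture posterior.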
np.random.seed(42)

# url = 'https://github.com/aloctavodia/BAP/tree/master/code/data/chemical_shifts_theo_exp.csv?raw=true'
# There is some error reading the above file:
# Error tokenizing data. C error: Expected 1 fields in line 71, saw 2
url = "https://raw.githubusercontent.com/probml/probml-data/main/data/chemical_shifts_theo_exp.csv"
df = pd.read_csv(url, sep=",")
data = df["exp"]

clusters = [3, 4, 5, 6]

models = []
traces = []
for cluster in clusters:
    with pm.Model() as model:
        p = pm.Dirichlet("p", a=np.ones(cluster))
        means = pm.Normal(
            "means",
            mu=np.linspace(data.min(), data.max(), cluster),
            sd=10,
            shape=cluster,
            transform=pm.distributions.transforms.ordered,
        )
        sd = pm.HalfNormal("sd", sd=10)
        y = pm.NormalMixture("y", w=p, mu=means, sd=sd, observed=data)
        trace = pm.sample(1000, tune=2000, random_seed=123, cores=1, chains=2)
        traces.append(trace)
        models.append(model)

# Visualize fitted densities, compared to KDE of the data
_, ax = plt.subplots(2, 2, figsize=(11, 8), constrained_layout=True)
ax = np.ravel(ax)
x = np.linspace(data.min(), data.max(), 200)
for idx, trace_x in enumerate(traces):
    x_ = np.array([x] * clusters[idx]).T

    for i in range(50):  # posterior samples
        i_ = np.random.randint(0, len(trace_x))
        means_y = trace_x["means"][i_]
        p_y = trace_x["p"][i_]
        sd = trace_x["sd"][i_]
        dist = stats.norm(means_y, sd)
        ax[idx].plot(x, np.sum(dist.pdf(x_) * p_y, 1), "C0", alpha=0.1)

    means_y = trace_x["means"].mean(0)
    p_y = trace_x["p"].mean(0)
    sd = trace_x["sd"].mean()
    dist = stats.norm(means_y, sd)
    ax[idx].plot(x, np.sum(dist.pdf(x_) * p_y, 1), "C0", lw=2)
    ax[idx].plot(x, dist.pdf(x_) * p_y, "k--", alpha=0.7)

    az.plot_kde(data, plot_kwargs={"linewidth": 2, "color": "k"}, ax=ax[idx])
    ax[idx].set_title("K = {}".format(clusters[idx]))
    ax[idx].set_yticks([])
    ax[idx].set_xlabel("x")

plt.savefig("gmm_chooseK_pymc3_kde.pdf")
plt.show()
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Sequential sampling (2 chains in 1 job) NUTS: [sd, means, p]
Sampling 2 chains for 2_000 tune and 1_000 draw iterations (4_000 + 2_000 draws total): 64 seconds (K = 3), 62 seconds (K = 4), 137 seconds (K = 5), 239 seconds (K = 6).
K = 5: The acceptance probability does not match the target. It is 0.88, but should be close to 0.8. Try to increase the number of tuning steps. The number of effective samples is smaller than 25% for some parameters.
K = 6: There were 82 divergences after tuning. Increase `target_accept` or reparameterize. The acceptance probability does not match the target (0.90 and 0.58 across chains, but should be close to 0.8). The rhat statistic is larger than 1.05 for some parameters. This indicates slight problems during sampling. The estimated number of effective samples is smaller than 200 for some parameters.
FutureWarning (emitted once per model): In v4.0, pm.sample will return an `arviz.InferenceData` object instead of a `MultiTrace` by default. You can pass return_inferencedata=True or return_inferencedata=False to be safe and silence this warning.
UserWarning (pymc3/sampling.py:1690, from the posterior predictive cell below): samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample.
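The K = 5 and K = 6 runs show the usual symptoms of a hard mixture posterior: divergences, rhat above 1.05, and low effective sample size. A minimal sketch of one common remedy, longer tuning plus a higher target acceptance rate; this re-run is an assumption on my part, not part of the original notebook:

# Hypothetical re-run (not in the original notebook): stricter NUTS settings
# often remove divergences for the larger-K mixtures, at the cost of speed.
with models[-1]:  # the K = 6 model
    trace_k6 = pm.sample(1000, tune=4000, target_accept=0.95, random_seed=123, cores=1, chains=2)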
UserWarning (arviz/stats/stats.py:146): The default method used to estimate the weights for each model has changed from BB-pseudo-BMA to stacking.
FutureWarning (arviz/data/io_pymc3.py:100, repeated four times): Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context.
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-3-be5ea4ed505e> in <module>()
     87 comp = az.compare(dict(zip(clusters, traces)), method='BB-pseudo-BMA')
     88 comp
---> 89 az.plot_compare(comp)

/usr/local/lib/python3.7/dist-packages/arviz/plots/backends/matplotlib/compareplot.py in plot_compare(...)
     84     if insample_dev:
---> 85         scale = comp_df[f"{information_criterion}_scale"][0]

KeyError: 0

(The comparison table is indexed by the integer cluster sizes 3-6, so the positional-looking lookup comp_df[...][0] inside az.plot_compare becomes a label lookup that fails; using string model names avoids this, as in the corrected cell below.)
[Notebook figures: fitted mixture densities overlaid on the KDE of the data, panels K = 3, 4, 5, 6; posterior predictive IQR distributions with p-values.]
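The next cell runs a posterior predictive check using the interquartile range (IQR) as the test statistic $T$. The Bayesian p-value reported in each panel is the fraction of replicated datasets whose statistic is at least as large as the observed one:

$$
p_B = \Pr\big(T(y^{\mathrm{rep}}) \ge T(y) \mid y\big) \approx \frac{1}{S} \sum_{s=1}^{S} \mathbb{I}\big[T(y^{\mathrm{rep},(s)}) \ge T(y)\big],
$$

so values near 0.5 indicate that the model reproduces the spread of the data well.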
# Posterior predictive check
nclusters = len(clusters)
ppc_mm = [pm.sample_posterior_predictive(traces[i], 1000, models[i]) for i in range(nclusters)]

fig, ax = plt.subplots(2, 2, figsize=(10, 6), sharex=True, constrained_layout=True)
ax = np.ravel(ax)


def iqr(x, a=0):
    return np.subtract(*np.percentile(x, [75, 25], axis=a))


T_obs = iqr(data)
for idx, d_sim in enumerate(ppc_mm):
    T_sim = iqr(d_sim["y"][:100].T, 1)
    p_value = np.mean(T_sim >= T_obs)
    az.plot_kde(T_sim, ax=ax[idx])
    ax[idx].axvline(T_obs, 0, 1, color="k", ls="--")
    ax[idx].set_title(f"K = {clusters[idx]} \n p-value {p_value:.2f}")
    # ax[idx].set_yticks([])
    ax[idx].set_xlabel("iqr")

plt.savefig("gmm_chooseK_pymc3_pval.pdf")
plt.show()
# Compute information criteria for the 4 models.
# Use Bayesian bootstrapping together with pseudo Bayes model averaging;
# see "Bayesian Analysis with Python", p. 201.
# String model names avoid the KeyError: 0 from az.plot_compare seen above:
# with the integer keys 3..6, comp_df[...][0] is a label lookup that fails.
comp = az.compare({f"K={c}": t for c, t in zip(clusters, traces)}, method="BB-pseudo-BMA")
comp

az.plot_compare(comp)
plt.savefig("gmm_chooseK_pymc3_waic.pdf")
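As the arviz warning above notes, the default weighting method has changed from BB-pseudo-BMA to stacking, so the explicit `method` argument matters here. A minimal sketch, assuming the same `clusters` and `traces` as above, of running the comparison with the newer stacking default instead:

# Sketch (not in the original notebook): the same comparison with
# stacking weights, the new arviz default flagged by the warning above.
comp_stacking = az.compare({f"K={c}": t for c, t in zip(clusters, traces)}, method="stacking")
az.plot_compare(comp_stacking)
plt.show()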