Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
probml
GitHub Repository: probml/pyprobml
Path: blob/master/internal/book2.toc
1191 views
\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax 
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{Preface}{xxxv}{chapter*.3}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{List of Figures}{xxxix}{chapter*.7}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {1}Introduction}{1}{chapter.1}
\defcounter {refsection}{0}\relax 
\contentsline {part}{I\hspace {1em}Fundamentals}{3}{part.1}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {2}Probability}{5}{chapter.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {2.1}Introduction}{5}{section.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {2.2}Some common probability distributions}{5}{section.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.2.1}Discrete distributions}{5}{subsection.2.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.1.1}Bernoulli and binomial distributions}{5}{subsubsection.2.2.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.1.2}Categorical and multinomial distributions}{6}{subsubsection.2.2.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.1.3}Poisson distribution}{6}{subsubsection.2.2.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.2.2}Continuous distributions on $\mathbb {R}$}{6}{subsection.2.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.2.1}Gaussian (Normal)}{6}{subsubsection.2.2.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.2.2}Half-normal}{7}{subsubsection.2.2.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.2.3}Student $t$ distribution}{7}{subsubsection.2.2.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.2.4}Cauchy distribution}{8}{subsubsection.2.2.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.2.5}Laplace distribution}{8}{subsubsection.2.2.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.2.6}Sub-Gaussian and super-Gaussian distributions}{9}{subsubsection.2.2.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.2.3}Continuous distributions on $\mathbb {R}^+$}{10}{subsection.2.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.3.1}Gamma distribution}{10}{subsubsection.2.2.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.3.2}Exponential distribution}{10}{subsubsection.2.2.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.3.3}Chi-squared distribution}{10}{subsubsection.2.2.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.3.4}Inverse gamma}{10}{subsubsection.2.2.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.3.5}Pareto distribution}{11}{subsubsection.2.2.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.2.4}Continuous distributions on $[0,1]$}{12}{subsection.2.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.4.1}Beta distribution}{13}{subsubsection.2.2.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.2.5}The multivariate Gaussian (normal) distribution}{13}{subsection.2.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.5.1}Definition}{13}{subsubsection.2.2.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.5.2}Gaussian shells}{14}{subsubsection.2.2.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.5.3}Moment form and canonical form}{15}{subsubsection.2.2.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.5.4}Marginals and conditionals of a MVN}{15}{subsubsection.2.2.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.5.5}Deriving the conditionals of an MVN}{16}{subsubsection.2.2.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.5.6}Bayes rule for linear Gaussian systems}{19}{subsubsection.2.2.5.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.5.7}Deriving Bayes rule for linear Gaussian systems}{20}{subsubsection.2.2.5.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.5.8}Sensor fusion with known measurement noise}{20}{subsubsection.2.2.5.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.5.9}Sensor fusion with unknown measurement noise}{21}{subsubsection.2.2.5.9}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.5.10}Handling missing data}{24}{subsubsection.2.2.5.10}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.2.6}Some other multivariate continuous distributions}{24}{subsection.2.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.6.1}Multivariate Student distribution}{24}{subsubsection.2.2.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.6.2}Circular normal (von Mises Fisher) distribution}{25}{subsubsection.2.2.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.6.3}Matrix-variate Gaussian (MVG) distribution}{26}{subsubsection.2.2.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.6.4}Wishart distribution}{26}{subsubsection.2.2.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.6.5}Visualizing the Wishart distribution}{27}{subsubsection.2.2.6.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.6.6}Inverse Wishart distribution}{27}{subsubsection.2.2.6.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.2.6.7}Dirichlet distribution}{28}{subsubsection.2.2.6.7}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {2.3}The exponential family}{30}{section.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.3.1}Definition}{31}{subsection.2.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.3.2}Examples}{31}{subsection.2.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.3.2.1}Bernoulli distribution}{31}{subsubsection.2.3.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.3.2.2}Categorical distribution}{32}{subsubsection.2.3.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.3.2.3}Univariate Gaussian}{33}{subsubsection.2.3.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.3.2.4}Univariate Gaussian with fixed variance}{34}{subsubsection.2.3.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.3.2.5}Multivariate Gaussian}{34}{subsubsection.2.3.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.3.2.6}Non-examples}{35}{subsubsection.2.3.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.3.3}Log partition function is cumulant generating function}{36}{subsection.2.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.3.3.1}Derivation of the mean}{36}{subsubsection.2.3.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.3.3.2}Derivation of the variance}{36}{subsubsection.2.3.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.3.3.3}Connection with the Fisher information matrix}{37}{subsubsection.2.3.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.3.4}Canonical (natural) vs mean (moment) parameters}{37}{subsection.2.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.3.5}MLE for the exponential family}{38}{subsection.2.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.3.6}Exponential dispersion family}{39}{subsection.2.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.3.7}Maximum entropy derivation of the exponential family}{39}{subsection.2.3.7}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {2.4}Fisher information matrix (FIM)}{40}{section.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.4.1}Definition}{40}{subsection.2.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.4.2}Equivalence between the FIM and the Hessian of the NLL}{41}{subsection.2.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.4.3}Examples}{42}{subsection.2.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.4.3.1}FIM for the Binomial}{42}{subsubsection.2.4.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.4.3.2}FIM for the Gaussian}{42}{subsubsection.2.4.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.4.3.3}FIM for logistic regression}{43}{subsubsection.2.4.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.4.4}Approximating KL divergence using FIM}{43}{subsection.2.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.4.5}Fisher information matrix for exponential family}{44}{subsection.2.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {2.5}Transformations of random variables}{45}{section.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.5.1}Invertible transformations (bijections)}{45}{subsection.2.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.5.2}Monte Carlo approximation}{46}{subsection.2.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.5.3}Probability integral transform}{46}{subsection.2.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {2.6}Markov chains}{48}{section.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.6.1}Parameterization}{48}{subsection.2.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.6.1.1}Markov transition kernels}{48}{subsubsection.2.6.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.6.1.2}Markov transition matrices}{49}{subsubsection.2.6.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.6.1.3}Higher-order Markov models}{50}{subsubsection.2.6.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.6.2}Application: Language modeling}{50}{subsection.2.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.6.3}Parameter estimation}{51}{subsection.2.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.6.3.1}Maximum likelihood estimation}{51}{subsubsection.2.6.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.6.3.2}Sparse data problem}{52}{subsubsection.2.6.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.6.3.3}MAP estimation}{52}{subsubsection.2.6.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.6.4}Stationary distribution of a Markov chain}{53}{subsection.2.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.6.4.1}What is a stationary distribution?}{53}{subsubsection.2.6.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.6.4.2}Computing the stationary distribution}{54}{subsubsection.2.6.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.6.4.3}When does a stationary distribution exist?}{54}{subsubsection.2.6.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.6.4.4}Detailed balance}{56}{subsubsection.2.6.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {2.7}Divergence measures between probability distributions}{56}{section.2.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.7.1}f-divergence}{57}{subsection.2.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.7.1.1}KL divergence}{57}{subsubsection.2.7.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.7.1.2}Alpha divergence}{57}{subsubsection.2.7.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.7.1.3}Hellinger distance}{58}{subsubsection.2.7.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.7.1.4}Chi-squared distance}{58}{subsubsection.2.7.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.7.2}Integral probability metrics}{58}{subsection.2.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.7.3}Maximum mean discrepancy (MMD)}{59}{subsection.2.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.7.3.1}MMD as an IPM}{59}{subsubsection.2.7.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.7.3.2}Computing the MMD using the kernel trick}{60}{subsubsection.2.7.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.7.3.3}Linear time computation}{61}{subsubsection.2.7.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {2.7.3.4}Choosing the right kernel}{61}{subsubsection.2.7.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.7.4}Total variation distance}{62}{subsection.2.7.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {2.7.5}Comparing distributions using binary classifiers}{62}{subsection.2.7.5}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {3}Bayesian statistics}{65}{chapter.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {3.1}Introduction}{65}{section.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.1.1}Frequentist statistics}{65}{subsection.3.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.1.2}Bayesian statistics}{65}{subsection.3.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.1.3}Arguments for the Bayesian approach}{66}{subsection.3.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.1.3.1}De Finetti's theorem}{66}{subsubsection.3.1.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.1.3.2}The Dutch book theorem}{67}{subsubsection.3.1.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.1.3.3}Online learning}{67}{subsubsection.3.1.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.1.4}Arguments against the Bayesian approach}{67}{subsection.3.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.1.5}Why not just use MAP estimation?}{67}{subsection.3.1.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.1.5.1}The MAP estimate gives no measure of uncertainty}{68}{subsubsection.3.1.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.1.5.2}The plugin approximation does not capture predictive uncertainty}{68}{subsubsection.3.1.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.1.5.3}The MAP estimate is often untypical of the posterior}{70}{subsubsection.3.1.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.1.5.4}The MAP estimate is only optimal for 0-1 loss}{71}{subsubsection.3.1.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.1.5.5}The MAP estimate is not invariant to reparameterization}{71}{subsubsection.3.1.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.1.5.6}MAP estimation cannot handle the cold-start problem}{71}{subsubsection.3.1.5.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {3.2}Closed-form analysis using conjugate priors}{72}{section.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.2.1}The binomial model}{72}{subsection.3.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.2.2}The multinomial model}{73}{subsection.3.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.2.3}The univariate Gaussian model}{74}{subsection.3.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.2.3.1}Posterior of $\mu $ given $\sigma ^2$}{74}{subsubsection.3.2.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.2.3.2}Posterior of $\sigma ^2$ given $\mu $}{76}{subsubsection.3.2.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.2.3.3}Posterior of $\mu $ and $\sigma ^2$: conjugate prior}{77}{subsubsection.3.2.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.2.3.4}Posterior of $\mu $ and $\sigma ^2$: uninformative prior}{78}{subsubsection.3.2.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {3.3}Conjugate Bayesian analysis for the multivariate Gaussian}{79}{section.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.3.1}Posterior of $\boldsymbol {\mu }$ given $\boldsymbol {\Sigma }$}{79}{subsection.3.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.3.2}Posterior of $\boldsymbol {\Sigma }$ given $\boldsymbol {\mu }$}{80}{subsection.3.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.2.1}Likelihood}{80}{subsubsection.3.3.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.2.2}Prior}{80}{subsubsection.3.3.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.2.3}Posterior}{81}{subsubsection.3.3.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.3.3}Posterior of $\boldsymbol {\Sigma }$ and $\boldsymbol {\mu }$}{81}{subsection.3.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.3.1}Likelihood}{81}{subsubsection.3.3.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.3.2}Prior}{82}{subsubsection.3.3.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.3.3}Posterior}{83}{subsubsection.3.3.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.3.4}Posterior marginals}{84}{subsubsection.3.3.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.3.5}Posterior predictive}{84}{subsubsection.3.3.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.3.6}Posterior of $\boldsymbol {\Lambda }$ and $\boldsymbol {\mu }$}{85}{subsubsection.3.3.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.3.4}Conjugate-exponential models}{85}{subsection.3.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.4.1}Likelihood}{85}{subsubsection.3.3.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.4.2}Prior}{85}{subsubsection.3.3.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.4.3}Posterior}{86}{subsubsection.3.3.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.4.4}Posterior predictive density}{86}{subsubsection.3.3.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.3.4.5}Example: Bernoulli distribution}{87}{subsubsection.3.3.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {3.4}Beyond conjugate priors}{87}{section.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.4.1}Robust (heavy-tailed) priors}{88}{subsection.3.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.4.2}Priors for variance parameters}{88}{subsection.3.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.4.2.1}Prior for variance terms}{88}{subsubsection.3.4.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.4.2.2}Priors for covariance matrices}{89}{subsubsection.3.4.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {3.5}Noninformative priors}{89}{section.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.5.1}Maximum entropy priors}{90}{subsection.3.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.5.2}Jeffreys priors}{91}{subsection.3.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.5.2.1}Jeffreys prior for binomial distribution}{92}{subsubsection.3.5.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.5.2.2}Jeffreys prior for multinomial distribution}{93}{subsubsection.3.5.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.5.2.3}Jeffreys prior for the mean and variance of a univariate Gaussian}{93}{subsubsection.3.5.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.5.3}Invariant priors}{94}{subsection.3.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.5.3.1}Translation-invariant priors}{94}{subsubsection.3.5.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.5.3.2}Scale-invariant prior}{94}{subsubsection.3.5.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.5.3.3}Learning invariant priors}{95}{subsubsection.3.5.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.5.4}Reference priors}{95}{subsection.3.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {3.6}Hierarchical priors}{95}{section.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.6.1}A hierarchical binomial model}{96}{subsection.3.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.6.2}A hierarchical Gaussian model}{98}{subsection.3.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.6.2.1}Example: the 8-schools dataset}{98}{subsubsection.3.6.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.6.2.2}Non-centered parameterization}{100}{subsubsection.3.6.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {3.7}Empirical Bayes}{101}{section.3.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.7.1}A hierarchical binomial model}{102}{subsection.3.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.7.2}A hierarchical Gaussian model}{103}{subsection.3.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.7.3}Hierarchical Bayes for n-gram smoothing}{104}{subsection.3.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {3.8}Model selection and evaluation}{106}{section.3.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.8.1}Bayesian model selection}{106}{subsection.3.8.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.8.2}Estimating the marginal likelihood}{107}{subsection.3.8.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.8.3}Connection between cross validation and marginal likelihood}{108}{subsection.3.8.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.8.4}Pareto-Smoothed Importance Sampling LOO estimate}{109}{subsection.3.8.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.8.5}Information criteria}{110}{subsection.3.8.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.8.5.1}Minimum description length (MDL)}{111}{subsubsection.3.8.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.8.5.2}The Bayesian information criterion (BIC)}{111}{subsubsection.3.8.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.8.5.3}Akaike information criterion}{112}{subsubsection.3.8.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.8.5.4}Widely applicable information criterion (WAIC)}{112}{subsubsection.3.8.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.8.6}Posterior predictive checks}{112}{subsection.3.8.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.8.6.1}Example: 1d Gaussian}{113}{subsubsection.3.8.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {3.8.7}Bayesian p-values}{113}{subsection.3.8.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {3.8.7.1}Example: linear regression}{115}{subsubsection.3.8.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {4}Probabilistic graphical models}{117}{chapter.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {4.1}Introduction}{117}{section.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {4.2}Directed graphical models (Bayes nets)}{117}{section.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.2.1}Representing the joint distribution}{117}{subsection.4.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.2.2}Examples}{118}{subsection.4.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.2.1}Markov chains}{118}{subsubsection.4.2.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.2.2}The ``Student'' network}{119}{subsubsection.4.2.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.2.3}Sigmoid belief nets}{121}{subsubsection.4.2.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.2.3}Gaussian Bayes nets}{122}{subsection.4.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.2.4}Conditional independence properties}{123}{subsection.4.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.4.1}Global Markov properties (d-separation)}{123}{subsubsection.4.2.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.4.2}Explaining away (Berkson's paradox)}{125}{subsubsection.4.2.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.4.3}Other Markov properties}{127}{subsubsection.4.2.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.4.4}Markov blankets and full conditionals}{127}{subsubsection.4.2.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.4.5}I-maps}{128}{subsubsection.4.2.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.2.5}Generation (sampling)}{128}{subsection.4.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.2.6}Inference}{128}{subsection.4.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.6.1}Example: inference in the Student network}{129}{subsubsection.4.2.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.2.7}Learning}{130}{subsection.4.2.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.7.1}Learning from complete data}{130}{subsubsection.4.2.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.7.2}Example: computing the MLE for CPTs}{131}{subsubsection.4.2.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.7.3}Example: Computing the posterior for CPTs}{132}{subsubsection.4.2.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.7.4}Learning from incomplete data}{133}{subsubsection.4.2.7.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.7.5}Using EM to fit CPTs in the incomplete data case}{133}{subsubsection.4.2.7.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.7.6}Using SGD to fit CPTs in the incomplete data case}{134}{subsubsection.4.2.7.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.2.8}Plate notation}{135}{subsection.4.2.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.8.1}Example: factor analysis}{136}{subsubsection.4.2.8.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.8.2}Example: Naive Bayes classifier}{137}{subsubsection.4.2.8.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.2.8.3}Example: relaxing the naive Bayes assumption}{137}{subsubsection.4.2.8.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {4.3}Undirected graphical models (Markov random fields)}{138}{section.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.3.1}Representing the joint distribution}{138}{subsection.4.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.1.1}Hammersley-Clifford theorem}{139}{subsubsection.4.3.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.1.2}Gibbs distribution}{140}{subsubsection.4.3.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.3.2}Fully visible MRFs (Ising, Potts, Hopfield, etc)}{140}{subsection.4.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.2.1}Ising models}{140}{subsubsection.4.3.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.2.2}Potts models}{142}{subsubsection.4.3.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.2.3}Potts models for protein structure prediction}{143}{subsubsection.4.3.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.2.4}Hopfield networks}{144}{subsubsection.4.3.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.3.3}MRFs with latent variables (Boltzmann machines, etc)}{146}{subsection.4.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.3.1}Vannilla Boltzmann machines}{146}{subsubsection.4.3.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.3.2}Restricted Boltzmann machines (RBMs)}{146}{subsubsection.4.3.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.3.3}Deep Boltzmann machines}{147}{subsubsection.4.3.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.3.4}Deep belief networks (DBNs)}{148}{subsubsection.4.3.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.3.4}Maximum entropy models}{148}{subsection.4.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.4.1}Log-linear models}{149}{subsubsection.4.3.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.4.2}Feature induction for a maxent spelling model}{149}{subsubsection.4.3.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.3.5}Gaussian MRFs}{150}{subsection.4.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.5.1}Standard GMRFs}{151}{subsubsection.4.3.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.5.2}Nonlinear Gaussian MRFs}{152}{subsubsection.4.3.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.3.6}Conditional independence properties}{153}{subsection.4.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.6.1}Basic results}{153}{subsubsection.4.3.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.6.2}An undirected alternative to d-separation}{154}{subsubsection.4.3.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.3.7}Generation (sampling)}{155}{subsection.4.3.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.3.8}Inference}{155}{subsection.4.3.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.3.9}Learning}{156}{subsection.4.3.9}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.9.1}Learning from complete data}{156}{subsubsection.4.3.9.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.9.2}Computational issues}{157}{subsubsection.4.3.9.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.9.3}Maximum pseudo-likelihood estimation}{158}{subsubsection.4.3.9.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.3.9.4}Learning from incomplete data}{159}{subsubsection.4.3.9.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {4.4}Conditional random fields (CRFs)}{159}{section.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.4.1}1d CRFs}{160}{subsection.4.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.4.1.1}Noun phrase chunking}{161}{subsubsection.4.4.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.4.1.2}Named entity recognition}{162}{subsubsection.4.4.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.4.1.3}Natural language parsing}{162}{subsubsection.4.4.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.4.2}2d CRFs}{163}{subsection.4.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.4.2.1}Semantic segmentation}{164}{subsubsection.4.4.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.4.2.2}Deformable parts models}{165}{subsubsection.4.4.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.4.3}Parameter estimation}{166}{subsection.4.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.4.3.1}Log-linear potentials}{166}{subsubsection.4.4.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.4.3.2}General case}{167}{subsubsection.4.4.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.4.4}Other approaches to structured prediction}{167}{subsection.4.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {4.5}Comparing directed and undirected PGMs}{167}{section.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.5.1}CI properties}{167}{subsection.4.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.5.2}Converting between a directed and undirected model}{169}{subsection.4.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.5.2.1}Converting a DPGM\xspace to a UPGM\xspace }{169}{subsubsection.4.5.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.5.2.2}Converting a UPGM\xspace to a DPGM\xspace }{170}{subsubsection.4.5.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.5.3}Conditional directed vs undirected PGMs and the label bias problem}{170}{subsection.4.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.5.4}Combining directed and undirected graphs}{171}{subsection.4.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.5.4.1}Chain graphs}{171}{subsubsection.4.5.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.5.4.2}Acyclic directed mixed graphs}{172}{subsubsection.4.5.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.5.5}Comparing directed and undirected Gaussian PGMs}{173}{subsection.4.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.5.5.1}Covariance graphs}{174}{subsubsection.4.5.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {4.6}PGM extensions}{175}{section.4.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.6.1}Factor graphs}{175}{subsection.4.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.6.1.1}Bipartite factor graphs}{175}{subsubsection.4.6.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.6.1.2}Forney factor graphs}{176}{subsubsection.4.6.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.6.2}Probabilistic circuits}{178}{subsection.4.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.6.3}Directed relational PGMs}{178}{subsection.4.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.6.4}Undirected relational PGMs}{180}{subsection.4.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.6.4.1}Collective classification}{181}{subsubsection.4.6.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.6.4.2}Markov logic networks}{181}{subsubsection.4.6.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.6.5}Open-universe probability models}{183}{subsection.4.6.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.6.6}Programs as probability models}{184}{subsection.4.6.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {4.7}Structural causal models}{184}{section.4.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.7.1}Example: causal impact of education on wealth}{185}{subsection.4.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.7.2}Structural equation models}{186}{subsection.4.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.7.3}Do operator and augmented DAGs}{187}{subsection.4.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.7.4}Estimating average treatment effect using path analysis}{188}{subsection.4.7.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.7.4.1}Direct effect}{188}{subsubsection.4.7.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {4.7.4.2}Indirect effect (mediation analysis)}{189}{subsubsection.4.7.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {4.7.5}Counterfactuals}{189}{subsection.4.7.5}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {5}Information theory}{193}{chapter.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {5.1}KL divergence}{193}{section.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.1.1}Desiderata}{194}{subsection.5.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.1.2}The KL divergence uniquely satisfies the desiderata}{195}{subsection.5.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.1.2.1}Continuity of KL}{195}{subsubsection.5.1.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.1.2.2}Non-negativity of KL divergence}{196}{subsubsection.5.1.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.1.2.3}KL divergence is invariant to reparameterizations}{196}{subsubsection.5.1.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.1.2.4}Montonicity for uniform distributions}{197}{subsubsection.5.1.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.1.2.5}Chain rule for KL divergence}{197}{subsubsection.5.1.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.1.3}Thinking about KL}{198}{subsection.5.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.1.3.1}Units of KL}{198}{subsubsection.5.1.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.1.3.2}Asymmetry of the KL divergence}{198}{subsubsection.5.1.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.1.3.3}KL as expected weight of evidence}{199}{subsubsection.5.1.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.1.4}Properties of KL}{200}{subsection.5.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.1.4.1}Compression Lemma}{200}{subsubsection.5.1.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.1.4.2}Data processing inequality for KL}{201}{subsubsection.5.1.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.1.5}KL divergence and MLE}{202}{subsection.5.1.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.1.6}KL divergence and Bayesian Inference}{203}{subsection.5.1.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.1.7}KL divergence and Exponential Families}{204}{subsection.5.1.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.1.7.1}Example: KL divergence between two Gaussians}{204}{subsubsection.5.1.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.1.8}Bregman divergence}{205}{subsection.5.1.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.1.8.1}KL is a Bregman divergence}{206}{subsubsection.5.1.8.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {5.2}Entropy}{206}{section.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.2.1}Definition}{206}{subsection.5.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.2.2}Differential entropy for continuous random variables}{207}{subsection.5.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.2.3}Typical sets}{208}{subsection.5.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.2.4}Cross entropy and perplexity}{209}{subsection.5.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {5.3}Mutual information}{210}{section.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.3.1}Definition}{210}{subsection.5.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.3.2}Interpretation}{210}{subsection.5.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.3.3}Data processing inequality}{211}{subsection.5.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.3.4}Sufficient Statistics}{212}{subsection.5.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.3.5}Multivariate mutual information}{212}{subsection.5.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.3.5.1}Total correlation}{212}{subsubsection.5.3.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.3.5.2}Interaction information (co-information)}{213}{subsubsection.5.3.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.3.5.3}Synergy and redundancy}{214}{subsubsection.5.3.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.3.5.4}MMI and causality}{214}{subsubsection.5.3.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.3.5.5}MMI and entropy}{214}{subsubsection.5.3.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.3.6}Variational bounds on mutual information}{215}{subsection.5.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.3.6.1}Upper bound}{215}{subsubsection.5.3.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.3.6.2}BA lower bound}{216}{subsubsection.5.3.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.3.6.3}NWJ lower bound}{216}{subsubsection.5.3.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {5.3.6.4}InfoNCE lower bound}{217}{subsubsection.5.3.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.3.7}Relevance networks}{217}{subsection.5.3.7}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {5.4}Data compression (source coding)}{218}{section.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.4.1}Lossless compression}{219}{subsection.5.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.4.2}Lossy compression and the rate-distortion tradeoff}{219}{subsection.5.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.4.3}Bits back coding}{221}{subsection.5.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {5.5}Error-correcting codes (channel coding)}{222}{section.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {5.6}The information bottleneck}{224}{section.5.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.6.1}Vanilla IB}{224}{subsection.5.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.6.2}Variational IB}{225}{subsection.5.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {5.6.3}Conditional entropy bottleneck}{226}{subsection.5.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {6}Optimization}{229}{chapter.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {6.1}Introduction}{229}{section.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {6.2}Automatic differentiation}{229}{section.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.2.1}Differentiation in functional form}{229}{subsection.6.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Linear and multilinear functions.}{230}{section*.129}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{The derivative operator.}{231}{section*.130}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Higher-order derivatives.}{231}{section*.131}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Multiple inputs.}{232}{section*.132}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Composition and fan-out.}{233}{section*.133}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.2.2}Differentiating chains, circuits, and programs}{234}{subsection.6.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.2.2.1}Chain compositions and the chain rule}{234}{subsubsection.6.2.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.2.2.2}From chains to circuits}{236}{subsubsection.6.2.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.2.2.3}From circuits to programs}{238}{subsubsection.6.2.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {6.3}Stochastic gradient descent}{239}{section.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {6.4}Natural gradient descent}{240}{section.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.4.1}Defining the natural gradient}{240}{subsection.6.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.4.2}Interpretations of NGD}{241}{subsection.6.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.4.2.1}NGD as a trust region method}{241}{subsubsection.6.4.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.4.2.2}NGD as a Gauss-Newton method}{242}{subsubsection.6.4.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.4.3}Benefits of NGD}{242}{subsection.6.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.4.4}Approximating the natural gradient}{243}{subsection.6.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.4.5}Natural gradients for the exponential family}{244}{subsection.6.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.4.5.1}Analytic computation for the Gaussian case}{245}{subsubsection.6.4.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.4.5.2}Stochastic approximation for the general case}{245}{subsubsection.6.4.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.4.5.3}Natural gradient of the entropy function}{246}{subsubsection.6.4.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {6.5}Gradients of stochastic functions}{246}{section.6.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.5.1}Minibatch approximation to finite-sum objectives}{247}{subsection.6.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.5.2}Optimizing parameters of a distribution}{247}{subsection.6.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.5.3}Score function estimator (likelihood ratio trick)}{248}{subsection.6.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.5.3.1}Control variates}{248}{subsubsection.6.5.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.5.3.2}Rao-Blackwellisation}{249}{subsubsection.6.5.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.5.4}Reparameterization trick}{249}{subsection.6.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.5.4.1}Example}{249}{subsubsection.6.5.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.5.4.2}Total derivative}{250}{subsubsection.6.5.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.5.5}The delta method}{251}{subsection.6.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.5.6}Gumbel softmax trick}{251}{subsection.6.5.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.5.7}Stochastic computation graphs}{252}{subsection.6.5.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.5.8}Straight-through estimator}{252}{subsection.6.5.8}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {6.6}Bound optimization (MM) algorithms}{253}{section.6.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.6.1}The general algorithm}{253}{subsection.6.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.6.2}Example: logistic regression}{254}{subsection.6.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.6.3}The EM algorithm}{256}{subsection.6.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.6.3.1}Lower bound}{256}{subsubsection.6.6.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.6.3.2}E step}{257}{subsubsection.6.6.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.6.3.3}M step}{257}{subsubsection.6.6.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.6.4}Example: EM for an MVN with missing data}{258}{subsection.6.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.6.4.1}E step}{258}{subsubsection.6.6.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.6.4.2}M step}{259}{subsubsection.6.6.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.6.4.3}Initialization}{259}{subsubsection.6.6.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.6.4.4}Example}{259}{subsubsection.6.6.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.6.5}Example: robust linear regression using Student-$t$ likelihood}{260}{subsection.6.6.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.6.6}Extensions to EM}{261}{subsection.6.6.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.6.6.1}Variational EM}{261}{subsubsection.6.6.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.6.6.2}Hard EM}{262}{subsubsection.6.6.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.6.6.3}Monte Carlo EM}{262}{subsubsection.6.6.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.6.6.4}Generalized EM}{262}{subsubsection.6.6.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.6.6.5}ECM algorithm}{263}{subsubsection.6.6.6.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.6.6.6}Online EM}{263}{subsubsection.6.6.6.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {6.7}The Bayesian learning rule}{263}{section.6.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.7.1}Deriving inference algorithms from BLR}{264}{subsection.6.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.7.1.1}Bayesian inference as optimization}{265}{subsubsection.6.7.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.7.1.2}Optimization of BLR using natural gradient descent}{265}{subsubsection.6.7.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.7.1.3}Conjugate variational inference}{266}{subsubsection.6.7.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.7.1.4}Partially conjugate variational inference}{266}{subsubsection.6.7.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.7.2}Deriving optimization algorithms from BLR}{266}{subsection.6.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.7.2.1}Gradient descent}{267}{subsubsection.6.7.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.7.2.2}Newton's method}{267}{subsubsection.6.7.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.7.2.3}Variational online Gauss-Newton}{268}{subsubsection.6.7.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.7.2.4}Adaptive learning rate SGD}{269}{subsubsection.6.7.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.7.3}Variational optimization}{270}{subsection.6.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {6.8}Bayesian optimization}{270}{section.6.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.8.1}Sequential model-based optimization}{271}{subsection.6.8.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.8.2}Surrogate functions}{272}{subsection.6.8.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.2.1}Gaussian processes}{272}{subsubsection.6.8.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.2.2}Bayesian neural networks}{273}{subsubsection.6.8.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.2.3}Other models}{273}{subsubsection.6.8.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.8.3}Acquisition functions}{273}{subsection.6.8.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.3.1}Probability of improvement}{273}{subsubsection.6.8.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.3.2}Expected improvement}{273}{subsubsection.6.8.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.3.3}Upper confidence bound (UCB)}{274}{subsubsection.6.8.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.3.4}Thompson sampling}{275}{subsubsection.6.8.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.3.5}Entropy search}{275}{subsubsection.6.8.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.3.6}Knowledge gradient}{276}{subsubsection.6.8.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.3.7}Optimizing the acquisition function}{276}{subsubsection.6.8.3.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.8.4}Other issues}{276}{subsection.6.8.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.4.1}Parallel (batch) queries}{276}{subsubsection.6.8.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.4.2}Conditional parameters}{276}{subsubsection.6.8.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.4.3}Multi-fidelity surrogates}{277}{subsubsection.6.8.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.8.4.4}Constraints}{277}{subsubsection.6.8.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {6.9}Derivative free optimization}{277}{section.6.9}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.9.1}Local search}{277}{subsection.6.9.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.9.1.1}Stochastic local search}{278}{subsubsection.6.9.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.9.1.2}Tabu search}{278}{subsubsection.6.9.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.9.1.3}Random search}{279}{subsubsection.6.9.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.9.2}Simulated annealing}{280}{subsection.6.9.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.9.3}Evolutionary algorithms}{283}{subsection.6.9.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.9.4}Estimation of distribution (EDA) algorithms}{285}{subsection.6.9.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.9.5}Cross-entropy method}{287}{subsection.6.9.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.9.5.1}Differentiable CEM}{287}{subsubsection.6.9.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.9.6}Evolutionary strategies}{288}{subsection.6.9.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.9.6.1}Natural evolutionary strategies}{288}{subsubsection.6.9.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.9.6.2}CMA-ES}{288}{subsubsection.6.9.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {6.10}Optimal Transport}{288}{section.6.10}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.10.1}Warm-up: Matching optimally two families of points}{288}{subsection.6.10.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.10.2}From Optimal Matchings to Kantorovich and Monge formulations}{290}{subsection.6.10.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.10.2.1}Mass splitting}{290}{subsubsection.6.10.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.10.2.2}Monge formulation and optimal push-forward maps}{291}{subsubsection.6.10.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.10.2.3}Kantorovich formulation}{292}{subsubsection.6.10.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.10.2.4}Wasserstein distances}{292}{subsubsection.6.10.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.10.3}Solving optimal transport}{292}{subsection.6.10.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.10.3.1}Duality and cost concavity}{292}{subsubsection.6.10.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.10.3.2}Kantorovich-Rubinstein duality and Lipschitz potentials}{293}{subsubsection.6.10.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.10.3.3}Monge maps as gradients of convex functions: the Brenier theorem}{294}{subsubsection.6.10.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.10.3.4}Closed forms for univariate and Gaussian distributions}{295}{subsubsection.6.10.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.10.3.5}Exact evaluation using linear program solvers}{295}{subsubsection.6.10.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.10.3.6}Obtaining smoothness using entropic regularization}{296}{subsubsection.6.10.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {6.11}Submodular optimization}{297}{section.6.11}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.11.1}Intuition, Examples, and Background}{298}{subsection.6.11.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.11.1.1}Coffee, Lemon, Milk, Tea}{298}{subsubsection.6.11.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.11.2}Submodular Basic Definitions}{300}{subsection.6.11.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.11.3}Example Submodular Functions}{301}{subsection.6.11.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.11.4}Submodular Optimization}{304}{subsection.6.11.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.11.4.1}Submodular Maximization}{304}{subsubsection.6.11.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.11.4.2}Discrete Constraints}{306}{subsubsection.6.11.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.11.4.3}Submodular Function Minimization}{307}{subsubsection.6.11.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.11.5}Applications of Submodularity in Machine Learning and AI}{308}{subsection.6.11.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.11.6}Sketching, CoreSets, Distillation, and Data Subset \& Feature Selection}{308}{subsection.6.11.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {6.11.6.1}Summarization Algorithm Design Choices}{309}{subsubsection.6.11.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.11.7}Combinatorial Information Functions}{312}{subsection.6.11.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.11.8}Clustering, Data Partitioning, and Parallel Machine Learning}{313}{subsection.6.11.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.11.9}Active and Semi-Supervised Learning}{314}{subsection.6.11.9}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.11.10}Probabilistic Modeling}{315}{subsection.6.11.10}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.11.11}Structured Norms and Loss Functions}{316}{subsection.6.11.11}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {6.11.12}Conclusions}{316}{subsection.6.11.12}
\defcounter {refsection}{0}\relax 
\contentsline {part}{II\hspace {1em}Inference}{317}{part.2}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {7}Inference algorithms: an overview}{319}{chapter.7}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {7.1}Introduction}{319}{section.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {7.2}Common inference patterns}{319}{section.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {7.2.1}Global latents}{320}{subsection.7.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {7.2.2}Local latents}{320}{subsection.7.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {7.2.3}Global and local latents}{321}{subsection.7.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {7.3}Exact inference algorithms}{321}{section.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {7.4}Approximate inference algorithms}{322}{section.7.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {7.4.1}MAP estimation}{322}{subsection.7.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {7.4.2}Grid approximation}{322}{subsection.7.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {7.4.3}Laplace (quadratic) approximation}{323}{subsection.7.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {7.4.4}Variational inference}{324}{subsection.7.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {7.4.5}Markov Chain Monte Carlo (MCMC)}{326}{subsection.7.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {7.4.6}Sequential Monte Carlo}{327}{subsection.7.4.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {7.4.7}Challenging posteriors}{328}{subsection.7.4.7}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {7.5}Evaluating approximate inference algorithms}{328}{section.7.5}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {8}Inference for state-space models}{331}{chapter.8}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {8.1}Introduction}{331}{section.8.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {8.2}Inference for discrete chains}{331}{section.8.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.2.1}Example: casino HMM}{332}{subsection.8.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.2.2}Forwards filtering}{333}{subsection.8.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.2.1}Prediction step}{333}{subsubsection.8.2.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.2.2}Update step}{334}{subsubsection.8.2.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.2.3}Implementation}{334}{subsubsection.8.2.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.2.4}Example}{335}{subsubsection.8.2.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.2.3}Backwards smoothing}{335}{subsection.8.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.3.1}Backwards recursion}{335}{subsubsection.8.2.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.3.2}Implementation}{336}{subsubsection.8.2.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.3.3}Example}{337}{subsubsection.8.2.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.2.4}The forwards-backwards algorithm}{337}{subsection.8.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.4.1}Backwards recursion}{337}{subsubsection.8.2.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.4.2}Two-slice smoothed marginals}{338}{subsubsection.8.2.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.2.5}Numerically stable implementation}{338}{subsection.8.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.2.6}Time and space complexity}{340}{subsection.8.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.2.7}The Viterbi algorithm}{340}{subsection.8.2.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.7.1}Forwards pass}{341}{subsubsection.8.2.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.7.2}Backwards pass}{342}{subsubsection.8.2.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.7.3}Example}{342}{subsubsection.8.2.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.7.4}Time and space complexity}{343}{subsubsection.8.2.7.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.2.7.5}N-best list}{343}{subsubsection.8.2.7.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.2.8}Forwards filtering, backwards sampling}{343}{subsection.8.2.8}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {8.3}Inference for linear-Gaussian chains}{344}{section.8.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.3.1}Examples}{344}{subsection.8.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.3.1.1}Tracking and state estimation}{344}{subsubsection.8.3.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.3.1.2}Recursive least squares}{346}{subsubsection.8.3.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.3.1.3}Adaptive filtering}{347}{subsubsection.8.3.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.3.2}The Kalman filter}{347}{subsection.8.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.3.2.1}Predict step}{348}{subsubsection.8.3.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.3.2.2}Update step}{348}{subsubsection.8.3.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.3.2.3}Posterior predictive}{349}{subsubsection.8.3.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.3.2.4}Steady state solution}{349}{subsubsection.8.3.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.3.2.5}Derivation of the Kalman filter}{349}{subsubsection.8.3.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.3.3}The Kalman (RTS) smoother}{351}{subsection.8.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.3.3.1}Algorithm}{351}{subsubsection.8.3.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.3.3.2}Two-filter smoothing}{351}{subsubsection.8.3.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.3.3.3}Time and space complexity}{351}{subsubsection.8.3.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.3.3.4}Forwards filtering backwards sampling}{352}{subsubsection.8.3.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.3.4}Information form filtering and smoothing}{352}{subsection.8.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {8.4}Inference for non-linear and/or non-Gaussian chains}{352}{section.8.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {8.5}Inference based on local linearization}{353}{section.8.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.5.1}Taylor series expansion}{354}{subsection.8.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.5.2}The extended Kalman filter (EKF)}{356}{subsection.8.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.5.2.1}Algorithm}{356}{subsubsection.8.5.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.5.2.2}Derivation}{356}{subsubsection.8.5.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.5.2.3}Accuracy}{357}{subsubsection.8.5.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.5.2.4}Diagonal approximation}{357}{subsubsection.8.5.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.5.3}The extended Kalman smoother}{358}{subsection.8.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {8.6}Inference based on the unscented transform}{359}{section.8.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.6.1}The unscented transform}{359}{subsection.8.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.6.2}The unscented Kalman filter (UKF)}{361}{subsection.8.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.6.2.1}Prediction step}{361}{subsubsection.8.6.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.6.2.2}Update step}{362}{subsubsection.8.6.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.6.2.3}Example: noisy 2d tracking problem}{363}{subsubsection.8.6.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.6.3}The unscented Kalman smoother}{363}{subsection.8.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {8.7}Other variants of the Kalman filter}{363}{section.8.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.7.1}Ensemble Kalman filter}{363}{subsection.8.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.7.2}Robust Kalman filters}{365}{subsection.8.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.7.3}Gaussian filtering}{365}{subsection.8.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.7.3.1}The Gaussian approximation}{365}{subsubsection.8.7.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.7.3.2}Application to online filtering}{366}{subsubsection.8.7.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {8.7.3.3}Deriving EKF, UKF, and QKF}{367}{subsubsection.8.7.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {8.8}Assumed density filtering}{367}{section.8.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.8.1}Gaussian sum filter}{368}{subsection.8.8.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.8.2}ADF for logistic regression}{370}{subsection.8.8.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {8.8.3}ADF for DNNs}{373}{subsection.8.8.3}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {9}Inference for graphical models}{375}{chapter.9}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {9.1}Introduction}{375}{section.9.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {9.2}Belief propagation on trees}{376}{section.9.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.2.1}BP for polytrees}{376}{subsection.9.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.2.1.1}Computing the messages}{378}{subsubsection.9.2.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.2.1.2}Message passing protocol}{379}{subsubsection.9.2.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.2.2}BP for undirected graphs with pairwise potentials}{379}{subsection.9.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.2.3}Max product belief propagation}{380}{subsection.9.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.2.3.1}Connection between MMM and MAP}{381}{subsubsection.9.2.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.2.3.2}Connection between MPM and MAP}{381}{subsubsection.9.2.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.2.3.3}Connection between MPE and MAP}{382}{subsubsection.9.2.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {9.3}Loopy belief propagation}{382}{section.9.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.3.1}Loopy BP for factor graphs}{383}{subsection.9.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.3.2}Gaussian belief propagation}{384}{subsection.9.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.3.3}Convergence}{385}{subsection.9.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.3.3.1}When will LBP converge?}{385}{subsubsection.9.3.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.3.3.2}Making LBP converge}{386}{subsubsection.9.3.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.3.3.3}Increasing the convergence rate with adaptive scheduling}{387}{subsubsection.9.3.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.3.4}Accuracy}{388}{subsection.9.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.3.5}Connection with variational inference}{388}{subsection.9.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.3.6}Generalized belief propagation}{388}{subsection.9.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.3.7}Application: error correcting codes}{389}{subsection.9.3.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.3.8}Application: Affinity propagation}{390}{subsection.9.3.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.3.9}Emulating BP with graph neural nets}{391}{subsection.9.3.9}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {9.4}The variable elimination (VE) algorithm}{392}{section.9.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.4.1}Derivation of the algorithm}{392}{subsection.9.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.4.2}Computational complexity of VE}{394}{subsection.9.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.4.3}Computational complexity of exact inference}{396}{subsection.9.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.4.4}Drawbacks of VE}{397}{subsection.9.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {9.5}The junction tree algorithm (JTA)}{398}{section.9.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.5.1}Creating a junction tree}{398}{subsection.9.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.1.1}What is a tree decomposition?}{398}{subsubsection.9.5.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.1.2}Why create a tree decomposition?}{399}{subsubsection.9.5.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.1.3}Computing a tree decomposition}{399}{subsubsection.9.5.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.1.4}Picking a good elimination order}{401}{subsubsection.9.5.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.1.5}Computing a jtree from a directed graphical model}{402}{subsubsection.9.5.1.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.5.2}Message passing on a junction tree}{402}{subsection.9.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.2.1}Discrete potential functions}{402}{subsubsection.9.5.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.2.2}Initialization}{404}{subsubsection.9.5.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.2.3}Calibration}{404}{subsubsection.9.5.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.5.3}Gaussian message passing}{405}{subsection.9.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.3.1}Moment and canonical parameterization}{405}{subsubsection.9.5.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.3.2}Multiplication and division}{406}{subsubsection.9.5.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.3.3}Marginalization}{406}{subsubsection.9.5.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.3.4}Conditioning on evidence}{406}{subsubsection.9.5.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.3.5}Initialization}{407}{subsubsection.9.5.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.5.3.6}Product of Gaussians}{407}{subsubsection.9.5.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.5.4}The generalized distributive law}{408}{subsection.9.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {9.6}Inference as optimization}{409}{section.9.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.6.1}Inference as backpropagation}{409}{subsection.9.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.6.1.1}Example: inference in a small model}{409}{subsubsection.9.6.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.6.1.2}Example: Inference in an HMM}{410}{subsubsection.9.6.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {9.6.2}Perturb and MAP}{411}{subsection.9.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.6.2.1}Gaussian case}{411}{subsubsection.9.6.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {9.6.2.2}Discrete case}{412}{subsubsection.9.6.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {10}Variational inference}{413}{chapter.10}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {10.1}Introduction}{413}{section.10.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.1.1}Variational free energy}{413}{subsection.10.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.1.2}Evidence lower bound (ELBO)}{414}{subsection.10.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {10.2}Mean field VI}{415}{section.10.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.2.1}Coordinate ascent variational inference (CAVI)}{415}{subsection.10.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.2.2}Example: CAVI for the Ising model}{416}{subsection.10.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.2.3}Variational Bayes}{418}{subsection.10.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.2.4}Example: VB for a univariate Gaussian}{419}{subsection.10.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.2.4.1}Target distribution}{420}{subsubsection.10.2.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.2.4.2}Updating $q(\mu |\boldsymbol {\psi }_{\mu })$}{420}{subsubsection.10.2.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.2.4.3}Updating $q(\lambda |\boldsymbol {\psi }_{\lambda })$}{420}{subsubsection.10.2.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.2.4.4}Computing the expectations}{421}{subsubsection.10.2.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.2.4.5}Illustration}{421}{subsubsection.10.2.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.2.4.6}Lower bound}{421}{subsubsection.10.2.4.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.2.5}Variational Bayes EM}{422}{subsection.10.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.2.6}Example: VBEM for a GMM}{423}{subsection.10.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.2.6.1}The variational posterior}{424}{subsubsection.10.2.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.2.6.2}Derivation of $q({\bm {\theta }})$ (variational M step)}{424}{subsubsection.10.2.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.2.6.3}Derivation of $q({\bm {z}})$ (variational E step)}{425}{subsubsection.10.2.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.2.6.4}Automatic sparsity inducing effects of VBEM}{426}{subsubsection.10.2.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.2.6.5}Lower bound on the marginal likelihood}{428}{subsubsection.10.2.6.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.2.6.6}Model selection using VBEM}{429}{subsubsection.10.2.6.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.2.7}Variational message passing (VMP)}{429}{subsection.10.2.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.2.8}Autoconj}{430}{subsection.10.2.8}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {10.3}Fixed-form VI}{430}{section.10.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.3.1}Stochastic variational inference}{430}{subsection.10.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.3.2}Black-box variational inference}{431}{subsection.10.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.3.3}Reparameterization VI}{433}{subsection.10.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.3.3.1}``Sticking the landing'' estimator}{433}{subsubsection.10.3.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.3.4}Full-rank Gaussian VI}{434}{subsection.10.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.3.5}Low-rank Gaussian VI}{434}{subsection.10.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.3.5.1}Example: GVI for the mean of an MVN}{435}{subsubsection.10.3.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.3.5.2}Example: GVI for logistic regression}{436}{subsubsection.10.3.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.3.6}Automatic differentiation VI}{436}{subsection.10.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.3.6.1}Basic idea}{436}{subsubsection.10.3.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.3.6.2}Example: ADVI for Beta-Binomial model}{437}{subsubsection.10.3.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.3.6.3}Example: ADVI for the covariance of an MVN}{437}{subsubsection.10.3.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.3.6.4}Example: ADVI for GMMs}{438}{subsubsection.10.3.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.3.6.5}Example: ADVI for sparse Bayesian linear regression}{439}{subsubsection.10.3.6.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.3.6.6}More complex posteriors}{440}{subsubsection.10.3.6.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.3.7}Sparse Gaussian VI}{440}{subsection.10.3.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.3.7.1}Sparsity of the ELBO}{440}{subsubsection.10.3.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.3.7.2}Sparsity of the natural gradient of the ELBO}{441}{subsubsection.10.3.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.3.7.3}Computing posterior expectations}{441}{subsubsection.10.3.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.3.7.4}Gaussian VI for nonlinear least squares problems}{441}{subsubsection.10.3.7.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.3.8}Non-Gaussian reparameterized VI}{442}{subsection.10.3.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.3.9}Amortized inference}{443}{subsection.10.3.9}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {10.4}More accurate variational posteriors}{445}{section.10.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.4.1}Structured mean field}{445}{subsection.10.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.4.2}Hierarchical (auxiliary variable) posteriors}{445}{subsection.10.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.4.3}Normalizing flow posteriors}{446}{subsection.10.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.4.4}Implicit posteriors}{448}{subsection.10.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.4.5}Combining VI with MCMC inference}{448}{subsection.10.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {10.5}Lower bounds}{448}{section.10.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.5.1}Multi-sample ELBO (IWAE bound)}{448}{subsection.10.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.5.1.1}Pathologies of optimizing the IWAE bound}{449}{subsubsection.10.5.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.5.2}The thermodynamic variational objective (TVO)}{449}{subsection.10.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {10.6}Upper bounds}{450}{section.10.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.6.1}Minimizing the $\chi $-divergence upper bound}{451}{subsection.10.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.6.2}Minimizing the evidence upper bound}{452}{subsection.10.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {10.7}Expectation propagation (EP)}{452}{section.10.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.7.1}Minimizing forwards vs reverse KL}{452}{subsection.10.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.7.1.1}Moment projection}{453}{subsubsection.10.7.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {10.7.1.2}Information projection}{454}{subsubsection.10.7.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.7.2}EP as generalized ADF}{454}{subsection.10.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.7.3}Algorithm}{455}{subsection.10.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.7.4}Example}{456}{subsection.10.7.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.7.5}Optimization issues}{457}{subsection.10.7.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.7.6}Power EP and $\alpha $-divergence}{457}{subsection.10.7.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.7.7}Stochastic EP}{457}{subsection.10.7.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {10.7.8}Applications}{458}{subsection.10.7.8}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {11}Monte Carlo inference}{459}{chapter.11}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {11.1}Introduction}{459}{section.11.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {11.2}Monte Carlo integration}{459}{section.11.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.2.1}Example: estimating $\pi $ by Monte Carlo integration}{460}{subsection.11.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.2.2}Accuracy of Monte Carlo integration}{460}{subsection.11.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {11.3}Generating random samples from simple distributions}{462}{section.11.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.3.1}Sampling using the inverse cdf}{462}{subsection.11.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.3.2}Sampling from a Gaussian (Box-Muller method)}{463}{subsection.11.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {11.4}Rejection sampling}{463}{section.11.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.4.1}Basic idea}{464}{subsection.11.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.4.2}Example}{465}{subsection.11.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.4.3}Adaptive rejection sampling}{465}{subsection.11.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.4.4}Rejection sampling in high dimensions}{466}{subsection.11.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {11.5}Importance sampling}{466}{section.11.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.5.1}Direct importance sampling}{467}{subsection.11.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.5.2}Self-normalized importance sampling}{467}{subsection.11.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.5.3}Choosing the proposal}{468}{subsection.11.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.5.4}Annealed importance sampling (AIS)}{469}{subsection.11.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {11.5.4.1}Estimating normalizing constants using AIS}{470}{subsubsection.11.5.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {11.6}Controlling Monte Carlo variance}{470}{section.11.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.6.1}Common random numbers}{470}{subsection.11.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.6.2}Rao-Blackwellisation}{470}{subsection.11.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.6.3}Control variates}{471}{subsection.11.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {11.6.3.1}Example}{472}{subsubsection.11.6.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.6.4}Antithetic sampling}{472}{subsection.11.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {11.6.4.1}Example}{472}{subsubsection.11.6.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {11.6.5}Quasi Monte Carlo (QMC)}{473}{subsection.11.6.5}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {12}Markov Chain Monte Carlo inference}{475}{chapter.12}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {12.1}Introduction}{475}{section.12.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {12.2}Metropolis Hastings algorithm}{476}{section.12.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.2.1}Basic idea}{476}{subsection.12.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.2.2}Why MH works}{477}{subsection.12.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.2.3}Proposal distributions}{478}{subsection.12.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.2.3.1}Independence sampler}{478}{subsubsection.12.2.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.2.3.2}Random walk Metropolis (RWM) algorithm}{479}{subsubsection.12.2.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.2.3.3}Composing proposals}{480}{subsubsection.12.2.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.2.3.4}Data-driven MCMC}{480}{subsubsection.12.2.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.2.3.5}Adaptive MCMC}{480}{subsubsection.12.2.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.2.4}Initialization}{481}{subsection.12.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {12.3}Gibbs sampling}{481}{section.12.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.3.1}Basic idea}{481}{subsection.12.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.3.2}Gibbs sampling is a special case of MH}{482}{subsection.12.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.3.3}Example: Gibbs sampling for Ising models}{482}{subsection.12.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.3.4}Example: Gibbs sampling for Potts models}{484}{subsection.12.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.3.5}Example: Gibbs sampling for GMMs}{484}{subsection.12.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.3.5.1}Known parameters}{484}{subsubsection.12.3.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.3.5.2}Unknown parameters}{486}{subsubsection.12.3.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.3.6}Sampling from the full conditionals}{486}{subsection.12.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.3.6.1}Adaptive rejection Metropolis sampling}{487}{subsubsection.12.3.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.3.6.2}Metropolis within Gibbs}{487}{subsubsection.12.3.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.3.7}Blocked Gibbs sampling}{487}{subsection.12.3.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.3.7.1}Example: Blocked Gibbs for HMMs}{488}{subsubsection.12.3.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.3.8}Collapsed Gibbs sampling}{488}{subsection.12.3.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.3.8.1}Example: collapsed Gibbs for GMMs}{488}{subsubsection.12.3.8.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.3.8.2}Example: Collapsed Gibbs sampling for LDA}{490}{subsubsection.12.3.8.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {12.4}Auxiliary variable MCMC}{490}{section.12.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.4.1}Slice sampling}{491}{subsection.12.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.4.2}Swendsen Wang}{492}{subsection.12.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {12.5}Hamiltonian Monte Carlo (HMC)}{494}{section.12.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.5.1}Hamiltonian mechanics}{494}{subsection.12.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.5.2}Integrating Hamilton's equations}{495}{subsection.12.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.5.2.1}Euler's method}{495}{subsubsection.12.5.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.5.2.2}Modified Euler's method}{495}{subsubsection.12.5.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.5.2.3}Leapfrog integrator}{496}{subsubsection.12.5.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.5.2.4}Higher order integrators}{496}{subsubsection.12.5.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.5.3}The HMC algorithm}{496}{subsection.12.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.5.4}Tuning HMC}{497}{subsection.12.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.5.4.1}Choosing the number of steps using NUTS}{497}{subsubsection.12.5.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.5.4.2}Choosing the step size}{497}{subsubsection.12.5.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.5.4.3}Choosing the covariance (inverse mass) matrix}{498}{subsubsection.12.5.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.5.5}Riemann Manifold HMC}{498}{subsection.12.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.5.6}Langevin Monte Carlo (MALA)}{499}{subsection.12.5.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.5.7}Connection between SGD and Langevin sampling}{500}{subsection.12.5.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.5.8}Applying HMC to constrained parameters}{502}{subsection.12.5.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.5.9}Speeding up HMC}{503}{subsection.12.5.9}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {12.6}MCMC convergence}{503}{section.12.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.6.1}Mixing rates of Markov chains}{504}{subsection.12.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.6.2}Practical convergence diagnostics}{504}{subsection.12.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.6.2.1}Trace plots}{505}{subsubsection.12.6.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.6.2.2}Estimated potential scale reduction (EPSR)}{506}{subsubsection.12.6.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.6.2.3}Effective sample size}{508}{subsubsection.12.6.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {12.6.2.4}Estimating the ACF from multiple chains using variograms}{511}{subsubsection.12.6.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.6.3}Improving speed of convergence}{512}{subsection.12.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.6.4}Non-centered parameterizations and Neal's funnel}{512}{subsection.12.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {12.7}Stochastic gradient MCMC}{513}{section.12.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.7.1}Stochastic Gradient Langevin Dynamics (SGLD)}{514}{subsection.12.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.7.2}Preconditionining}{514}{subsection.12.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.7.3}Reducing the variance of the gradient estimate}{515}{subsection.12.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.7.4}SG-HMC}{516}{subsection.12.7.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.7.5}Underdamped Langevin Dynamics}{517}{subsection.12.7.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {12.8}Reversible jump (trans-dimensional) MCMC}{518}{section.12.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.8.1}Basic idea}{518}{subsection.12.8.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.8.2}Example}{520}{subsection.12.8.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.8.3}Discussion}{521}{subsection.12.8.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {12.9}Annealing methods}{521}{section.12.9}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {12.9.1}Parallel tempering}{522}{subsection.12.9.1}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {13}Sequential Monte Carlo inference}{523}{chapter.13}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {13.1}Introduction}{523}{section.13.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.1.1}Problem statement}{523}{subsection.13.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.1.2}Particle filtering for state-space models}{523}{subsection.13.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.1.3}SMC samplers for static parameter estimation}{525}{subsection.13.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {13.2}Particle filtering}{525}{section.13.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.2.1}Importance sampling}{525}{subsection.13.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.2.2}Sequential importance sampling}{526}{subsection.13.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.2.3}Sequential importance sampling with resampling}{527}{subsection.13.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {13.2.3.1}Bootstrap filter}{529}{subsubsection.13.2.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {13.2.3.2}Estimating the normalizing constant}{529}{subsubsection.13.2.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {13.2.3.3}Path degeneracy problem}{530}{subsubsection.13.2.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.2.4}Resampling methods}{530}{subsection.13.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {13.2.4.1}Multinomial resampling}{530}{subsubsection.13.2.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {13.2.4.2}Stratified resampling}{531}{subsubsection.13.2.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {13.2.4.3}Systematic resampling}{532}{subsubsection.13.2.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {13.2.4.4}Comparison}{532}{subsubsection.13.2.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.2.5}Adaptive resampling}{532}{subsection.13.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {13.3}Proposal distributions}{533}{section.13.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.3.1}Locally optimal proposal}{533}{subsection.13.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.3.2}Proposals based on the Laplace approximation}{534}{subsection.13.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.3.3}Proposals based on the extended and unscented Kalman filter}{534}{subsection.13.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {13.3.3.1}Example: neural decoding}{535}{subsubsection.13.3.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.3.4}Proposals based on SMC}{536}{subsection.13.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.3.5}Learned (``neural'') proposals: TODO}{536}{subsection.13.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {13.4}Rao-Blackwellised particle filtering (RBPF)}{537}{section.13.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.4.1}Mixture of Kalman filters}{537}{subsection.13.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {13.4.1.1}Improvements}{538}{subsubsection.13.4.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {13.4.1.2}Example: tracking a maneuvering object}{539}{subsubsection.13.4.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.4.2}FastSLAM}{539}{subsection.13.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {13.5}SMC samplers}{541}{section.13.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.5.1}Ingredients of an SMC sampler}{541}{subsection.13.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.5.2}Likelihood tempering (geometric path)}{543}{subsection.13.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {13.5.2.1}Example: sampling from a 1d bimodal distribution}{543}{subsubsection.13.5.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.5.3}Data tempering}{545}{subsection.13.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {13.5.3.1}Example: IBIS for a 1d Gaussian}{546}{subsubsection.13.5.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.5.4}Sampling rare events and extrema}{547}{subsection.13.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.5.5}SMC-ABC and likelihood-free inference}{547}{subsection.13.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.5.6}SMC$^2$}{548}{subsection.13.5.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {13.6}Particle MCMC methods}{548}{section.13.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.6.1}Particle Marginal Metropolis Hastings}{549}{subsection.13.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.6.2}Particle Independent Metropolis Hastings}{549}{subsection.13.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {13.6.3}Particle Gibbs}{550}{subsection.13.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {part}{III\hspace {1em}Prediction}{553}{part.3}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {14}Predictive models: an overview}{555}{chapter.14}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {14.1}Introduction}{555}{section.14.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {14.1.1}Types of model}{555}{subsection.14.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {14.1.2}Model fitting using ERM, MLE and MAP}{556}{subsection.14.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {14.1.3}Model fitting using Bayes, VI and generalized Bayes}{557}{subsection.14.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {14.2}Evaluating predictive models}{558}{section.14.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {14.2.1}Proper scoring rules}{558}{subsection.14.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {14.2.2}Calibration}{558}{subsection.14.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {14.2.2.1}Expected calibration error}{558}{subsubsection.14.2.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {14.2.2.2}Improving calibration}{560}{subsubsection.14.2.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {14.2.2.3}Platt scaling}{560}{subsubsection.14.2.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {14.2.2.4}Nonparametric (histogram) methods}{560}{subsubsection.14.2.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {14.2.2.5}Temperature scaling}{560}{subsubsection.14.2.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {14.2.2.6}Label smoothing}{561}{subsubsection.14.2.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {14.2.2.7}Bayesian methods}{562}{subsubsection.14.2.2.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {14.2.3}Beyond evaluating marginal probabilities}{562}{subsection.14.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {14.2.3.1}Proof of claim}{564}{subsubsection.14.2.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {14.3}Conformal prediction}{565}{section.14.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {14.3.1}Conformalizing classification}{566}{subsection.14.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {14.3.2}Conformalizing regression}{567}{subsection.14.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {14.3.2.1}Conformalizing quantile regression}{567}{subsubsection.14.3.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {14.3.2.2}Conformalizing predicted variances}{568}{subsubsection.14.3.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {14.3.3}Conformalizing Bayes}{568}{subsection.14.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {14.3.4}What do we do if we don't have a calibration set?}{569}{subsection.14.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {15}Generalized linear models}{571}{chapter.15}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {15.1}Introduction}{571}{section.15.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.1.1}Examples}{571}{subsection.15.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.1.1.1}Linear regression}{571}{subsubsection.15.1.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.1.1.2}Binomial regression}{572}{subsubsection.15.1.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.1.1.3}Poisson regression}{573}{subsubsection.15.1.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.1.1.4}Zero-inflated Poisson regression}{573}{subsubsection.15.1.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.1.2}GLMs with non-canonical link functions}{574}{subsection.15.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.1.3}Maximum likelihood estimation}{574}{subsection.15.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.1.4}Bayesian inference}{575}{subsection.15.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {15.2}Linear regression}{576}{section.15.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.2.1}Conjugate priors}{576}{subsection.15.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.2.1.1}Noise variance is known}{576}{subsubsection.15.2.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.2.1.2}Noise variance is unknown}{577}{subsubsection.15.2.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.2.1.3}Posterior predictive distribution}{578}{subsubsection.15.2.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.2.2}Uninformative priors}{578}{subsection.15.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.2.2.1}Jeffreys prior}{578}{subsubsection.15.2.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.2.2.2}Connection to frequentist statistics}{579}{subsubsection.15.2.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.2.2.3}Zellner's $g$-prior}{579}{subsubsection.15.2.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.2.3}Informative priors}{580}{subsection.15.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.2.4}Spike and slab prior}{582}{subsection.15.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.2.5}Laplace prior (Bayesian lasso)}{583}{subsection.15.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.2.6}Horseshoe prior}{584}{subsection.15.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.2.7}Automatic relevancy determination}{585}{subsection.15.2.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.2.7.1}ARD for linear models}{585}{subsubsection.15.2.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.2.7.2}Why does ARD result in a sparse solution?}{586}{subsubsection.15.2.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.2.7.3}Algorithms for ARD}{587}{subsubsection.15.2.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.2.7.4}Relevance vector machines}{587}{subsubsection.15.2.7.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {15.3}Logistic regression}{588}{section.15.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.3.1}Binary logistic regression}{588}{subsection.15.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.3.2}Multinomial logistic regression}{588}{subsection.15.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.3.3}Priors}{589}{subsection.15.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.3.4}Posteriors}{590}{subsection.15.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.3.5}Laplace approximation}{590}{subsection.15.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.3.6}MCMC inference}{593}{subsection.15.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.3.7}Variational inference}{594}{subsection.15.3.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.3.8}Assumed density filtering}{594}{subsection.15.3.8}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {15.4}Probit regression}{594}{section.15.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.4.1}Latent variable interpretation}{594}{subsection.15.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.4.2}Maximum likelihood estimation}{595}{subsection.15.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.4.2.1}MLE using SGD}{596}{subsubsection.15.4.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.4.2.2}MLE using EM}{596}{subsubsection.15.4.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.4.3}Bayesian inference}{597}{subsection.15.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.4.4}Ordinal probit regression}{597}{subsection.15.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.4.5}Multinomial probit models }{598}{subsection.15.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {15.5}Multi-level GLMs}{598}{section.15.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.5.1}Generalized linear mixed models (GLMMs)}{599}{subsection.15.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.5.2}Model fitting}{599}{subsection.15.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {15.5.3}Example: radon regression}{599}{subsection.15.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.5.3.1}Posterior inference}{600}{subsubsection.15.5.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {15.5.3.2}Non-centered parameterization}{602}{subsubsection.15.5.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {16}Deep neural networks}{603}{chapter.16}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {16.1}Introduction}{603}{section.16.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {16.2}Building blocks of differentiable circuits}{603}{section.16.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.2.1}Linear layers}{604}{subsection.16.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.2.2}Non-linearities}{604}{subsection.16.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.2.3}Convolutional layers}{605}{subsection.16.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.2.4}Residual (skip) connections}{606}{subsection.16.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.2.5}Normalization layers}{607}{subsection.16.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.2.6}Dropout layers}{607}{subsection.16.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.2.7}Attention layers}{608}{subsection.16.2.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.2.8}Recurrent layers}{611}{subsection.16.2.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.2.9}Multiplicative layers}{611}{subsection.16.2.9}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.2.10}Implicit layers}{612}{subsection.16.2.10}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {16.3}Canonical examples of neural networks}{612}{section.16.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.3.1}Multi-layer perceptrons (MLP)}{613}{subsection.16.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.3.2}Convolutional neural networks (CNN)}{613}{subsection.16.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.3.3}Recurrent neural networks (RNN)}{613}{subsection.16.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.3.4}Transformers}{615}{subsection.16.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {16.3.4.1}Encoder}{616}{subsubsection.16.3.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {16.3.4.2}Decoder}{617}{subsubsection.16.3.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {16.3.4.3}Putting it all together}{617}{subsubsection.16.3.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {16.3.4.4}Discussion}{618}{subsubsection.16.3.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {16.3.5}Graph neural networks (GNNs)}{619}{subsection.16.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {16.3.5.1}Basics of GNNs}{619}{subsubsection.16.3.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {16.3.5.2}Message passing}{620}{subsubsection.16.3.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {16.3.5.3}More complex types of graphs}{620}{subsubsection.16.3.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {16.3.5.4}Graph attention networks}{621}{subsubsection.16.3.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {16.3.5.5}Transformers are fully connected GNNs}{622}{subsubsection.16.3.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {17}Bayesian neural networks}{625}{chapter.17}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {17.1}Introduction}{625}{section.17.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {17.2}Priors for BNNs}{625}{section.17.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.2.1}Gaussian priors}{625}{subsection.17.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.2.2}Sparsity-promoting priors}{628}{subsection.17.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.2.3}Learning the prior}{628}{subsection.17.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.2.4}Priors in function space}{628}{subsection.17.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.2.5}Architectural priors}{628}{subsection.17.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {17.3}Likelihoods for BNNs}{629}{section.17.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {17.4}Posteriors for BNNs}{630}{section.17.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.4.1}Laplace approximation}{630}{subsection.17.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.4.2}Variational inference}{631}{subsection.17.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.4.3}Expectation propagation}{632}{subsection.17.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.4.4}Last layer methods}{632}{subsection.17.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.4.5}Dropout}{633}{subsection.17.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.4.6}MCMC methods}{633}{subsection.17.4.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.4.7}Methods based on the SGD trajectory}{633}{subsection.17.4.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.4.8}Deep ensembles}{635}{subsection.17.4.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {17.4.8.1}Multi-SWAG}{635}{subsubsection.17.4.8.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {17.4.8.2}Deep ensembles with random priors}{636}{subsubsection.17.4.8.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {17.4.8.3}Deep ensembles as approximate Bayesian inference}{636}{subsubsection.17.4.8.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {17.4.8.4}Batch ensemble}{638}{subsubsection.17.4.8.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.4.9}Approximating the posterior predictive distibution}{639}{subsection.17.4.9}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {17.4.9.1}A linearized approximation}{639}{subsubsection.17.4.9.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {17.4.9.2}Distillation}{640}{subsubsection.17.4.9.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {17.5}Generalization in Bayesian deep learning}{640}{section.17.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.5.1}Sharp vs flat minima}{640}{subsection.17.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.5.2}Effective dimensionality of a model}{642}{subsection.17.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.5.3}The hypothesis space of DNNs}{642}{subsection.17.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.5.4}Double descent}{643}{subsection.17.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.5.5}A Bayesian Resolution to Double Descent}{646}{subsection.17.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.5.6}PAC-Bayes}{648}{subsection.17.5.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.5.7}Out-of-Distribution Generalization for BNNs}{648}{subsection.17.5.7}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {17.6}Online inference}{651}{section.17.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.6.1}Extended Kalman Filtering for DNNs}{651}{subsection.17.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {17.6.1.1}Example}{652}{subsubsection.17.6.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {17.6.1.2}Setting the variance terms}{652}{subsubsection.17.6.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {17.6.1.3}Reducing the computational complexity}{652}{subsubsection.17.6.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {17.6.1.4}Beyond squared error}{654}{subsubsection.17.6.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.6.2}Assumed Density Filtering for DNNs}{654}{subsection.17.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.6.3}Sequential Laplace for DNNs}{655}{subsection.17.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.6.4}Variational methods}{656}{subsection.17.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {17.7}Hierarchical Bayesian neural networks}{656}{section.17.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {17.7.1}Solving multiple related classification problems}{657}{subsection.17.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {18}Gaussian processes}{661}{chapter.18}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {18.1}Introduction}{661}{section.18.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.1.1}GPs: What and why?}{661}{subsection.18.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {18.2}Mercer kernels}{663}{section.18.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.2.1}Some popular Mercer kernels}{664}{subsection.18.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.2.1.1}Stationary kernels for real-valued vectors}{664}{subsubsection.18.2.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.2.1.2}Making new kernels from old}{668}{subsubsection.18.2.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.2.1.3}Combining kernels by addition and multiplication}{668}{subsubsection.18.2.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.2.1.4}Kernels for structured inputs}{670}{subsubsection.18.2.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.2.2}Mercer's theorem}{670}{subsection.18.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.2.3}Kernels from Spectral Densities}{671}{subsection.18.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.2.3.1}Random feature kernels}{672}{subsubsection.18.2.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {18.3}GPs with Gaussian likelihoods}{672}{section.18.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.3.1}Predictions using noise-free observations}{672}{subsection.18.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.3.2}Predictions using noisy observations}{674}{subsection.18.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.3.3}Weight space vs function space}{675}{subsection.18.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.3.4}Semi-parametric GPs}{675}{subsection.18.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.3.5}Marginal likelihood}{676}{subsection.18.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.3.6}Computational and numerical issues}{677}{subsection.18.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.3.7}Kernel ridge regression}{677}{subsection.18.3.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.3.7.1}Reproducing kernel Hilbert spaces}{678}{subsubsection.18.3.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.3.7.2}Complexity of a function in an RKHS}{679}{subsubsection.18.3.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.3.7.3}Representer theorem}{679}{subsubsection.18.3.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.3.7.4}Example of KRR vs GPR}{680}{subsubsection.18.3.7.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {18.4}GPs with non-Gaussian likelihoods}{680}{section.18.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.4.1}Binary classification}{681}{subsection.18.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.4.2}Multi-class classification}{682}{subsection.18.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.4.3}GPs for Poisson regression (Cox process)}{683}{subsection.18.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.4.4}Other likelihoods}{683}{subsection.18.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {18.5}Scaling GP inference to large datasets}{684}{section.18.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.5.1}Subset of data}{684}{subsection.18.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.5.1.1}Informative vector machine}{685}{subsubsection.18.5.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.5.1.2}Discussion}{685}{subsubsection.18.5.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.5.2}Nystr{\"o}m\xspace approximation}{685}{subsection.18.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.5.3}Inducing point methods}{686}{subsection.18.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.5.3.1}SOR/ DIC}{687}{subsubsection.18.5.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.5.3.2}DTC}{688}{subsubsection.18.5.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.5.3.3}FITC}{688}{subsubsection.18.5.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.5.3.4}Learning the inducing points}{689}{subsubsection.18.5.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.5.4}Sparse variational methods}{689}{subsection.18.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.5.4.1}Gaussian likelihood}{691}{subsubsection.18.5.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.5.4.2}Non-Gaussian likelihood}{692}{subsubsection.18.5.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.5.4.3}Minibatch SVI}{693}{subsubsection.18.5.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.5.5}Exploiting parallelization and structure via kernel matrix multiplies}{693}{subsection.18.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.5.5.1}Using conjugate gradient and Lanczos methods}{693}{subsubsection.18.5.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.5.5.2}Kernels with compact support}{694}{subsubsection.18.5.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.5.5.3}KISS}{694}{subsubsection.18.5.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.5.5.4}Tensor train methods}{695}{subsubsection.18.5.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.5.6}Converting a GP to a SSM}{695}{subsection.18.5.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {18.6}Learning the kernel}{695}{section.18.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.6.1}Empirical Bayes for the kernel parameters}{696}{subsection.18.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.6.1.1}Example}{697}{subsubsection.18.6.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.6.2}Bayesian inference for the kernel parameters}{698}{subsection.18.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.6.3}Multiple kernel learning for additive kernels}{699}{subsection.18.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.6.4}Automatic search for compositional kernels}{701}{subsection.18.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.6.5}Spectral mixture kernel learning}{703}{subsection.18.6.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.6.6}Deep kernel learning}{706}{subsection.18.6.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {18.7}GPs and DNNs}{707}{section.18.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.7.1}Kernels derived from random DNNs (NN-GP)}{707}{subsection.18.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.7.1.1}Result for MLP with one hidden layer}{707}{subsubsection.18.7.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.7.1.2}Result for deep MLPs}{708}{subsubsection.18.7.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.7.2}Kernels derived from trained DNNs (neural tangent kernel)}{710}{subsection.18.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.7.2.1}Computing the NTK}{711}{subsubsection.18.7.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.7.2.2}Connections with GPs}{711}{subsubsection.18.7.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.7.2.3}Discussion}{712}{subsubsection.18.7.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.7.3}Deep GPs}{712}{subsection.18.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.7.3.1}Construction of a deep GP}{712}{subsubsection.18.7.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.7.3.2}Example: 1d step function}{712}{subsubsection.18.7.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.7.3.3}Posterior inference}{714}{subsubsection.18.7.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.7.3.4}Behavior in the limit of infinite width}{715}{subsubsection.18.7.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {18.7.3.5}Connection with Bayesian neural networks}{716}{subsubsection.18.7.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {18.8}Gaussian processes for timeseries forecasting}{717}{section.18.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {18.8.1}Example: Mauna Loa}{717}{subsection.18.8.1}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {19}Beyond the iid assumption}{719}{chapter.19}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {19.1}Introduction}{719}{section.19.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {19.2}Distribution shift}{719}{section.19.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.2.1}Motivating examples}{719}{subsection.19.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.2.2}A causal view of distribution shift}{721}{subsection.19.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.2.3}Covariate shift}{721}{subsection.19.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.2.4}Domain shift}{722}{subsection.19.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.2.5}Label / prior shift}{723}{subsection.19.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.2.6}Concept shift}{723}{subsection.19.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.2.7}Manifestation shift}{723}{subsection.19.2.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.2.8}Selection bias}{724}{subsection.19.2.8}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {19.3}Training-time techniques for distribution shift}{724}{section.19.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.3.1}Importance weighting for covariate shift}{725}{subsection.19.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {19.3.1.1}Conformal prediction with covariate shift}{725}{subsubsection.19.3.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.3.2}Domain adaptation}{726}{subsection.19.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.3.3}Domain randomization}{726}{subsection.19.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.3.4}Data augmentation}{727}{subsection.19.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.3.5}Unsupervised label shift estimation}{727}{subsection.19.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.3.6}Distributionally robust optimization}{728}{subsection.19.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {19.4}Test-time techniques for distribution shift}{728}{section.19.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.4.1}Detecting shifts using two-sample testing}{728}{subsection.19.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.4.2}Detecting single out-of-distribution (OOD) inputs}{728}{subsection.19.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {19.4.2.1}Supervised ID/OOD methods (outlier exposure)}{729}{subsubsection.19.4.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {19.4.2.2}Classification confidence methods}{729}{subsubsection.19.4.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {19.4.2.3}Conformal prediction}{730}{subsubsection.19.4.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {19.4.2.4}Unsupervised methods}{730}{subsubsection.19.4.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.4.3}Selective prediction}{731}{subsection.19.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {19.4.3.1}Example: SGLD vs SGD for MLPs}{731}{subsubsection.19.4.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.4.4}Open world recognition}{732}{subsection.19.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.4.5}Online adaptation}{733}{subsection.19.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {19.5}Learning from multiple distributions}{734}{section.19.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.5.1}Transfer learning}{735}{subsection.19.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {19.5.1.1}Pre-train and fine-tune}{735}{subsubsection.19.5.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.5.2}Few-shot learning}{736}{subsection.19.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.5.3}Prompt tuning}{736}{subsection.19.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.5.4}Zero-shot learning}{736}{subsection.19.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.5.5}Multi-task learning}{737}{subsection.19.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.5.6}Domain generalization}{738}{subsection.19.5.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.5.7}Invariant risk minimization}{739}{subsection.19.5.7}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {19.6}Meta-learning}{740}{section.19.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.6.1}Meta-learning as probabilistic inference for prediction}{741}{subsection.19.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.6.2}Gradient-based meta-learning}{742}{subsection.19.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.6.3}Metric-based few-shot learning}{742}{subsection.19.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.6.4}VERSA}{742}{subsection.19.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.6.5}Neural processes}{743}{subsection.19.6.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {19.7}Continual learning}{743}{section.19.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.7.1}Domain drift}{743}{subsection.19.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.7.2}Concept drift}{743}{subsection.19.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.7.3}Task incremental learning}{745}{subsection.19.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.7.4}Catastrophic forgetting}{746}{subsection.19.7.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.7.5}Online learning}{748}{subsection.19.7.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {19.8}Adversarial examples}{749}{section.19.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.8.1}Whitebox (gradient-based) attacks}{751}{subsection.19.8.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.8.2}Blackbox (gradient-free) attacks}{751}{subsection.19.8.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.8.3}Real world adversarial attacks}{753}{subsection.19.8.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.8.4}Defenses based on robust optimization}{753}{subsection.19.8.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {19.8.5}Why models have adversarial examples}{754}{subsection.19.8.5}
\defcounter {refsection}{0}\relax 
\contentsline {part}{IV\hspace {1em}Generation}{757}{part.4}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {20}Generative models: an overview}{759}{chapter.20}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {20.1}Introduction}{759}{section.20.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {20.2}Types of generative model}{759}{section.20.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {20.3}Goals of generative modeling}{761}{section.20.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.3.1}Generating data}{761}{subsection.20.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.3.2}Density estimation}{762}{subsection.20.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.3.3}Imputation}{763}{subsection.20.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.3.4}Structure discovery}{764}{subsection.20.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.3.5}Latent space interpolation}{765}{subsection.20.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.3.6}Representation learning}{766}{subsection.20.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {20.4}Evaluating generative models}{766}{section.20.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.4.1}Likelihood}{767}{subsection.20.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {20.4.1.1}Evaluating log-likelihood}{767}{subsubsection.20.4.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {20.4.1.2}Challenges with using likelihood}{767}{subsubsection.20.4.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {20.4.1.3}Likelihood can be hard to compute}{768}{subsubsection.20.4.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {20.4.1.4}Likelihood is not related to sample quality}{768}{subsubsection.20.4.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.4.2}Distances and divergences in feature space}{768}{subsection.20.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.4.3}Precision and recall metrics}{769}{subsection.20.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.4.4}Statistical tests}{770}{subsection.20.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.4.5}Challenges with using pretrained classifiers}{771}{subsection.20.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.4.6}Using model samples to train classifiers}{771}{subsection.20.4.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.4.7}Assessing overfitting}{771}{subsection.20.4.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {20.4.8}Human evaluation}{771}{subsection.20.4.8}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {21}Variational autoencoders}{773}{chapter.21}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {21.1}Introduction}{773}{section.21.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {21.2}VAE basics}{773}{section.21.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.2.1}Modeling assumptions}{774}{subsection.21.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.2.2}Evidence lower bound}{775}{subsection.21.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.2.3}Optimization}{776}{subsection.21.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.2.4}The reparameterization trick}{776}{subsection.21.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.2.5}Computing the reparameterized ELBO}{778}{subsection.21.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {21.2.5.1}Fully factorized Gaussian}{778}{subsubsection.21.2.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {21.2.5.2}Full covariance Gaussian}{779}{subsubsection.21.2.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {21.2.5.3}Inverse autoregressive flows}{779}{subsubsection.21.2.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.2.6}Comparison of VAEs and autoencoders}{779}{subsection.21.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.2.7}VAEs optimize in an augmented space}{780}{subsection.21.2.7}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {21.3}VAE generalizations}{783}{section.21.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.3.1}$\sigma $-VAE}{783}{subsection.21.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.3.2}$\beta $-VAE}{785}{subsection.21.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {21.3.2.1}Connection with $\sigma $-VAE}{786}{subsubsection.21.3.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {21.3.2.2}Disentangled representations}{786}{subsubsection.21.3.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {21.3.2.3}Connection with information bottleneck}{786}{subsubsection.21.3.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.3.3}InfoVAE}{787}{subsection.21.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {21.3.3.1}Connection with MMD VAE}{788}{subsubsection.21.3.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {21.3.3.2}Connection with $\beta $-VAEs}{788}{subsubsection.21.3.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {21.3.3.3}Connection with adversarial autoencoders}{788}{subsubsection.21.3.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {21.3.3.4}Example}{788}{subsubsection.21.3.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.3.4}Multi-modal VAEs}{790}{subsection.21.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.3.5}VAEs with missing data}{791}{subsection.21.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.3.6}Semi-supervised VAEs}{793}{subsection.21.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.3.7}VAEs with sequential encoders/decoders}{794}{subsection.21.3.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {21.3.7.1}Models}{795}{subsubsection.21.3.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {21.3.7.2}Applications}{795}{subsubsection.21.3.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {21.4}Avoiding posterior collapse}{797}{section.21.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.4.1}KL annealing}{798}{subsection.21.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.4.2}Lower bounding the rate}{798}{subsection.21.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.4.3}Free bits}{799}{subsection.21.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.4.4}Adding skip connections}{799}{subsection.21.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.4.5}Improved variational inference}{799}{subsection.21.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.4.6}Alternative objectives}{800}{subsection.21.4.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.4.7}Enforcing identifiability}{800}{subsection.21.4.7}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {21.5}VAEs with hierarchical structure}{800}{section.21.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.5.1}Bottom-up vs top-down inference}{801}{subsection.21.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.5.2}Example: Very deep VAE}{802}{subsection.21.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.5.3}Connection with autoregressive models}{804}{subsection.21.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.5.4}Variational pruning}{805}{subsection.21.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.5.5}Other optimization difficulties}{806}{subsection.21.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {21.6}Vector quantization VAE}{807}{section.21.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.6.1}Autoencoder with binary code}{807}{subsection.21.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.6.2}VQ-VAE model}{807}{subsection.21.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.6.3}Learning the prior}{809}{subsection.21.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.6.4}Hierarchical extension (VQ-VAE-2)}{809}{subsection.21.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.6.5}Discrete VAE}{810}{subsection.21.6.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.6.6}VQ-GAN}{811}{subsection.21.6.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {21.7}Wake-sleep algorithm}{812}{section.21.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.7.1}Wake phase}{813}{subsection.21.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.7.2}Sleep phase}{813}{subsection.21.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.7.3}Daydream phase}{814}{subsection.21.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {21.7.4}Summary of algorithm}{815}{subsection.21.7.4}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {22}Auto-regressive models}{817}{chapter.22}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {22.1}Introduction}{817}{section.22.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {22.2}Neural autoregressive density estimators (NADE)}{818}{section.22.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {22.3}Causal CNNs}{818}{section.22.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {22.3.1}1d causal CNN (Convolutional Markov models)}{819}{subsection.22.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {22.3.2}2d causal CNN (PixelCNN)}{819}{subsection.22.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {22.4}Transformer decoders}{820}{section.22.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {22.4.1}Text generation (GPT)}{821}{subsection.22.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {22.4.2}Music generation}{821}{subsection.22.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {22.4.3}Text-to-image generation (DALL-E)}{822}{subsection.22.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {23}Normalizing Flows}{825}{chapter.23}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {23.1}Introduction}{825}{section.23.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {23.1.1}Preliminaries}{825}{subsection.23.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {23.1.2}Example }{827}{subsection.23.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {23.1.3}How to train a flow model}{828}{subsection.23.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {23.1.3.1}Density estimation}{828}{subsubsection.23.1.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {23.1.3.2}Variational inference}{828}{subsubsection.23.1.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {23.2}Constructing Flows}{829}{section.23.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {23.2.1}Affine flows}{829}{subsection.23.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {23.2.2}Elementwise flows}{830}{subsection.23.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {23.2.2.1}Affine scalar bijection}{830}{subsubsection.23.2.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {23.2.2.2}Higher-order perturbations}{830}{subsubsection.23.2.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {23.2.2.3}Combinations of strictly monotonic scalar functions}{831}{subsubsection.23.2.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {23.2.2.4}Scalar bijections from integration}{831}{subsubsection.23.2.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {23.2.2.5}Splines}{832}{subsubsection.23.2.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {23.2.3}Coupling flows}{832}{subsection.23.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {23.2.4}Autoregressive flows}{834}{subsection.23.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {23.2.4.1}Affine autoregressive flows}{835}{subsubsection.23.2.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {23.2.4.2}Masked autoregressive flows}{836}{subsubsection.23.2.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {23.2.4.3}Inverse autoregressive flows}{837}{subsubsection.23.2.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {23.2.4.4}Connection with autoregressive models}{838}{subsubsection.23.2.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {23.2.5}Residual flows}{839}{subsection.23.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {23.2.5.1}Contractive residual blocks}{839}{subsubsection.23.2.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {23.2.5.2}Residual blocks with low-rank Jacobian}{840}{subsubsection.23.2.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {23.2.6}Continuous-time flows}{841}{subsection.23.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {23.3}Applications}{843}{section.23.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {23.3.1}Density estimation}{843}{subsection.23.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {23.3.2}Generative Modeling}{844}{subsection.23.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {23.3.3}Inference}{844}{subsection.23.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {24}Energy-based models}{847}{chapter.24}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {24.1}Introduction}{847}{section.24.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {24.1.1}Example: Products of experts (PoE)}{848}{subsection.24.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {24.1.2}Computational difficulties}{848}{subsection.24.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {24.2}Maximum Likelihood Training}{849}{section.24.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {24.2.1}Gradient-based MCMC methods}{850}{subsection.24.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {24.2.2}Contrastive divergence}{850}{subsection.24.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {24.2.2.1}Fitting RBMs with CD}{851}{subsubsection.24.2.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {24.2.2.2}Persistent CD}{853}{subsubsection.24.2.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {24.2.2.3}Other methods}{853}{subsubsection.24.2.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {24.3}Score Matching (SM)}{854}{section.24.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {24.3.1}Basic score matching}{854}{subsection.24.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {24.3.2}Denoising Score Matching (DSM)}{855}{subsection.24.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {24.3.2.1}Difficulties}{855}{subsubsection.24.3.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {24.3.3}Sliced Score Matching (SSM)}{856}{subsection.24.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {24.3.4}Connection to Contrastive Divergence}{857}{subsection.24.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {24.3.5}Score-Based Generative Models}{858}{subsection.24.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {24.3.5.1}Adding noise at multiple scales}{860}{subsubsection.24.3.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {24.4}Noise Contrastive Estimation}{861}{section.24.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {24.4.1}Connection to Score Matching}{862}{subsection.24.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {24.5}Other Methods}{863}{section.24.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {24.5.1}Minimizing Differences/Derivatives of KL Divergences}{863}{subsection.24.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {24.5.2}Minimizing the Stein Discrepancy}{864}{subsection.24.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {24.5.3}Adversarial Training}{864}{subsection.24.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {25}Diffusion models}{867}{chapter.25}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {25.1}Variational diffusion models}{867}{section.25.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {25.1.1}Encoder}{867}{subsection.25.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {25.1.1.1}Signal to noise ratio}{868}{subsubsection.25.1.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {25.1.1.2}Markov chain encoder}{869}{subsubsection.25.1.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {25.1.2}Decoder}{869}{subsection.25.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {25.1.3}Model fitting}{871}{subsection.25.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {25.1.3.1}Deriving the ELBO}{871}{subsubsection.25.1.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {25.1.3.2}Deriving the KL terms}{872}{subsubsection.25.1.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {25.1.3.3}Weighting terms}{873}{subsubsection.25.1.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {25.1.3.4}Stochastic estimator for deep models}{873}{subsubsection.25.1.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {25.1.3.5}Infinite depth $T \rightarrow \infty $}{873}{subsubsection.25.1.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {25.1.4}Connection to DDPM}{874}{subsection.25.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {25.1.5}2d Example}{874}{subsection.25.1.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {25.1.6}Application to image generation}{875}{subsection.25.1.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {25.2}Conditional diffusion models}{875}{section.25.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {25.2.1}Classifier guidance}{876}{subsection.25.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {25.2.2}Classifier-free guidance}{876}{subsection.25.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {25.2.3}Conditional image generation}{877}{subsection.25.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {25.2.4}Other forms of conditional generation}{877}{subsection.25.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {25.3}Speeding up the generation process}{877}{section.25.3}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {26}Generative adversarial networks}{881}{chapter.26}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {26.1}Introduction}{881}{section.26.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {26.2}Learning by Comparison}{882}{section.26.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.2.1}Guiding principles}{883}{subsection.26.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.2.2}Class probability estimation}{884}{subsection.26.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.2.3}Bounds on $f$-divergences}{887}{subsection.26.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.2.4}Integral probability metrics}{888}{subsection.26.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.2.5}Moment matching}{890}{subsection.26.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.2.6}On density ratios and differences}{891}{subsection.26.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {26.3}Generative Adversarial Networks}{892}{section.26.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.3.1}From learning principles to loss functions}{893}{subsection.26.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.3.2}Gradient Descent}{894}{subsection.26.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.3.3}Challenges with GAN training}{895}{subsection.26.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.3.4}Improving GAN optimization}{897}{subsection.26.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.3.5}Convergence of GAN training}{897}{subsection.26.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {26.4}Conditional GANs}{901}{section.26.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {26.5}Inference with GANs}{902}{section.26.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {26.6}Neural architectures in GANs}{903}{section.26.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.6.1}The importance of discriminator architectures}{903}{subsection.26.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.6.2}Architectural inductive biases}{903}{subsection.26.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.6.3}Attention in GANs}{904}{subsection.26.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.6.4}Progressive generation}{905}{subsection.26.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.6.5}Regularization}{906}{subsection.26.6.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.6.6}Scaling up GAN models}{907}{subsection.26.6.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {26.7}Applications}{907}{section.26.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.7.1}GANs for image generation}{907}{subsection.26.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {26.7.1.1}Conditional image generation}{907}{subsubsection.26.7.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {26.7.1.2}Paired image-to-image generation}{908}{subsubsection.26.7.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {26.7.1.3}Unpaired image-to-image generation}{908}{subsubsection.26.7.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.7.2}Video generation}{909}{subsection.26.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.7.3}Audio generation}{910}{subsection.26.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.7.4}Text generation}{911}{subsection.26.7.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.7.5}Imitation Learning}{912}{subsection.26.7.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.7.6}Domain Adaptation}{912}{subsection.26.7.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {26.7.7}Design, Art and Creativity}{913}{subsection.26.7.7}
\defcounter {refsection}{0}\relax 
\contentsline {part}{V\hspace {1em}Discovery}{915}{part.5}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {27}Discovery methods: an overview}{917}{chapter.27}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {27.1}Introduction}{917}{section.27.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {27.2}Overview of \cref {part:discovery}}{918}{section.27.2}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {28}Latent factor models}{919}{chapter.28}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {28.1}Introduction}{919}{section.28.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {28.2}Mixture models}{919}{section.28.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.2.1}Gaussian mixture models (GMMs)}{920}{subsection.28.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.2.2}Bernoulli mixture models}{922}{subsection.28.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.2.3}Gaussian scale mixtures (GSMs)}{922}{subsection.28.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.2.3.1}Student $t$ distribution as GSM}{923}{subsubsection.28.2.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.2.3.2}Laplace distribution as GSM}{923}{subsubsection.28.2.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.2.3.3}Spike and slab distribution}{923}{subsubsection.28.2.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.2.3.4}Horseshoe distribution}{923}{subsubsection.28.2.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.2.4}Using GMMs as a prior for inverse imaging problems}{924}{subsection.28.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.2.4.1}Why does the method work?}{926}{subsubsection.28.2.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.2.4.2}Speeding up inference using discriminative models}{927}{subsubsection.28.2.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.2.4.3}Blind inverse problems}{927}{subsubsection.28.2.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.2.5}Using mixture models for classification problems}{927}{subsection.28.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {28.3}Factor analysis}{929}{section.28.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.3.1}Factor analysis: the basics}{929}{subsection.28.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.1.1}FA as a Gaussian with low-rank plus diagonal covariance}{929}{subsubsection.28.3.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.1.2}Computing the posterior}{930}{subsubsection.28.3.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.1.3}Computing the likelihood}{931}{subsubsection.28.3.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.1.4}Model fitting using EM}{931}{subsubsection.28.3.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.1.5}Handling missing data}{932}{subsubsection.28.3.1.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.1.6}Unidentifiability of the parameters}{932}{subsubsection.28.3.1.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.3.2}Probabilistic PCA}{933}{subsection.28.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.2.1}Derivation of the MLE}{933}{subsubsection.28.3.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.2.2}PCA is recovered in the noise-free limit}{934}{subsubsection.28.3.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.2.3}Computing the posterior}{934}{subsubsection.28.3.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.2.4}Model fitting using EM}{935}{subsubsection.28.3.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.3.3}Mixture of factor analysers}{935}{subsection.28.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.3.1}Model definition}{936}{subsubsection.28.3.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.3.2}Model fitting using EM}{937}{subsubsection.28.3.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.3.3}Model fitting using SGD}{938}{subsubsection.28.3.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.3.4}Model selection}{938}{subsubsection.28.3.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.3.5}MixFA for image generation}{938}{subsubsection.28.3.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.3.4}Factor analysis models for paired data}{942}{subsection.28.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.4.1}Supervised PCA}{943}{subsubsection.28.3.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.4.2}Partial least squares}{943}{subsubsection.28.3.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.4.3}Canonical correlation analysis}{944}{subsubsection.28.3.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.3.5}Factor analysis with exponential family likelihoods}{945}{subsection.28.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.5.1}Example: binary PCA}{946}{subsubsection.28.3.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.3.5.2}Example: categorical PCA}{946}{subsubsection.28.3.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.3.6}Factor analysis with DNN likelihoods}{946}{subsection.28.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.3.7}Factor analysis with GP likelihoods (GP-LVM)}{947}{subsection.28.3.7}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {28.4}LFMs with non-Gaussian priors}{949}{section.28.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.4.1}Non-negative matrix factorization (NMF)}{949}{subsection.28.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.4.2}Multinomial PCA}{950}{subsection.28.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.4.2.1}Example: roll call data}{951}{subsubsection.28.4.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.4.2.2}Advantage of Dirichlet prior over Gaussian prior}{951}{subsubsection.28.4.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.4.2.3}Connection to mixture models}{952}{subsubsection.28.4.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {28.5}Topic models}{952}{section.28.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.5.1}Latent Dirichlet Allocation (LDA)}{952}{subsection.28.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.5.1.1}Model definition}{952}{subsubsection.28.5.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.5.1.2}Polysemy}{953}{subsubsection.28.5.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.5.1.3}Posterior inference}{955}{subsubsection.28.5.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.5.1.4}Determining the number of topics}{955}{subsubsection.28.5.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.5.2}Correlated topic model}{956}{subsection.28.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.5.3}Dynamic topic model}{956}{subsection.28.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.5.4}LDA-HMM}{957}{subsection.28.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {28.6}Independent components analysis (ICA)}{960}{section.28.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.6.1}Noiseless ICA model}{960}{subsection.28.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.6.2}The need for non-Gaussian priors}{962}{subsection.28.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.6.3}Maximum likelihood estimation}{963}{subsection.28.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.6.4}Alternatives to MLE}{964}{subsection.28.6.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.6.4.1}Maximizing non-Gaussianity}{964}{subsubsection.28.6.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.6.4.2}Minimizing total correlation}{965}{subsubsection.28.6.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {28.6.4.3}Maximizing mutual information (InfoMax)}{965}{subsubsection.28.6.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.6.5}Sparse coding}{965}{subsection.28.6.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {28.6.6}Nonlinear ICA}{966}{subsection.28.6.6}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {29}State-space models}{969}{chapter.29}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {29.1}Introduction}{969}{section.29.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {29.2}Hidden Markov models (HMMs)}{970}{section.29.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.2.1}Conditional independence properties}{970}{subsection.29.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.2.2}State transition model}{970}{subsection.29.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.2.3}Discrete likelihoods}{971}{subsection.29.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.2.4}Gaussian likelihoods}{971}{subsection.29.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.2.5}Autoregressive likelihoods}{972}{subsection.29.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {29.3}HMMs: Applications}{974}{section.29.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.3.1}Time series segmentation}{974}{subsection.29.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.3.2}Protein sequence alignment}{976}{subsection.29.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.3.3}Spelling correction}{977}{subsection.29.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.3.3.1}Baseline model}{978}{subsubsection.29.3.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.3.3.2}HMM model}{978}{subsubsection.29.3.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.3.3.3}Extended HMM model}{979}{subsubsection.29.3.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {29.4}HMMs: parameter learning}{980}{section.29.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.4.1}The Baum-Welch (EM) algorithm}{980}{subsection.29.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.4.1.1}Log likelihood}{980}{subsubsection.29.4.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.4.1.2}E step}{980}{subsubsection.29.4.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.4.1.3}M step}{981}{subsubsection.29.4.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.4.1.4}Initialization}{983}{subsubsection.29.4.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.4.1.5}Example: casino HMM}{983}{subsubsection.29.4.1.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.4.2}Parameter estimation using SGD}{983}{subsection.29.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.4.2.1}Example: Bach Chorales}{984}{subsubsection.29.4.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.4.3}Parameter estimation using spectral methods}{986}{subsection.29.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.4.4}Bayesian HMM}{986}{subsection.29.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {29.5}HMMs: Generalizations}{987}{section.29.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.5.1}Hidden semi-Markov model (HSMM)}{987}{subsection.29.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.5.2}Hierarchical HMMs}{989}{subsection.29.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.5.3}Factorial HMMs}{991}{subsection.29.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.5.4}Coupled HMMs}{992}{subsection.29.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.5.5}Dynamic Bayes nets (DBN)}{993}{subsection.29.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.5.6}Changepoint detection}{993}{subsection.29.5.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.5.6.1}Example}{995}{subsubsection.29.5.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.5.6.2}Extensions}{995}{subsubsection.29.5.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {29.6}Linear dynamical systems (LDS)}{996}{section.29.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.6.1}Conditional independence properties}{996}{subsection.29.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.6.2}Parameterization}{996}{subsection.29.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {29.7}LDS: Applications}{997}{section.29.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.7.1}Object tracking and state estimation}{997}{subsection.29.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.7.2}Online linear regression}{998}{subsection.29.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.7.3}Timeseries forecasting}{998}{subsection.29.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {29.8}Structural time series models}{998}{section.29.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.8.1}Basics}{999}{subsection.29.8.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.8.2}Details}{999}{subsection.29.8.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.8.2.1}Local level model}{1000}{subsubsection.29.8.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.8.2.2}Local linear model}{1000}{subsubsection.29.8.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.8.2.3}Adding covariates}{1001}{subsubsection.29.8.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.8.2.4}Modelling seasonality}{1001}{subsubsection.29.8.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.8.2.5}Adding it all up}{1002}{subsubsection.29.8.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.8.3}Example: modeling $\text {CO}_2$ levels from Mauna Loa}{1002}{subsection.29.8.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.8.4}Example: forecasting electricity demand}{1003}{subsection.29.8.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.8.5}Causal impact of a time series intervention}{1003}{subsection.29.8.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.8.5.1}Basic idea}{1005}{subsubsection.29.8.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.8.5.2}Example}{1006}{subsubsection.29.8.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.8.6}Prophet}{1007}{subsection.29.8.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.8.7}Neural forecasting methods}{1009}{subsection.29.8.7}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {29.9}LDS: parameter learning}{1010}{section.29.9}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.9.1}EM for LDS}{1010}{subsection.29.9.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.9.2}Subspace identification methods}{1012}{subsection.29.9.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.9.3}Ensuring stability of the dynamical system}{1013}{subsection.29.9.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.9.4}Bayesian LDS}{1013}{subsection.29.9.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {29.10}Switching linear dynamical systems (SLDS)}{1013}{section.29.10}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.10.1}Parameterization}{1014}{subsection.29.10.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.10.2}Posterior inference}{1014}{subsection.29.10.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.10.3}Application: Multi-target tracking}{1015}{subsection.29.10.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.10.3.1}Warmup}{1015}{subsubsection.29.10.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.10.3.2}Data association}{1016}{subsubsection.29.10.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.10.3.3}Nearest neighbor approximation using Hungarian algorithm}{1016}{subsubsection.29.10.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.10.3.4}Other pproximate inference schemes}{1017}{subsubsection.29.10.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {29.10.3.5}Handling an unknown number of targets}{1017}{subsubsection.29.10.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {29.11}Non-linear dynamical systems}{1017}{section.29.11}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.11.1}Application: Nonlinear tracking problems}{1018}{subsection.29.11.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.11.2}Application: Simultaneous localization and mapping (SLAM)}{1019}{subsection.29.11.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {29.12}Deep SSMs}{1021}{section.29.12}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.12.1}Deep Markov models}{1021}{subsection.29.12.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.12.2}Recurrent SSM}{1023}{subsection.29.12.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.12.3}Improving multi-step predictions}{1024}{subsection.29.12.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {29.12.4}Variational RNNs}{1025}{subsection.29.12.4}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {30}Graph learning}{1027}{chapter.30}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {30.1}Introduction}{1027}{section.30.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {30.2}Latent variable models for graphs}{1027}{section.30.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {30.2.1}Stochastic block model}{1027}{subsection.30.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {30.2.2}Mixed membership stochastic block model}{1029}{subsection.30.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {30.2.3}Infinite relational model}{1031}{subsection.30.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {30.2.3.1}Learning ontologies}{1032}{subsubsection.30.2.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {30.2.3.2}Clustering based on relations and features}{1032}{subsubsection.30.2.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {30.3}Graphical model structure learning}{1033}{section.30.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {30.3.1}Applications}{1034}{subsection.30.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {30.3.2}Methods}{1035}{subsection.30.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {31}Non-parametric Bayesian models}{1037}{chapter.31}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {31.1}Introduction}{1037}{section.31.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {31.2}Dirichlet processes}{1038}{section.31.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.2.1}Definition of a DP}{1038}{subsection.31.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.2.2}Stick breaking construction of the DP}{1040}{subsection.31.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.2.3}The Chinese restaurant process (CRP)}{1041}{subsection.31.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {31.3}Dirichlet process mixture models}{1043}{section.31.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.3.1}Model definition}{1044}{subsection.31.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.3.2}Fitting using collapsed Gibbs sampling}{1045}{subsection.31.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {31.3.2.1}Example}{1046}{subsubsection.31.3.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.3.3}Other fitting algorithms}{1046}{subsection.31.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.3.4}Choosing the hyper-parameters}{1048}{subsection.31.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {31.4}Generalizations of the Dirichlet process}{1048}{section.31.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.4.1}Pitman-Yor process}{1049}{subsection.31.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.4.2}Dependent random probability measures}{1050}{subsection.31.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {31.5}The Indian buffet process and the Beta process}{1053}{section.31.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {31.6}Small-variance asymptotics}{1055}{section.31.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {31.7}Completely random measures}{1058}{section.31.7}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {31.8}L\'evy processes}{1060}{section.31.8}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {31.9}Point processes with repulsion and reinforcement}{1061}{section.31.9}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.9.1}Poisson process}{1061}{subsection.31.9.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.9.2}Renewal process}{1063}{subsection.31.9.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.9.3}Hawkes process}{1063}{subsection.31.9.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.9.4}Gibbs point process}{1065}{subsection.31.9.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {31.9.5}Determinantal point process}{1066}{subsection.31.9.5}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {32}Representation learning {\bf (Unfinished)} }{1069}{chapter.32}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {32.1}CLIP}{1069}{section.32.1}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {33}Interpretability}{1071}{chapter.33}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {33.1}Introduction}{1071}{section.33.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {33.1.1}The Role of Interpretability}{1072}{subsection.33.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {33.1.2}Terminology and Framework }{1073}{subsection.33.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {33.2}Methods for Interpretable Machine Learning}{1077}{section.33.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {33.2.1}Inherently Interpretable Models: The Model is its Explanation }{1077}{subsection.33.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {33.2.2}Semi-Inherently Interpretable Models: Example-Based Methods}{1080}{subsection.33.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {33.2.3}Post-hoc or Joint training: The Explanation gives a Partial View of the Model}{1080}{subsection.33.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {33.2.3.1}What does the explanation consist of?}{1080}{subsubsection.33.2.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {33.2.3.2}How the explanation is computed}{1082}{subsubsection.33.2.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {33.2.4}Transparency and Visualization}{1084}{subsection.33.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {33.3}Properties: The Abstraction Between Context and Method}{1085}{section.33.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {33.3.1}Properties of Explanations from Interpretable Machine Learning}{1086}{subsection.33.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {33.3.2}Properties of Explanations from Cognitive Science }{1088}{subsection.33.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {33.4}Evaluation of Interpretable Machine Learning Models }{1089}{section.33.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {33.4.1}Computational Evaluation: Does the Method have Desired Properties?}{1090}{subsection.33.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {33.4.2}User Study-based Evaluation: Does the Method Help a User Perform a Task?}{1095}{subsection.33.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {33.4.2.1}User Studies in Real Contexts.}{1095}{subsubsection.33.4.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {33.4.2.2}Basic elements of user studies}{1095}{subsubsection.33.4.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {33.4.2.3}User Studies in Synthetic Contexts}{1098}{subsubsection.33.4.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {33.5}Discussion: How to Think about Interpretable Machine Learning}{1098}{section.33.5}
\defcounter {refsection}{0}\relax 
\contentsline {part}{VI\hspace {1em}Action}{1105}{part.6}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {34}Decision making under uncertainty}{1107}{chapter.34}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {34.1}Bayesian decision theory}{1107}{section.34.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.1.1}Basics}{1107}{subsection.34.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.1.2}Classification}{1108}{subsection.34.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.1.3}Regression}{1108}{subsection.34.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.1.4}Structured prediction}{1109}{subsection.34.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.1.5}Fairness}{1110}{subsection.34.1.5}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {34.2}Decision (influence) diagrams}{1110}{section.34.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.2.1}Example: oil wildcatter}{1110}{subsection.34.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.2.2}Information arcs}{1111}{subsection.34.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.2.3}Value of information}{1112}{subsection.34.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.2.4}Computing the optimal policy}{1113}{subsection.34.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {34.3}A/B testing}{1113}{section.34.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.3.1}A Bayesian approach}{1114}{subsection.34.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {34.3.1.1}Optimal policy}{1114}{subsubsection.34.3.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {34.3.1.2}Optimal sample size}{1115}{subsubsection.34.3.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {34.3.1.3}Regret}{1116}{subsubsection.34.3.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {34.3.1.4}Expected error rate}{1116}{subsubsection.34.3.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.3.2}Example}{1117}{subsection.34.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {34.4}Contextual bandits}{1118}{section.34.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.4.1}Types of bandit}{1119}{subsection.34.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.4.2}Applications}{1120}{subsection.34.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.4.3}Exploration-exploitation tradeoff}{1120}{subsection.34.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.4.4}The optimal solution}{1121}{subsection.34.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.4.5}Upper confidence bounds (UCB)}{1122}{subsection.34.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {34.4.5.1}Frequentist approach}{1123}{subsubsection.34.4.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {34.4.5.2}Bayesian approach}{1123}{subsubsection.34.4.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {34.4.5.3}Example}{1124}{subsubsection.34.4.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.4.6}Thompson sampling}{1124}{subsection.34.4.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.4.7}Regret}{1125}{subsection.34.4.7}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {34.5}Markov decision problems}{1127}{section.34.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.5.1}Basics}{1127}{subsection.34.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.5.2}Partially observed MDPs}{1128}{subsection.34.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.5.3}Episodes and returns}{1128}{subsection.34.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.5.4}Value functions}{1130}{subsection.34.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.5.5}Optimal value functions and policies}{1130}{subsection.34.5.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {34.5.5.1}Example}{1131}{subsubsection.34.5.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {34.6}Planning in an MDP}{1131}{section.34.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.6.1}Value iteration}{1132}{subsection.34.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.6.2}Policy iteration}{1133}{subsection.34.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.6.3}Linear programming}{1134}{subsection.34.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {34.7}Active learning}{1135}{section.34.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.7.1}Relationship to other forms of sequential decision making}{1135}{subsection.34.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.7.2}Common heursitics}{1135}{subsection.34.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {34.7.3}Batch methods}{1136}{subsection.34.7.3}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {35}Reinforcement learning}{1137}{chapter.35}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {35.1}Introduction}{1137}{section.35.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.1.1}Overview of methods}{1137}{subsection.35.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.1.2}Value based methods}{1139}{subsection.35.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.1.3}Policy search methods}{1139}{subsection.35.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.1.4}Model-based RL}{1139}{subsection.35.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.1.5}Exploration-exploitation tradeoff}{1140}{subsection.35.1.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.1.5.1}$\epsilon $-greedy}{1140}{subsubsection.35.1.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.1.5.2}Boltzmann exploration}{1140}{subsubsection.35.1.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.1.5.3}Upper confidence bounds and Thompson sampling}{1140}{subsubsection.35.1.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.1.5.4}Optimal solution using Bayes-adaptive MDPs}{1141}{subsubsection.35.1.5.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {35.2}Value-based RL}{1142}{section.35.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.2.1}Monte Carlo RL}{1142}{subsection.35.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.2.2}Temporal difference (TD) learning}{1142}{subsection.35.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.2.3}TD learning with eligibility traces}{1143}{subsection.35.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.2.4}SARSA: on-policy TD control}{1144}{subsection.35.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.2.5}Q-learning: off-policy TD control}{1145}{subsection.35.2.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.2.5.1}Example}{1145}{subsubsection.35.2.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.2.5.2}Double Q-learning}{1145}{subsubsection.35.2.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.2.6}Deep Q-network (DQN)}{1147}{subsection.35.2.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {35.3}Policy-based RL}{1148}{section.35.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.3.1}The policy gradient theorem}{1148}{subsection.35.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.3.2}REINFORCE}{1149}{subsection.35.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.3.3}Actor-critic methods}{1150}{subsection.35.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.3.3.1}A2C and A3C}{1151}{subsubsection.35.3.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.3.3.2}Eligibility traces}{1151}{subsubsection.35.3.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.3.4}Bound optimization methods}{1152}{subsection.35.3.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.3.5}Deterministic policy gradient methods}{1154}{subsection.35.3.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.3.6}Gradient-free methods}{1155}{subsection.35.3.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {35.4}Model-based RL}{1155}{section.35.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.4.1}Model predictive control (MPC)}{1155}{subsection.35.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.4.1.1}Heuristic search}{1156}{subsubsection.35.4.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.4.1.2}Monte-Carlo tree search (MCTS)}{1156}{subsubsection.35.4.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.4.1.3}Trajectory optimization for continuous actions}{1157}{subsubsection.35.4.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.4.2}Combining model-based and model-free}{1157}{subsection.35.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.4.3}MBRL using Gaussian processes}{1157}{subsection.35.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.4.3.1}PILCO}{1157}{subsubsection.35.4.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.4.3.2}GP-MPC}{1159}{subsubsection.35.4.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.4.4}MBRL using DNNs}{1159}{subsection.35.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.4.5}MBRL using latent-variable models}{1159}{subsection.35.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.4.5.1}World models}{1159}{subsubsection.35.4.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.4.5.2}PlaNet and Dreamer}{1161}{subsubsection.35.4.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.4.6}Robustness to model errors}{1162}{subsection.35.4.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {35.5}Off-policy learning}{1162}{section.35.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.5.1}Basic techniques}{1163}{subsection.35.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.5.1.1}Direct method}{1163}{subsubsection.35.5.1.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.5.1.2}Importance sampling}{1163}{subsubsection.35.5.1.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.5.1.3}Doubly robust}{1164}{subsubsection.35.5.1.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.5.1.4}Behavior regularized method}{1165}{subsubsection.35.5.1.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.5.2}The curse of horizon}{1166}{subsection.35.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.5.3}The deadly triad}{1167}{subsection.35.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {35.6}Control as inference}{1168}{section.35.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.6.1}Maximum entropy reinforcement learning}{1168}{subsection.35.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.6.2}Other approaches}{1171}{subsection.35.6.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {35.6.3}Imitation learning}{1172}{subsection.35.6.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.6.3.1}Imitation learning by behavior cloning}{1172}{subsubsection.35.6.3.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.6.3.2}Imitation learning by inverse reinforcement learning}{1173}{subsubsection.35.6.3.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {35.6.3.3}Imitation learning by divergence minimization}{1173}{subsubsection.35.6.3.3}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{\numberline {36}Causality}{1175}{chapter.36}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {36.1}Introduction}{1175}{section.36.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {36.2}Causal Formalism}{1177}{section.36.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.2.1}Structural Causal Models}{1177}{subsection.36.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.2.2}Causal DAGs}{1179}{subsection.36.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.2.3}Identification}{1181}{subsection.36.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.2.4}Counterfactuals and the Causal Hierarchy}{1182}{subsection.36.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {36.3}Randomized Control Trials}{1184}{section.36.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {36.4}Confounder Adjustment}{1185}{section.36.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.4.1}Causal Estimand, Statistical Estimand, and Identification}{1185}{subsection.36.4.1}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Overlap}{1188}{section*.670}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.4.2}ATE Estimation with Observed Confounders}{1188}{subsection.36.4.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {36.4.2.1}Outcome Model Adjustment}{1188}{subsubsection.36.4.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Linear regression}{1189}{section*.671}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {36.4.2.2}Propensity Score Adjustment}{1189}{subsubsection.36.4.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {36.4.2.3}Double Machine Learning}{1191}{subsubsection.36.4.2.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {36.4.2.4}Cross Fitting}{1193}{subsubsection.36.4.2.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.4.3}Uncertainity Quantification}{1193}{subsection.36.4.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.4.4}Matching}{1193}{subsection.36.4.4}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.4.5}Practical Considerations and Procedures}{1194}{subsection.36.4.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {36.4.5.1}What to adjust for}{1195}{subsubsection.36.4.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {36.4.5.2}Overlap}{1196}{subsubsection.36.4.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {36.4.5.3}Choice of Estimand and Average Treatment Effect on the Treated}{1196}{subsubsection.36.4.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.4.6}Summary and Practical Advice}{1197}{subsection.36.4.6}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {36.5}Instrumental Variable Strategies}{1199}{section.36.5}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.5.1}Additive Unobserved Confounding}{1201}{subsection.36.5.1}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Estimation}{1202}{section*.674}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.5.2}Instrument Monotonicity and Local Average Treatment Effect}{1202}{subsection.36.5.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {36.5.2.1}Estimation}{1204}{subsubsection.36.5.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.5.3}Two Stage Least Squares}{1206}{subsection.36.5.3}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {36.6}Difference in Differences}{1206}{section.36.6}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.6.1}Estimation}{1210}{subsection.36.6.1}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {36.7}Credibility Checks}{1210}{section.36.7}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.7.1}Placebo Checks}{1211}{subsection.36.7.1}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.7.2}Sensitivity Analysis to Unobserved Confounding}{1211}{subsection.36.7.2}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Austen plots}{1212}{figure.caption.677}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Setup}{1214}{section*.678}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Sensitivity Model}{1214}{section*.679}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Bias}{1215}{section*.680}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Sensitivity Parameters}{1216}{section*.681}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Estimating bias}{1217}{section*.682}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {36.7.2.1}Calibration using observed data}{1217}{subsubsection.36.7.2.1}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Grouping covariates}{1217}{section*.683}
\defcounter {refsection}{0}\relax 
\contentsline {subsubsection}{\numberline {36.7.2.2}Practical Use}{1217}{subsubsection.36.7.2.2}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Calibration using observed data}{1218}{section*.684}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {36.8}The Do Calculus}{1219}{section.36.8}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.8.1}The three rules}{1219}{subsection.36.8.1}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Rule 1}{1219}{section*.685}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Rule 2}{1219}{section*.686}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Rule 3}{1220}{section*.687}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.8.2}Revisiting Backdoor Adjustment}{1220}{subsection.36.8.2}
\defcounter {refsection}{0}\relax 
\contentsline {subsection}{\numberline {36.8.3}Frontdoor Adjustment}{1221}{subsection.36.8.3}
\defcounter {refsection}{0}\relax 
\contentsline {paragraph}{Estimation}{1222}{section*.703}
\defcounter {refsection}{0}\relax 
\contentsline {section}{\numberline {36.9}Further Reading}{1223}{section.36.9}
\defcounter {refsection}{0}\relax 
\contentsline {chapter}{Bibliography}{1238}{chapter*.704}