\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.1}{\ignorespaces Three types of Iris flowers: Setosa, Versicolor and Virginica. Used with kind permission of Dennis Kramb and SIGNA. \relax }}{2}{figure.caption.8}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.2}{\ignorespaces Illustration of the image classification problem. From \url {https://cs231n.github.io/}. Used with kind permission of Andrej Karpathy. \relax }}{3}{figure.caption.9}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.3}{\ignorespaces Visualization of the Iris data as a pairwise scatter plot. On the diagonal we plot the marginal distribution of each feature for each class. The off-diagonals contain scatterplots of all possible pairs of features. Generated by \href {https://probml.github.io/notebooks\#iris\_plot.ipynb}{iris\_plot.ipynb} \relax }}{4}{figure.caption.11}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.4}{\ignorespaces Example of a decision tree of depth 2 applied to the Iris data, using just the petal length and petal width features. Leaf nodes are color coded according to the predicted class. The number of training samples that pass from the root to a node is shown inside each box; we show how many values of each class fall into this node. This vector of counts can be normalized to get a distribution over class labels for each node. We can then pick the majority class. Adapted from Figures 6.1 and 6.2 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#iris\_dtree.ipynb}{iris\_dtree.ipynb}. \relax }}{5}{figure.caption.12}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.5}{\ignorespaces (a) Linear regression on some 1d data. (b) The vertical lines denote the residuals between the observed output value for each input (blue circle) and its predicted value (red cross). The goal of least squares regression is to pick a line that minimizes the sum of squared residuals. Generated by \href {https://probml.github.io/notebooks\#linreg\_residuals\_plot.ipynb}{linreg\_residuals\_plot.ipynb}. \relax }}{10}{figure.caption.14}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.6}{\ignorespaces Linear and polynomial regression applied to 2d data. Vertical axis is temperature, horizontal axes are location within a room. Data was collected by some remote sensing motes at Intel's lab in Berkeley, CA (data courtesy of Romain Thibaux). (a) The fitted plane has the form $\cc@accent {"705E}{f}({\bm {x}}) = w_0 + w_1 x_1 + w_2 x_2$. (b) Temperature data is fitted with a quadratic of the form $\cc@accent {"705E}{f}({\bm {x}}) = w_0 + w_1 x_1 + w_2 x_2 + w_3 x_1^2 + w_4 x_2^2$. Generated by \href {https://probml.github.io/notebooks\#linreg\_2d\_surface\_demo.ipynb}{linreg\_2d\_surface\_demo.ipynb}. \relax }}{11}{figure.caption.15}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.7}{\ignorespaces (a-c) Polynomials of degrees 2, 14 and 20 fit to 21 datapoints (the same data as in \cref {fig:linreg}). (d) MSE vs degree. Generated by \href {https://probml.github.io/notebooks\#linreg\_poly\_vs\_degree.ipynb}{linreg\_poly\_vs\_degree.ipynb}. \relax }}{12}{figure.caption.16}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.8}{\ignorespaces (a) A scatterplot of the petal features from the iris dataset. (b) The result of unsupervised clustering using $K=3$. Generated by \href {https://probml.github.io/notebooks\#iris\_kmeans.ipynb}{iris\_kmeans.ipynb}. \relax }}{15}{figure.caption.17}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.9}{\ignorespaces (a) Scatterplot of iris data (first 3 features). Points are color coded by class. (b) We fit a 2d linear subspace to the 3d data using PCA. The class labels are ignored. Red dots are the original data, black dots are points generated from the model using $\cc@accent {"705E}{{\bm {x}}} = \mathbf {W}{\bm {z}}+ \boldsymbol {\mu }$, where ${\bm {z}}$ are latent points on the underlying inferred 2d linear manifold. Generated by \href {https://probml.github.io/notebooks\#iris\_pca.ipynb}{iris\_pca.ipynb}. \relax }}{15}{figure.caption.18}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.10}{\ignorespaces Examples of some control problems. (a) Space Invaders Atari game. From \url {https://gym.openai.com/envs/SpaceInvaders-v0/}. (b) Controlling a humanoid robot in the MuJoCo simulator so it walks as fast as possible without falling over. From \url {https://gym.openai.com/envs/Humanoid-v2/}. \relax }}{17}{figure.caption.19}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.11}{\ignorespaces The three types of machine learning visualized as layers of a chocolate cake. This figure (originally from \url {https://bit.ly/2m65Vs1}) was used in a talk by Yann LeCun at NIPS'16, and is used with his kind permission. \relax }}{18}{figure.caption.20}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.12}{\ignorespaces (a) Visualization of the MNIST dataset. Each image is $28 \times 28$. There are 60k training examples and 10k test examples. We show the first 25 images from the training set. Generated by \href {https://probml.github.io/notebooks\#mnist\_viz\_tf.ipynb}{mnist\_viz\_tf.ipynb}. (b) Visualization of the EMNIST dataset. There are 697,932 training examples, and 116,323 test examples, each of size $28 \times 28$. There are 62 classes (a-z, A-Z, 0-9). We show the first 25 images from the training set. Generated by \href {https://probml.github.io/notebooks\#emnist\_viz\_jax.ipynb}{emnist\_viz\_jax.ipynb}. \relax }}{19}{figure.caption.21}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.13}{\ignorespaces (a) Visualization of the Fashion-MNIST dataset \citep {fashion}. The dataset has the same size as MNIST, but is harder to classify. There are 10 classes: T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt, Sneaker, Bag, Ankle-boot. We show the first 25 images from the training set. Generated by \href {https://probml.github.io/notebooks\#fashion\_viz\_tf.ipynb}{fashion\_viz\_tf.ipynb}. (b) Some images from the CIFAR-10 dataset \citep {Krizhevsky2009}. Each image is $32 \times 32 \times 3$, where the final dimension of size 3 refers to RGB. There are 50k training examples and 10k test examples. There are 10 classes: plane, car, bird, cat, deer, dog, frog, horse, ship, and truck. We show the first 25 images from the training set. Generated by \href {https://probml.github.io/notebooks\#cifar\_viz\_tf.ipynb}{cifar\_viz\_tf.ipynb}. \relax }}{20}{figure.caption.22}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.14}{\ignorespaces (a) Sample images from the {\bf ImageNet} dataset \citep {ILSVRC15}. This subset consists of 1.3M color training images, each of which is $256 \times 256$ pixels in size. There are 1000 possible labels, one per image, and the task is to minimize the top-5 error rate, i.e., to ensure the correct label is within the 5 most probable predictions. Below each image we show the true label, and a distribution over the top 5 predicted labels. If the true label is in the top 5, its probability bar is colored red. Predictions are generated by a convolutional neural network (CNN) called ``AlexNet'' (\cref {sec:alexNet}). From Figure 4 of \citep {Krizhevsky12}. Used with kind permission of Alex Krizhevsky. (b) Misclassification rate (top 5) on the ImageNet competition over time. Used with kind permission of Andrej Karpathy. \relax }}{21}{figure.caption.23}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.15}{\ignorespaces Example of a term-document matrix, where raw counts have been replaced by their TF-IDF values (see \cref {sec:tfidf}). Darker cells are larger values. From \url {https://bit.ly/2kByLQI}. Used with kind permission of Christoph Carl Kling. \relax }}{25}{figure.caption.26}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.1}{\ignorespaces Some discrete distributions on the state space $\mathcal {X}=\{1,2,3,4\}$. (a) A uniform distribution with $p(x=k)=1/4$. (b) A degenerate distribution (delta function) that puts all its mass on $x=1$. Generated by \href {https://probml.github.io/notebooks\#discrete\_prob\_dist\_plot.ipynb}{discrete\_prob\_dist\_plot.ipynb}. \relax }}{34}{figure.caption.27}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.2}{\ignorespaces (a) Plot of the cdf for the standard normal, $\mathcal {N}(0,1)$. Generated by \href {https://probml.github.io/notebooks\#gauss\_plot.ipynb}{gauss\_plot.ipynb}. (b) Corresponding pdf. The shaded regions each contain $\alpha /2$ of the probability mass. Therefore the nonshaded region contains $1-\alpha $ of the probability mass. The leftmost cutoff point is $\Phi ^{-1}(\alpha /2)$, where $\Phi $ is the cdf of the Gaussian. By symmetry, the rightmost cutoff point is $\Phi ^{-1}(1-\alpha /2)=-\Phi ^{-1}(\alpha /2)$. Generated by \href {https://probml.github.io/notebooks\#quantile\_plot.ipynb}{quantile\_plot.ipynb}. \relax }}{35}{figure.caption.28}% 
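As a companion to this caption, a minimal sketch (assuming SciPy; this is not the code in gauss_plot.ipynb or quantile_plot.ipynb) of computing the two cutoff points $\Phi^{-1}(\alpha/2)$ and $\Phi^{-1}(1-\alpha/2)$; the value of alpha below is an illustrative assumption:

from scipy.stats import norm

alpha = 0.05                      # illustrative value (assumption)
left = norm.ppf(alpha / 2)        # Phi^{-1}(alpha/2), about -1.96
right = norm.ppf(1 - alpha / 2)   # equals -left by symmetry
print(left, right)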
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.3}{\ignorespaces Computing $p(x,y) = p(x) p(y)$, where ${X} \perp {Y}$. Here $X$ and $Y$ are discrete random variables; $X$ has 6 possible states (values) and $Y$ has 5 possible states. A general joint distribution on two such variables would require $(6 \times 5) - 1 = 29$ parameters to define it (we subtract 1 because of the sum-to-one constraint). By assuming (unconditional) independence, we only need $(6-1) + (5-1) = 9$ parameters to define $p(x,y)$. \relax }}{37}{figure.caption.29}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.4}{\ignorespaces Illustration of a mixture of two 1d Gaussians, $p(x) = 0.5 \mathcal {N}(x|0,0.5) + 0.5 \mathcal {N}(x|2,0.5)$. Generated by \href {https://probml.github.io/notebooks\#bimodal\_dist\_plot.ipynb}{bimodal\_dist\_plot.ipynb}. \relax }}{40}{figure.caption.30}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.5}{\ignorespaces Illustration of Anscombe's quartet. All of these datasets have the same low order summary statistics. Generated by \href {https://probml.github.io/notebooks\#anscombes\_quartet.ipynb}{anscombes\_quartet.ipynb}. \relax }}{41}{figure.caption.31}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.6}{\ignorespaces Illustration of the Datasaurus Dozen. All of these datasets have the same low order summary statistics. Adapted from Figure 1 of \citep {Matejka2017}. Generated by \href {https://probml.github.io/notebooks\#datasaurus\_dozen.ipynb}{datasaurus\_dozen.ipynb}. \relax }}{42}{figure.caption.32}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.7}{\ignorespaces Illustration of 7 different datasets (left), the corresponding box plots (middle) and violin box plots (right). From Figure 8 of \url {https://www.autodesk.com/research/publications/same-stats-different-graphs}. Used with kind permission of Justin Matejka. \relax }}{43}{figure.caption.33}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.8}{\ignorespaces Any planar line-drawing is geometrically consistent with infinitely many 3-D structures. From Figure 11 of \citep {Sinha1993}. Used with kind permission of Pawan Sinha. \relax }}{47}{figure.caption.36}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.9}{\ignorespaces Illustration of the binomial distribution with $N=10$ and (a) $\theta =0.25$ and (b) $\theta =0.9$. Generated by \href {https://probml.github.io/notebooks\#binom\_dist\_plot.ipynb}{binom\_dist\_plot.ipynb}. \relax }}{48}{figure.caption.37}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.10}{\ignorespaces (a) The sigmoid (logistic) function $\sigma (a)=(1+e^{-a})^{-1}$. (b) The Heaviside function $\mathbb {I}\left ({a>0}\right )$. Generated by \href {https://probml.github.io/notebooks\#activation\_fun\_plot.ipynb}{activation\_fun\_plot.ipynb}. \relax }}{49}{figure.caption.38}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.11}{\ignorespaces Logistic regression applied to a 1-dimensional, 2-class version of the Iris dataset. Generated by \href {https://probml.github.io/notebooks\#iris\_logreg.ipynb}{iris\_logreg.ipynb}. Adapted from Figure 4.23 of \citep {Geron2019}. \relax }}{50}{figure.caption.40}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.12}{\ignorespaces Softmax distribution $\mathrm {softmax}({\bm {a}}/T)$, where ${\bm {a}}=(3,0,1)$, at temperatures of $T=100$, $T=2$ and $T=1$. When the temperature is high (left), the distribution is uniform, whereas when the temperature is low (right), the distribution is ``spiky'', with most of its mass on the largest element. Generated by \href {https://probml.github.io/notebooks\#softmax\_plot.ipynb}{softmax\_plot.ipynb}. \relax }}{52}{figure.caption.41}% 
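A minimal NumPy sketch (not the code in softmax_plot.ipynb) of the tempered softmax described in this caption, evaluated at the same ${\bm {a}}=(3,0,1)$ and the same temperatures:

import numpy as np

def softmax(a):
    a = a - a.max()           # subtract the max for numerical stability
    e = np.exp(a)
    return e / e.sum()

a = np.array([3.0, 0.0, 1.0])
for T in [100, 2, 1]:
    print(T, softmax(a / T))  # near-uniform at T=100, spiky at T=1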
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.13}{\ignorespaces Logistic regression on the 3-class, 2-feature version of the Iris dataset. Adapted from Figure 4.25 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#iris\_logreg.ipynb}{iris\_logreg.ipynb}. \relax }}{53}{figure.caption.42}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.14}{\ignorespaces Linear regression using Gaussian output with mean $\mu (x)=b + w x$ and (a) fixed variance $\sigma ^2$ (homoscedastic) or (b) input-dependent variance $\sigma (x)^2$ (heteroscedastic). Generated by \href {https://probml.github.io/notebooks\#linreg\_1d\_hetero\_tfp.ipynb}{linreg\_1d\_hetero\_tfp.ipynb}. \relax }}{57}{figure.caption.43}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.15}{\ignorespaces (a) The pdf's for a $\mathcal {N}(0,1)$, $\mathcal {T}(\mu =0,\sigma =1,\nu =1)$, $\mathcal {T}(\mu =0,\sigma =1,\nu =2)$, and $\mathrm {Laplace}(0,1/\sqrt {2})$. The mean is 0 and the variance is 1 for both the Gaussian and Laplace. When $\nu =1$, the Student is the same as the Cauchy, which does not have a well-defined mean and variance. (b) Log of these pdf's. Note that the Student distribution is not log-concave for any parameter value, unlike the Laplace distribution. Nevertheless, both are unimodal. Generated by \href {https://probml.github.io/notebooks\#student\_laplace\_pdf\_plot.ipynb}{student\_laplace\_pdf\_plot.ipynb}. \relax }}{59}{figure.caption.44}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.16}{\ignorespaces Illustration of the effect of outliers on fitting Gaussian, Student and Laplace distributions. (a) No outliers (the Gaussian and Student curves are on top of each other). (b) With outliers. We see that the Gaussian is more affected by outliers than the Student and Laplace distributions. Adapted from Figure 2.16 of \citep {BishopBook}. Generated by \href {https://probml.github.io/notebooks\#robust\_pdf\_plot.ipynb}{robust\_pdf\_plot.ipynb}. \relax }}{60}{figure.caption.45}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.17}{\ignorespaces (a) Some beta distributions. If $a<1$, we get a ``spike'' on the left, and if $b<1$, we get a ``spike'' on the right. If $a=b=1$, the distribution is uniform. If $a>1$ and $b>1$, the distribution is unimodal. Generated by \href {https://probml.github.io/notebooks\#beta\_dist\_plot.ipynb}{beta\_dist\_plot.ipynb}. (b) Some gamma distributions. If $a\leq 1$, the mode is at 0, otherwise the mode is away from 0. As we increase the rate $b$, we reduce the horizontal scale, thus squeezing everything leftwards and upwards. Generated by \href {https://probml.github.io/notebooks\#gamma\_dist\_plot.ipynb}{gamma\_dist\_plot.ipynb}. \relax }}{62}{figure.caption.46}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.18}{\ignorespaces Illustration of the (a) empirical pdf and (b) empirical cdf derived from a set of $N=5$ samples. From \url {https://bit.ly/3hFgi0e}. Used with kind permission of Mauro Escudero. \relax }}{63}{figure.caption.47}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.19}{\ignorespaces (a) Mapping a uniform pdf through the function $f(x) = 2x + 1$. (b) Illustration of how two nearby points, $x$ and $x+dx$, get mapped under $f$. If $\frac {dy}{dx}>0$, the function is locally increasing, but if $\frac {dy}{dx}<0$, the function is locally decreasing. From \citep {JangBlog}. Used with kind permission of Eric Jang. \relax }}{65}{figure.caption.48}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.20}{\ignorespaces Illustration of an affine transformation applied to a unit square, $f({\bm {x}}) = \mathbf {A}{\bm {x}}+ {\bm {b}}$. (a) Here $\mathbf {A}=\mathbf {I}$. (b) Here ${\bm {b}}=\boldsymbol {0}$. From \citep {JangBlog}. Used with kind permission of Eric Jang. \relax }}{66}{figure.caption.49}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.21}{\ignorespaces Change of variables from polar to Cartesian. The area of the shaded patch is $r \tmspace +\thickmuskip {.2777em} dr \tmspace +\thickmuskip {.2777em} d\theta $. Adapted from Figure 3.16 of \citep {Rice95}. \relax }}{67}{figure.caption.50}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.22}{\ignorespaces Distribution of the sum of two dice rolls, i.e., $p(y)$ where $y=x_1 + x_2$ and $x_i \sim \mathrm {Unif}(\{1,2,\ldots ,6\})$. From \url {https://en.wikipedia.org/wiki/Probability\_distribution}. Used with kind permission of Wikipedia author Tim Stellmach. \relax }}{69}{figure.caption.52}% 
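A small sketch (assuming NumPy only) of computing this same distribution by convolving the two uniform pmfs:

import numpy as np

px = np.ones(6) / 6               # pmf of a single fair die, Unif({1,...,6})
py = np.convolve(px, px)          # pmf of the sum, supported on 2..12
for y, p in zip(range(2, 13), py):
    print(y, p)                   # peaks at y=7 with probability 6/36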
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.23}{\ignorespaces The central limit theorem in pictures. We plot a histogram of $\cc@accent {"705E}{\mu }_N^s = \frac {1}{{N_{{\mathcal {D}}}}} \DOTSB \sum@ \slimits@ _{n=1}^{N_{{\mathcal {D}}}}x_{ns}$, where $x_{ns} \sim \mathrm {Beta}(1,5)$, for $s=1:10000$. As ${N_{{\mathcal {D}}}}\rightarrow \infty $, the distribution tends towards a Gaussian. (a) $N=1$. (b) $N=5$. Adapted from Figure 2.6 of \citep {BishopBook}. Generated by \href {https://probml.github.io/notebooks\#centralLimitDemo.ipynb}{centralLimitDemo.ipynb}. \relax }}{70}{figure.caption.53}% 
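A minimal NumPy sketch (not centralLimitDemo.ipynb) of the experiment in this caption: sample means of Beta(1,5) draws concentrate around 1/6 as N grows, and their histogram becomes increasingly Gaussian:

import numpy as np

rng = np.random.default_rng(0)
S = 10_000                                 # number of repeated trials
for N in [1, 5]:                           # panels (a) and (b)
    x = rng.beta(1, 5, size=(S, N))
    mu_hat = x.mean(axis=1)                # one sample mean per trial
    print(N, mu_hat.mean(), mu_hat.std())  # spread shrinks as N grows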
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {2.24}{\ignorespaces Computing the distribution of $y=x^2$, where $p(x)$ is uniform (left). The analytic result is shown in the middle, and the Monte Carlo approximation is shown on the right. Generated by \href {https://probml.github.io/notebooks\#change\_of\_vars\_demo1d.ipynb}{change\_of\_vars\_demo1d.ipynb}. \relax }}{71}{figure.caption.54}% 
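A minimal sketch of the Monte Carlo approximation mentioned in this caption (assuming NumPy, and assuming $x \sim \mathrm{Unif}(-1,1)$, for which the analytic density is $p(y)=1/(2\sqrt{y})$ on $(0,1)$; this is not the code in change_of_vars_demo1d.ipynb):

import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(-1, 1, size=100_000)     # samples from p(x) (assumed range)
y = x ** 2
hist, edges = np.histogram(y, bins=50, density=True)
centers = 0.5 * (edges[:-1] + edges[1:])
for c, h in zip(centers[::10], hist[::10]):
    print(c, h, 1 / (2 * np.sqrt(c)))    # MC estimate vs analytic density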
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.1}{\ignorespaces Several sets of $(x, y)$ points, with the correlation coefficient of $x$ and $y$ for each set. Note that the correlation reflects the noisiness and direction of a linear relationship (top row), but not the slope of that relationship (middle), nor many aspects of nonlinear relationships (bottom). (Note: the figure in the center has a slope of 0 but in that case the correlation coefficient is undefined because the variance of $Y$ is zero.) From \url {https://en.wikipedia.org/wiki/Pearson_correlation_coefficient}. Used with kind permission of Wikipedia author Imagecreator. \relax }}{76}{figure.caption.55}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.2}{\ignorespaces Examples of spurious correlation between causally unrelated time series. Consumption of ice cream (red) and violent crime rate (yellow) over time. From \url {http://icbseverywhere.com/blog/2014/10/the-logic-of-causal-conclusions/}. Used with kind permission of Barbara Drescher. \relax }}{77}{figure.caption.56}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.3}{\ignorespaces Illustration of Simpson's paradox on the Iris dataset. (Left) Overall, $y$ (sepal width) decreases with $x$ (sepal length). (Right) Within each group, $y$ increases with $x$. Generated by \href {https://probml.github.io/notebooks\#simpsons\_paradox.ipynb}{simpsons\_paradox.ipynb}. \relax }}{78}{figure.caption.57}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.4}{\ignorespaces Illustration of Simpson's paradox using COVID-19\xspace data. (a) Case fatality rates (CFRs) in Italy and China by age group, and in aggregated form (``Total'', last pair of bars), up to the time of reporting (see legend). (b) Proportion of all confirmed cases included in (a) within each age group by country. From Figure 1 of \citep {VonKugelgen2020}. Used with kind permission of Julius von K{\"u}gelgen. \relax }}{78}{figure.caption.58}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.5}{\ignorespaces Visualization of a 2d Gaussian density as a surface plot. (a) Distribution using a full covariance matrix can be oriented at any angle. (b) Distribution using a diagonal covariance matrix must be parallel to the axis. (c) Distribution using a spherical covariance matrix must have a symmetric shape. Generated by \href {https://probml.github.io/notebooks\#gauss\_plot\_2d.ipynb}{gauss\_plot\_2d.ipynb}. \relax }}{80}{figure.caption.59}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.6}{\ignorespaces Visualization of a 2d Gaussian density in terms of level sets of constant probability density. (a) A full covariance matrix has elliptical contours. (b) A diagonal covariance matrix is an {\bf axis aligned} ellipse. (c) A spherical covariance matrix has a circular shape. Generated by \href {https://probml.github.io/notebooks\#gauss\_plot\_2d.ipynb}{gauss\_plot\_2d.ipynb}. \relax }}{80}{figure.caption.60}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.7}{\ignorespaces Illustration of data imputation using an MVN. (a) Visualization of the data matrix. Blank entries are missing (not observed). Blue are positive, green are negative. Area of the square is proportional to the value. (This is known as a {\bf Hinton diagram}, named after Geoff Hinton, a famous ML researcher.) (b) True data matrix (hidden). (c) Mean of the posterior predictive distribution, based on partially observed data in that row, using the true model parameters. Generated by \href {https://probml.github.io/notebooks\#gauss\_imputation\_known\_params\_demo.ipynb}{gauss\_imputation\_known\_params\_demo.ipynb}. \relax }}{84}{figure.caption.61}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.8}{\ignorespaces Inference about $z$ given a noisy observation $y=3$. (a) Strong prior $\mathcal {N}(0,1)$. The posterior mean is ``shrunk'' towards the prior mean, which is 0. (b) Weak prior $\mathcal {N}(0,5)$. The posterior mean is similar to the MLE. Generated by \href {https://probml.github.io/notebooks\#gauss\_infer\_1d.ipynb}{gauss\_infer\_1d.ipynb}. \relax }}{87}{figure.caption.62}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.9}{\ignorespaces Illustration of Bayesian inference for a 2d Gaussian random vector ${\bm {z}}$. (a) The data is generated from ${\bm {y}}_n \sim \mathcal {N}({\bm {z}},\boldsymbol {\Sigma }_y)$, where ${\bm {z}}=[0.5, 0.5]^{{\mkern -1.5mu\mathsf {T}}}$ and $\boldsymbol {\Sigma }_y=0.1 [2, 1; 1, 1]$. We assume the sensor noise covariance $\boldsymbol {\Sigma }_y$ is known but ${\bm {z}}$ is unknown. The black cross represents ${\bm {z}}$. (b) The prior is $p({\bm {z}}) = \mathcal {N}({\bm {z}}|\boldsymbol {0},0.1 \mathbf {I}_2)$. (c) We show the posterior after 10 data points have been observed. Generated by \href {https://probml.github.io/notebooks\#gauss\_infer\_2d.ipynb}{gauss\_infer\_2d.ipynb}. \relax }}{89}{figure.caption.63}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.10}{\ignorespaces We observe ${\bm {y}}_1=(0,-1)$ (red cross) and ${\bm {y}}_2=(1,0)$ (green cross) and estimate $\mathbb {E}\left [{{\bm {z}}|{\bm {y}}_1,{\bm {y}}_2}\right ]$ (black cross). (a) Equally reliable sensors, so the posterior mean estimate is in between the two circles. (b) Sensor 2 is more reliable, so the estimate shifts more towards the green circle. (c) Sensor 1 is more reliable in the vertical direction, and Sensor 2 is more reliable in the horizontal direction. The estimate is an appropriate combination of the two measurements. Generated by \href {https://probml.github.io/notebooks\#sensor\_fusion\_2d.ipynb}{sensor\_fusion\_2d.ipynb}. \relax }}{90}{figure.caption.64}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.11}{\ignorespaces A mixture of 3 Gaussians in 2d. (a) We show the contours of constant probability for each component in the mixture. (b) A surface plot of the overall density. Adapted from Figure 2.23 of \citep {BishopBook}. Generated by \href {https://probml.github.io/notebooks\#gmm\_plot\_demo.ipynb}{gmm\_plot\_demo.ipynb} \relax }}{94}{figure.caption.65}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.12}{\ignorespaces (a) Some data in 2d. (b) A possible clustering using $K=5$ clusters computed using a GMM. Generated by \href {https://probml.github.io/notebooks\#gmm\_2d.ipynb}{gmm\_2d.ipynb}. \relax }}{94}{figure.caption.66}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.13}{\ignorespaces We fit a mixture of 20 Bernoullis to the binarized MNIST digit data. We visualize the estimated cluster means $\cc@accent {"705E}{\boldsymbol {\mu }}_k$. The numbers on top of each image represent the estimated mixing weights $\cc@accent {"705E}{\pi }_k$. No labels were used when training the model. Generated by \href {https://probml.github.io/notebooks\#mix\_bernoulli\_em\_mnist.ipynb}{mix\_bernoulli\_em\_mnist.ipynb}. \relax }}{95}{figure.caption.67}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.14}{\ignorespaces Water sprinkler PGM with corresponding binary CPTs. T and F stand for true and false. \relax }}{96}{figure.caption.68}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.15}{\ignorespaces Illustration of first and second order autoregressive (Markov) models. \relax }}{98}{figure.caption.69}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.16}{\ignorespaces Left: data points ${\bm {y}}_n$ are conditionally independent given ${\bm {\theta }}$. Right: Same model, using plate notation. This represents the same model as the one on the left, except the repeated ${\bm {y}}_n$ nodes are inside a box, known as a plate; the number in the lower right hand corner, $N$, specifies the number of repetitions of the ${\bm {y}}_n$ node. \relax }}{100}{figure.caption.70}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {3.17}{\ignorespaces A Gaussian mixture model represented as a graphical model. \relax }}{101}{figure.caption.71}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.1}{\ignorespaces (a) Covariance matrix for the features in the iris dataset from \cref {sec:iris}. (b) Correlation matrix. We only show the lower triangle, since the matrix is symmetric and has a unit diagonal. Compare this to \cref {fig:irisPairs}. Generated by \href {https://probml.github.io/notebooks\#iris\_cov\_mat.ipynb}{iris\_cov\_mat.ipynb}. \relax }}{110}{figure.caption.72}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.2}{\ignorespaces Illustration of various loss functions for binary classification. The horizontal axis is the margin $z=\cc@accent {"707E}{y}\eta $, the vertical axis is the loss. 0-1 loss is $\mathbb {I}\left ({z < 0}\right )$. Hinge-loss is $\qopname \relax m{max}(0,1-z)$. Log-loss is $\qopname \relax o{log}_2(1+e^{-z})$. Exp-loss is $e^{-z}$. Generated by \href {https://probml.github.io/notebooks\#hinge\_loss\_plot.ipynb}{hinge\_loss\_plot.ipynb}. \relax }}{113}{figure.caption.73}% 
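A minimal NumPy sketch (not hinge_loss_plot.ipynb) evaluating the four losses in this caption as functions of the margin $z$:

import numpy as np

z = np.linspace(-2, 2, 9)
zero_one = (z < 0).astype(float)       # 0-1 loss
hinge = np.maximum(0, 1 - z)           # hinge loss
log_loss = np.log2(1 + np.exp(-z))     # log-loss (base 2)
exp_loss = np.exp(-z)                  # exponential loss
print(np.c_[z, zero_one, hinge, log_loss, exp_loss])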
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of exponentially-weighted moving average with and without bias correction. (a) Short memory: $\beta =0.9$. (b) Long memory: $\beta =0.99$. Generated by \href {https://probml.github.io/notebooks\#ema\_demo.ipynb}{ema\_demo.ipynb}. \relax }}{116}{figure.caption.74}%
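A minimal sketch (assuming NumPy; the noisy sinusoid below is a made-up example, not the data in ema_demo.ipynb) of an exponentially-weighted moving average with and without bias correction:

import numpy as np

def ema(y, beta, correct_bias=True):
    mu, out = 0.0, []
    for t, yt in enumerate(y, start=1):
        mu = beta * mu + (1 - beta) * yt      # standard EWMA update
        out.append(mu / (1 - beta ** t) if correct_bias else mu)
    return np.array(out)

rng = np.random.default_rng(0)
y = np.sin(np.linspace(0, 10, 200)) + 0.1 * rng.standard_normal(200)
print(ema(y, beta=0.9)[:3])                      # corrected: no startup bias
print(ema(y, beta=0.9, correct_bias=False)[:3])  # uncorrected: starts near 0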
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.4}{\ignorespaces Estimating a covariance matrix in $D=50$ dimensions using $N \in \{100, 50, 25\}$ samples. We plot the eigenvalues in descending order for the true covariance matrix (solid black), the MLE (dotted blue) and the MAP estimate (dashed red), using \cref {eqn:covShrinkLedoit} with $\lambda =0.9$. We also list the condition number of each matrix in the legend. We see that the MLE is often poorly conditioned, but the MAP estimate is numerically well behaved. Adapted from Figure 1 of \citep {Schafer05}. Generated by \href {https://probml.github.io/notebooks\#shrinkcov\_plots.ipynb}{shrinkcov\_plots.ipynb}. \relax }}{118}{figure.caption.75}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.5}{\ignorespaces (a-c) Ridge regression applied to a degree 14 polynomial fit to 21 datapoints. (d) MSE vs strength of regularizer. The degree of regularization increases from left to right, so model complexity decreases from left to right. Generated by \href {https://probml.github.io/notebooks\#linreg\_poly\_ridge.ipynb}{linreg\_poly\_ridge.ipynb}. \relax }}{119}{figure.caption.76}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.6}{\ignorespaces Schematic of 5-fold cross validation. \relax }}{121}{figure.caption.77}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.7}{\ignorespaces Ridge regression is applied to a degree 14 polynomial fit to 21 datapoints shown in \cref {fig:polyfitRidge} for different values of the regularizer $\lambda $. The degree of regularization increases from left to right, so model complexity decreases from left to right. (a) MSE on train (blue) and test (red) vs $\qopname \relax o{log}(\lambda )$. (b) 5-fold cross-validation estimate of test MSE; error bars are standard error of the mean. Vertical line is the point chosen by the one standard error rule. Generated by \href {https://probml.github.io/notebooks\#polyfitRidgeCV.ipynb}{polyfitRidgeCV.ipynb}. \relax }}{122}{figure.caption.78}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.8}{\ignorespaces Performance of a text classifier (a neural network applied to a bag of word embeddings using average pooling) vs number of training epochs on the IMDB movie sentiment dataset. Blue = train, red = validation. (a) Cross entropy loss. Early stopping is triggered at about epoch 25. (b) Classification accuracy. Generated by \href {https://probml.github.io/notebooks\#imdb\_mlp\_bow\_tf.ipynb}{imdb\_mlp\_bow\_tf.ipynb}. \relax }}{123}{figure.caption.79}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.9}{\ignorespaces MSE on training and test sets vs size of training set, for data generated from a degree 2 polynomial with Gaussian noise of variance $\sigma ^2=4$. We fit polynomial models of varying degree to this data. Generated by \href {https://probml.github.io/notebooks\#linreg\_poly\_vs\_n.ipynb}{linreg\_poly\_vs\_n.ipynb}. \relax }}{124}{figure.caption.80}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.10}{\ignorespaces Updating a Beta prior with a Bernoulli likelihood with sufficient statistics ${N_{{\mathcal {D}}}}_1=4,{N_{{\mathcal {D}}}}_0=1$. (a) Beta(2,2) prior. (b) Uniform Beta(1,1) prior. Generated by \href {https://probml.github.io/notebooks\#beta\_binom\_post\_plot.ipynb}{beta\_binom\_post\_plot.ipynb}. \relax }}{127}{figure.caption.81}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.11}{\ignorespaces Illustration of sequential Bayesian updating for the beta-Bernoulli model. Each colored box represents the predicted distribution $p(x_t|{\bm {h}}_t)$, where ${\bm {h}}_t=(N_{1,t},N_{0,t})$ is the sufficient statistic derived from the history of observations up until time $t$, namely the total number of heads and tails. The probability of heads (blue bar) is given by $p(x_t=1|{\bm {h}}_t) = (N_{1,t} + 1)/(t+2)$, assuming we start with a uniform $\mathrm {Beta}(\theta |1,1)$ prior. From Figure 3 of \citep {Ortega2019}. Used with kind permission of Pedro Ortega. \relax }}{130}{figure.caption.83}%
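A minimal NumPy sketch of the sequential update in this caption, using the predictive rule $p(x_t=1|h_t)=(N_{1,t}+1)/(t+2)$ under a uniform prior; the simulated coin-flip stream below is an illustrative assumption:

import numpy as np

rng = np.random.default_rng(0)
xs = rng.binomial(1, 0.7, size=10)     # assumed stream of coin flips
N1 = 0                                 # number of heads seen so far
for t, x in enumerate(xs):
    p_heads = (N1 + 1) / (t + 2)       # predictive probability of heads
    print(t, p_heads)
    N1 += x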
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.12}{\ignorespaces (a) Posterior predictive distributions for 10 future trials after seeing ${N_{{\mathcal {D}}}}_1=4$ heads and ${N_{{\mathcal {D}}}}_0=1$ tails. (b) Plug-in approximation based on the same data. In both cases, we use a uniform prior. Generated by \href {https://probml.github.io/notebooks\#beta\_binom\_post\_pred\_plot.ipynb}{beta\_binom\_post\_pred\_plot.ipynb}. \relax }}{131}{figure.caption.85}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.13}{\ignorespaces A mixture of two Beta distributions. Generated by \href {https://probml.github.io/notebooks\#mixbetademo.ipynb}{mixbetademo.ipynb}. \relax }}{133}{figure.caption.86}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.14}{\ignorespaces (a) The Dirichlet distribution when $K=3$ defines a distribution over the simplex, which can be represented by the triangular surface. Points on this surface satisfy $0 \leq \theta _k \leq 1$ and $\DOTSB \sum@ \slimits@ _{k=1}^3 \theta _k = 1$. Generated by \href {https://probml.github.io/notebooks\#dirichlet\_3d\_triangle\_plot.ipynb}{dirichlet\_3d\_triangle\_plot.ipynb}. (b) Plot of the Dirichlet density for $\oset {\smallsmile }{\boldsymbol {\alpha }}=(20,20,20)$. (c) Plot of the Dirichlet density for $\oset {\smallsmile }{\boldsymbol {\alpha }}=(3,3,20)$. (d) Plot of the Dirichlet density for $\oset {\smallsmile }{\boldsymbol {\alpha }}=(0.1, 0.1, 0.1)$. Generated by \href {https://probml.github.io/notebooks\#dirichlet\_3d\_spiky\_plot.ipynb}{dirichlet\_3d\_spiky\_plot.ipynb}. \relax }}{134}{figure.caption.87}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.15}{\ignorespaces Samples from a 5-dimensional symmetric Dirichlet distribution for different parameter values. (a) $\oset {\smallsmile }{\boldsymbol {\alpha }} = (0.1,\ldots ,0.1)$. This results in very sparse distributions, with many 0s. (b) $\oset {\smallsmile }{\boldsymbol {\alpha }} = (1,\ldots ,1)$. This results in more uniform (and dense) distributions. Generated by \href {https://probml.github.io/notebooks\#dirichlet\_samples\_plot.ipynb}{dirichlet\_samples\_plot.ipynb}. \relax }}{135}{figure.caption.88}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.16}{\ignorespaces Inferring the mean of a univariate Gaussian with known $\sigma ^2$ given observation $y=3$. (a) Using strong prior, $p(\mu ) = \mathcal {N}(\mu |0,1)$. (b) Using weak prior, $p(\mu ) = \mathcal {N}(\mu |0,5)$. Generated by \href {https://probml.github.io/notebooks\#gauss\_infer\_1d.ipynb}{gauss\_infer\_1d.ipynb}. \relax }}{138}{figure.caption.89}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.17}{\ignorespaces Illustration of Bayesian inference for the mean of a 2d Gaussian. (a) The data is generated from ${\bm {y}}_n \sim \mathcal {N}(\boldsymbol {\mu },\boldsymbol {\Sigma })$, where $\boldsymbol {\mu }=[0.5, 0.5]^{{\mkern -1.5mu\mathsf {T}}}$ and $\boldsymbol {\Sigma }=0.1 [2, 1; 1, 1]$. (b) The prior is $p(\boldsymbol {\mu }) = \mathcal {N}(\boldsymbol {\mu }|\boldsymbol {0},0.1 \mathbf {I}_2)$. (c) We show the posterior after 10 data points have been observed. Generated by \href {https://probml.github.io/notebooks\#gauss\_infer\_2d.ipynb}{gauss\_infer\_2d.ipynb}. \relax }}{140}{figure.caption.92}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.18}{\ignorespaces (a) Central interval and (b) HPD region for a Beta(3,9) posterior. The CI is (0.06, 0.52) and the HPD is (0.04, 0.48). Adapted from Figure 3.6 of \citep {Hoff09}. Generated by \href {https://probml.github.io/notebooks\#betaHPD.ipynb}{betaHPD.ipynb}. \relax }}{142}{figure.caption.93}% 
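A minimal SciPy sketch (not betaHPD.ipynb) of the central interval quoted in this caption; the HPD region additionally requires a small numerical optimization:

from scipy.stats import beta

lo, hi = beta.ppf([0.025, 0.975], 3, 9)   # central 95% interval of Beta(3,9)
print(lo, hi)                             # roughly (0.06, 0.52)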
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.19}{\ignorespaces (a) Central interval and (b) HPD region for a hypothetical multimodal posterior. Adapted from Figure 2.2 of \citep {Gelman04}. Generated by \href {https://probml.github.io/notebooks\#postDensityIntervals.ipynb}{postDensityIntervals.ipynb}. \relax }}{143}{figure.caption.94}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.20}{\ignorespaces (a) Logistic regression for classifying if an Iris flower is Versicolor ($y=1$) or Setosa ($y=0$) using a single input feature $x$ corresponding to sepal length. Labeled points have been (vertically) jittered to avoid overlapping too much. Vertical line is the decision boundary. Generated by \href {https://probml.github.io/notebooks\#logreg\_iris\_1d.ipynb}{logreg\_iris\_1d.ipynb}. (b) Same as (a) but showing posterior distribution. Adapted from Figure 4.4 of \citep {Martin2018}. Generated by \href {https://probml.github.io/notebooks\#logreg\_iris\_bayes\_1d\_pymc3.ipynb}{logreg\_iris\_bayes\_1d\_pymc3.ipynb}. \relax }}{144}{figure.caption.95}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.21}{\ignorespaces Distribution of arrival times for two different shipping companies. ETA is the expected time of arrival. A's distribution has greater uncertainty, and may be too risky. From \url {https://bit.ly/39bc4XL}. Used with kind permission of Brendan Hasz. \relax }}{146}{figure.caption.96}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.22}{\ignorespaces Approximating the posterior of a beta-Bernoulli model. (a) Grid approximation using 20 grid points. (b) Laplace approximation. Generated by \href {https://probml.github.io/notebooks\#laplace\_approx\_beta\_binom\_jax.ipynb}{laplace\_approx\_beta\_binom\_jax.ipynb}. \relax }}{147}{figure.caption.97}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.23}{\ignorespaces Bootstrap (top row) vs Bayes (bottom row). The $N$ data cases were generated from $\mathrm {Ber}(\theta =0.7)$. Left column: $N=10$. Right column: $N=100$. (a-b) A bootstrap approximation to the sampling distribution of the MLE for a Bernoulli distribution. We show the histogram derived from $B=10,000$ bootstrap samples. (c-d) Histogram of 10,000 samples from the posterior distribution using a uniform prior. Generated by \href {https://probml.github.io/notebooks\#bootstrapDemoBer.ipynb}{bootstrapDemoBer.ipynb}. \relax }}{152}{figure.caption.98}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.24}{\ignorespaces Left: Sampling distribution of the MAP estimate (equivalent to the posterior mean) under a $\mathcal {N}(\theta _0=0,\sigma ^2/\kappa _0)$ prior with different prior strengths $\kappa _0$. (If we set $\kappa =0$, the MAP estimate reduces to the MLE.) The data is $n=5$ samples drawn from $\mathcal {N}(\theta ^*=1,\sigma ^2=1)$. Right: MSE relative to that of the MLE versus sample size. Adapted from Figure 5.6 of \citep {Hoff09}. Generated by \href {https://probml.github.io/notebooks\#samplingDistributionGaussianShrinkage.ipynb}{samplingDistributionGaussianShrinkage.ipynb}. \relax }}{157}{figure.caption.99}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.25}{\ignorespaces Illustration of bias-variance tradeoff for ridge regression. We generate 100 data sets from the true function, shown in solid green. Left: we plot the regularized fit for 20 different data sets. We use linear regression with a Gaussian RBF expansion, with 25 centers evenly spread over the $[0,1]$ interval. Right: we plot the average of the fits, averaged over all 100 datasets. Top row: strongly regularized: we see that the individual fits are similar to each other (low variance), but the average is far from the truth (high bias). Bottom row: lightly regularized: we see that the individual fits are quite different from each other (high variance), but the average is close to the truth (low bias). Adapted from \citep {BishopBook} Figure 3.5. Generated by \href {https://probml.github.io/notebooks\#biasVarModelComplexity3.ipynb}{biasVarModelComplexity3.ipynb}. \relax }}{158}{figure.caption.100}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {4.26}{\ignorespaces Cartoon illustration of the bias variance tradeoff. From \url {http://scott.fortmann-roe.com/docs/BiasVariance.html}. Used with kind permission of Scott Fortmann-Roe. \relax }}{158}{figure.caption.101}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {5.1}{\ignorespaces For some regions of input space, where the class posteriors are uncertain, we may prefer not to choose class 1 or 2; instead we may prefer the reject option. Adapted from Figure 1.26 of \citep {BishopBook}. \relax }}{167}{figure.caption.104}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {5.2}{\ignorespaces (a) ROC curves for two hypothetical classification systems. The red curve for system A is better than the blue curve for system B. We plot the true positive rate (TPR) vs the false positive rate (FPR) as we vary the threshold $\tau $. We also indicate the equal error rate (EER) with the red and blue dots, and the area under the curve (AUC) for classifier B by the shaded area. Generated by \href {https://probml.github.io/notebooks\#roc\_plot.ipynb}{roc\_plot.ipynb}. (b) A precision-recall curve for two hypothetical classification systems. The red curve for system A is better than the blue curve for system B. Generated by \href {https://probml.github.io/notebooks\#pr\_plot.ipynb}{pr\_plot.ipynb}. \relax }}{169}{figure.caption.108}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {5.3}{\ignorespaces Illustration of $\ell _2$, $\ell _1$, and Huber loss functions with $\delta =1.5$. Generated by \href {https://probml.github.io/notebooks\#huberLossPlot.ipynb}{huberLossPlot.ipynb}. \relax }}{173}{figure.caption.109}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {5.4}{\ignorespaces (a) Log marginal likelihood vs number of heads for the coin tossing example. (b) BIC approximation. (The vertical scale is arbitrary, since we are holding $N$ fixed.) Generated by \href {https://probml.github.io/notebooks\#coins\_model\_sel\_demo.ipynb}{coins\_model\_sel\_demo.ipynb}. \relax }}{176}{figure.caption.111}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {5.5}{\ignorespaces Illustration of Bayesian model selection for polynomial regression. (a-c) We fit polynomials of degrees 1, 2 and 3 to $N=5$ data points. The solid green curve is the true function, the dashed red curve is the prediction (dotted blue lines represent $\pm 2 \sigma $ around the mean). (d) We plot the posterior over models, $p(m|{\mathcal {D}})$, assuming a uniform prior $p(m) \propto 1$. Generated by \href {https://probml.github.io/notebooks\#linreg\_eb\_modelsel\_vs\_n.ipynb}{linreg\_eb\_modelsel\_vs\_n.ipynb}. \relax }}{178}{figure.caption.112}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {5.6}{\ignorespaces Same as \cref {fig:linregEbModelSelVsN5} except now $N=30$. Generated by \href {https://probml.github.io/notebooks\#linreg\_eb\_modelsel\_vs\_n.ipynb}{linreg\_eb\_modelsel\_vs\_n.ipynb}. \relax }}{179}{figure.caption.113}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {5.7}{\ignorespaces A schematic illustration of the Bayesian Occam's razor. The broad (green) curve corresponds to a complex model, the narrow (blue) curve to a simple model, and the middle (red) curve is just right. Adapted from Figure 3.13 of \citep {BishopBook}. See also \citep [Figure 2]{Murray05} for a similar plot produced on real data. \relax }}{180}{figure.caption.114}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {5.8}{\ignorespaces Risk functions for estimating the mean of a Gaussian. Each curve represents $R(\cc@accent {"705E}{\theta }_i(\cdot ),\theta ^*)$ plotted vs $\theta ^*$, where $i$ indexes the estimator. Each estimator is applied to $N$ samples from $\mathcal {N}(\theta ^*,\sigma ^2=1)$. The dark blue horizontal line is the sample mean (MLE); the red horizontal line is the sample median; the black curved line is the estimator $\cc@accent {"705E}{\theta }=\theta _0=0$; the green curved line is the posterior mean when $\kappa =1$; the light blue curved line is the posterior mean when $\kappa =5$. (a) ${N_{{\mathcal {D}}}}=5$ samples. (b) ${N_{{\mathcal {D}}}}=20$ samples. Adapted from Figure B.1 of \citep {Bernardo94}. Generated by \href {https://probml.github.io/notebooks\#riskFnGauss.ipynb}{riskFnGauss.ipynb}. \relax }}{185}{figure.caption.116}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {5.9}{\ignorespaces Risk functions for two decision procedures, $\pi _1$ and $\pi _2$. Since $\pi _1$ has lower worst case risk, it is the minimax estimator, even though $\pi _2$ has lower risk for most values of $\theta $. Thus minimax estimators are overly conservative. \relax }}{186}{figure.caption.117}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {5.10}{\ignorespaces (a) Illustration of the Neyman-Pearson hypothesis testing paradigm. Generated by \href {https://probml.github.io/notebooks\#neymanPearson2.ipynb}{neymanPearson2.ipynb}. (b) Two hypothetical two-sided power curves. B dominates A. Adapted from Figure 6.3.5 of \citep {Larsen86}. Generated by \href {https://probml.github.io/notebooks\#twoPowerCurves.ipynb}{twoPowerCurves.ipynb}. \relax }}{193}{figure.caption.118}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {5.11}{\ignorespaces Cartoon illustrating the difference between frequentists and Bayesians. (The $p < 0.05$ comment is explained in \cref {sec:pvaluesBad}. The betting comment is a reference to the Dutch book theorem, which essentially proves that the Bayesian approach to gambling (and other decision theory problems) is optimal, as explained in e.g., \citep {Hajek2008}.) From \url {https://xkcd.com/1132/}. Used with kind permission of Randall Munroe (author of xkcd). \relax }}{197}{figure.caption.120}%
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {6.1}{\ignorespaces Entropy of a Bernoulli random variable as a function of $\theta $. The maximum entropy is $\qopname \relax o{log}_2 2 = 1$. Generated by \href {https://probml.github.io/notebooks\#bernoulli\_entropy\_fig.ipynb}{bernoulli\_entropy\_fig.ipynb}. \relax }}{202}{figure.caption.121}% 
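A minimal NumPy sketch (not bernoulli_entropy_fig.ipynb) of the entropy curve in this caption, which peaks at 1 bit when $\theta=0.5$:

import numpy as np

def bernoulli_entropy(theta):
    theta = np.clip(theta, 1e-12, 1 - 1e-12)   # avoid log(0)
    return -(theta * np.log2(theta) + (1 - theta) * np.log2(1 - theta))

print(bernoulli_entropy(np.array([0.1, 0.5, 0.9])))   # maximum is 1 bit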
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {6.2}{\ignorespaces (a) Some aligned DNA sequences. Each row is a sequence, each column is a location within the sequence. (b) The corresponding {\bf position weight matrix} represented as a sequence logo. Each column represents a probability distribution over the alphabet $\{A,C,G,T\}$ for the corresponding location in the sequence. The size of the letter is proportional to the probability. The height of column $t$ is given by $2-H_t$, where $0 \leq H_t \leq 2$ is the entropy (in bits) of the distribution ${\bm {p}}_t$. Thus deterministic distributions (with an entropy of 0, corresponding to highly conserved locations) have height 2, and uniform distributions (with an entropy of 2) have height 0. Generated by \href {https://probml.github.io/notebooks\#seq\_logo\_demo.ipynb}{seq\_logo\_demo.ipynb}. \relax }}{202}{figure.caption.122}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {6.3}{\ignorespaces Illustrating forwards vs reverse KL on a bimodal distribution. The blue curves are the contours of the true distribution $p$. The red curves are the contours of the unimodal approximation $q$. (a) Minimizing forwards KL, $D_{\mathbb {KL}}\left ({p} \mathrel {\delimiter "026B30D } {q}\right )$, wrt $q$ causes $q$ to ``cover'' $p$. (b-c) Minimizing reverse KL, $D_{\mathbb {KL}}\left ({q} \mathrel {\delimiter "026B30D } {p}\right )$ wrt $q$ causes $q$ to ``lock onto'' one of the two modes of $p$. Adapted from Figure 10.3 of \citep {BishopBook}. Generated by \href {https://probml.github.io/notebooks\#KLfwdReverseMixGauss.ipynb}{KLfwdReverseMixGauss.ipynb}. \relax }}{211}{figure.caption.123}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {6.4}{\ignorespaces The marginal entropy, joint entropy, conditional entropy and mutual information represented as information diagrams. Used with kind permission of Katie Everett. \relax }}{212}{figure.caption.124}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {6.5}{\ignorespaces Illustration of how the maximal information coefficient (MIC) is computed. (a) We search over different grid resolutions, and grid cell locations, and compute the MI for each. (b) For each grid resolution $(k,l)$, we define set $M(k,l)$ to be the maximum MI for any grid of that size, normalized by $\qopname \relax o{log}(\qopname \relax m{min}(k,l))$. (c) We visualize the matrix $\mathbf {M}$. The maximum entry (denoted by a star) is defined to be the MIC. From Figure 1 of \citep {Reshef11}. Used with kind permission of David Reshef. \relax }}{216}{figure.caption.125}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {6.6}{\ignorespaces Plots of some 2d distributions and the corresponding estimate of correlation coefficient $R^2$ and the maximal information coefficient (MIC). Compare to \cref {fig:corrcoefWikipedia}. Generated by \href {https://probml.github.io/notebooks\#MIC\_correlation\_2d.ipynb}{MIC\_correlation\_2d.ipynb}. \relax }}{216}{figure.caption.126}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {6.7}{\ignorespaces Left: Correlation coefficient vs maximal information criterion (MIC) for all pairwise relationships in the WHO data. Right: scatter plots of certain pairs of variables. The red lines are non-parametric smoothing regressions fit separately to each trend. From Figure 4 of \citep {Reshef11}. Used with kind permission of David Reshef. \relax }}{217}{figure.caption.127}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {7.1}{\ignorespaces Illustration of a 1d vector, 2d matrix, and 3d tensor. The colors are used to represent individual entries of the vector; this list of numbers can also be stored in a 2d matrix, as shown. (In this example, the matrix is laid out in column-major order, which is the opposite of that used by Python.) We can also reshape the vector into a 3d tensor, as shown. \relax }}{225}{figure.caption.128}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {7.2}{\ignorespaces Illustration of (a) row-major vs (b) column-major order. From \url {https://commons.wikimedia.org/wiki/File:Row_and_column_major_order.svg}. Used with kind permission of Wikipedia author Cmglee. \relax }}{226}{figure.caption.129}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {7.3}{\ignorespaces (a) Top: A vector ${\bm {v}}$ (blue) is added to another vector ${\bm {w}}$ (red). Bottom: ${\bm {w}}$ is stretched by a factor of 2, yielding the sum ${\bm {v}}+ 2{\bm {w}}$. From \url {https://en.wikipedia.org/wiki/Vector_space}. Used with kind permission of Wikipedia author IkamusumeFan. (b) A vector ${\bm {v}}$ in $\mathbb {R}^2$ (blue) expressed in terms of different bases: using the standard basis of $\mathbb {R}^2$, ${\bm {v}}= x {\bm {e}}_1 + y {\bm {e}}_2$ (black), and using a different, non-orthogonal basis: ${\bm {v}}= {\bm {f}}_1 + {\bm {f}}_2$ (red). From \url {https://en.wikipedia.org/wiki/Vector_space}. Used with kind permission of Wikipedia author Jakob.scholbach. \relax }}{226}{figure.caption.130}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {7.4}{\ignorespaces Visualization of the nullspace and range of an $m \times n$ matrix $\mathbf {A}$. Here ${\bm {y}}_1 = \mathbf {A}{\bm {x}}_1$ and ${\bm {y}}_2 = \mathbf {A}{\bm {x}}_4$, so ${\bm {y}}_1$ and ${\bm {y}}_2$ are in the range of $\mathbf {A}$ (are reachable from some ${\bm {x}}$). Also $\mathbf {A}{\bm {x}}_2 = \boldsymbol {0}$ and $\mathbf {A}{\bm {x}}_3=\boldsymbol {0}$, so ${\bm {x}}_2$ and ${\bm {x}}_3$ are in the nullspace of $\mathbf {A}$ (get mapped to 0). We see that the range is often a strict subset of the output space (codomain) of the mapping. \relax }}{228}{figure.caption.131}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {7.5}{\ignorespaces Illustration of matrix multiplication. From \url {https://en.wikipedia.org/wiki/Matrix_multiplication}. Used with kind permission of Wikipedia author Bilou. \relax }}{238}{figure.caption.132}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {7.6}{\ignorespaces Visualization of a level set of the quadratic form $({\bm {x}}-\boldsymbol {\mu })^{{\mkern -1.5mu\mathsf {T}}} \mathbf {A}({\bm {x}}-\boldsymbol {\mu })$ in 2d. The major and minor axes of the ellipse are defined by the first two eigenvectors of $\mathbf {A}$, namely ${\bm {u}}_1$ and ${\bm {u}}_2$. Adapted from Figure 2.7 of \citep {BishopBook}. Generated by \href {https://probml.github.io/notebooks\#gaussEvec.ipynb}{gaussEvec.ipynb}. \relax }}{250}{figure.caption.133}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {7.7}{\ignorespaces (a) Height/weight data. (b) Standardized. (c) PCA whitening. (d) ZCA whitening. Numbers refer to the first 4 datapoints, but there are 73 datapoints in total. Generated by \href {https://probml.github.io/notebooks\#height\_weight\_whiten\_plot.ipynb}{height\_weight\_whiten\_plot.ipynb}. \relax }}{251}{figure.caption.134}%
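A minimal NumPy sketch of PCA and ZCA whitening on synthetic 2d data (the data below is made up; this is not height_weight_whiten_plot.ipynb). Both transforms give identity covariance; ZCA additionally rotates back towards the original axes:

import numpy as np

rng = np.random.default_rng(0)
X = rng.multivariate_normal([0, 0], [[2, 1], [1, 1]], size=500)  # fake data
Xc = X - X.mean(axis=0)                           # center the data
evals, U = np.linalg.eigh(np.cov(Xc, rowvar=False))
W_pca = np.diag(1.0 / np.sqrt(evals)) @ U.T       # PCA whitening matrix
W_zca = U @ W_pca                                 # ZCA whitening matrix
for W in (W_pca, W_zca):
    Z = Xc @ W.T
    print(np.round(np.cov(Z, rowvar=False), 2))   # approximately identity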
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {7.8}{\ignorespaces SVD decomposition of a matrix, $\mathbf {A}=\mathbf {U}\mathbf {S}\mathbf {V}^{{\mkern -1.5mu\mathsf {T}}}$. The shaded parts of each matrix are not computed in the economy-sized version. (a) Tall skinny matrix. (b) Short wide matrix. \relax }}{254}{figure.caption.135}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {7.9}{\ignorespaces Low rank approximations to an image. Top left: The original image is of size $200 \times 320$, so has rank 200. Subsequent images have ranks 2, 5, and 20. Generated by \href {https://probml.github.io/notebooks\#svd\_image\_demo.ipynb}{svd\_image\_demo.ipynb}. \relax }}{257}{figure.caption.136}% 
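A minimal NumPy sketch (not svd_image_demo.ipynb; the matrix below is random, standing in for the image) of rank-$k$ approximation via truncated SVD:

import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((200, 320))               # stand-in for the image
U, s, Vt = np.linalg.svd(A, full_matrices=False)  # economy-sized SVD
for k in [2, 5, 20]:
    Ak = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]    # best rank-k approximation
    print(k, np.linalg.norm(A - Ak) / np.linalg.norm(A))  # relative error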
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {7.10}{\ignorespaces First 100 log singular values for the clown image (red line), and for a data matrix obtained by randomly shuffling the pixels (blue line). Generated by \href {https://probml.github.io/notebooks\#svd\_image\_demo.ipynb}{svd\_image\_demo.ipynb}. Adapted from Figure 14.24 of \citep {HastieBook}. \relax }}{257}{figure.caption.137}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {7.11}{\ignorespaces Illustration of QR decomposition, $\mathbf {A}=\mathbf {Q}\mathbf {R}$, where $\mathbf {Q}^{{\mkern -1.5mu\mathsf {T}}} \mathbf {Q}= \mathbf {I}$ and $\mathbf {R}$ is upper triangular. (a) Tall, skinny matrix. The shaded parts are not computed in the economy-sized version, since they are not needed. (b) Short, wide matrix. \relax }}{259}{figure.caption.138}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {7.12}{\ignorespaces Solution of a set of $m$ linear equations in $n=2$ variables. (a) $m=1<n$ so the system is underdetermined. We show the minimal norm solution as a blue circle. (The dotted red line is orthogonal to the line, and its length is the distance to the origin.) (b) $m=n=2$, so there is a unique solution. (c) $m=3>n$, so in general there is no exact solution. We show the least squares solution. \relax }}{261}{figure.caption.139}%
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.1}{\ignorespaces (a) Illustration of local and global minimum in 1d. Generated by \href {https://probml.github.io/notebooks\#extrema\_fig\_1d.ipynb}{extrema\_fig\_1d.ipynb}. (b) Illustration of a saddle point in 2d. Generated by \href {https://probml.github.io/notebooks\#saddle.ipynb}{saddle.ipynb}. \relax }}{270}{figure.caption.143}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.2}{\ignorespaces Illustration of constrained maximization of a nonconvex 1d function. The area between the dotted vertical lines represents the feasible set. (a) There is a unique global maximum since the function is concave within the support of the feasible set. (b) There are two global maxima, both occurring at the boundary of the feasible set. (c) In the unconstrained case, this function has no global maximum, since it is unbounded. \relax }}{272}{figure.caption.144}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.3}{\ignorespaces Illustration of some convex and non-convex sets. \relax }}{272}{figure.caption.145}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.4}{\ignorespaces (a) Illustration of the epigraph of a function. (b) For a convex function $f(x)$, its epigraph can be represented as the intersection of half-spaces defined by linear lower bounds derived from the {\bf conjugate function} $f^*(\lambda ) = \max _x \lambda x - f(x)$. \relax }}{273}{figure.caption.146}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.5}{\ignorespaces (a) Illustration of a convex function. We see that the chord joining $(x, f(x))$ to $(y, f(y))$ lies above the function. (b) A function that is neither convex nor concave. {\bf A} is a local minimum, {\bf B} is a global minimum. \relax }}{273}{figure.caption.147}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.6}{\ignorespaces The quadratic form $f({\bm {x}}) = {\bm {x}}^{{\mkern -1.5mu\mathsf {T}}} \mathbf {A}{\bm {x}}$ in 2d. (a) $\mathbf {A}$ is positive definite, so $f$ is convex. (b) $\mathbf {A}$ is negative definite, so $f$ is concave. (c) $\mathbf {A}$ is positive semidefinite but singular, so $f$ is convex, but not strictly. Notice the valley of constant height in the middle. (d) $\mathbf {A}$ is indefinite, so $f$ is neither convex nor concave. The stationary point in the middle of the surface is a saddle point. From Figure 5 of \citep {Shewchuk1994}. \relax }}{274}{figure.caption.148}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.7}{\ignorespaces (a) Smooth 1d function. (b) Non-smooth 1d function. (There is a discontinuity at the origin.) Generated by \href {https://probml.github.io/notebooks\#smooth-vs-nonsmooth-1d.ipynb}{smooth-vs-nonsmooth-1d.ipynb}. \relax }}{275}{figure.caption.149}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.8}{\ignorespaces For a Lipschitz continuous function $f$, there exists a double cone (white) whose origin can be moved along the graph of $f$ so that the whole graph always stays outside the double cone. From \url {https://en.wikipedia.org/wiki/Lipschitz\_continuity}. Used with kind permission of Wikipedia author Taschee. \relax }}{276}{figure.caption.150}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.9}{\ignorespaces Illustration of subgradients. At ${\bm {x}}_1$, the convex function $f$ is differentiable, and ${\bm {g}}_1$ (which is the derivative of $f$ at ${\bm {x}}_1$) is the unique subgradient at ${\bm {x}}_1$. At the point ${\bm {x}}_2$, $f$ is not differentiable, because of the ``kink''. However, there are many subgradients at this point, of which two are shown. From \url {https://web.stanford.edu/class/ee364b/lectures/subgradients_slides.pdf}. Used with kind permission of Stephen Boyd. \relax }}{277}{figure.caption.151}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.10}{\ignorespaces The absolute value function (left) and its subdifferential (right). From \url {https://web.stanford.edu/class/ee364b/lectures/subgradients_slides.pdf}. Used with kind permission of Stephen Boyd. \relax }}{277}{figure.caption.152}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.11}{\ignorespaces Steepest descent on a simple convex function, starting from $(0,0)$, for 20 steps, using a fixed step size. The global minimum is at $(1,1)$. (a) $\eta =0.1$. (b) $\eta =0.6$. Generated by \href {https://probml.github.io/notebooks\#steepestDescentDemo.ipynb}{steepestDescentDemo.ipynb}. \relax }}{279}{figure.caption.153}% 
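The behaviour in Figure 8.11 can be imitated with a few lines of code; the quadratic objective below is an assumption chosen (with minimum at $(1,1)$) so that the small step size converges smoothly while the larger one zig-zags.
\begin{verbatim}
# Sketch of steepest descent with a fixed step size (assumed quadratic).
import numpy as np

def f(theta):                       # simple convex quadratic, minimum at (1, 1)
    return 0.5 * (theta[0] - 1)**2 + 1.5 * (theta[1] - 1)**2

def grad(theta):
    return np.array([theta[0] - 1, 3.0 * (theta[1] - 1)])

for eta in [0.1, 0.6]:              # two fixed step sizes, as in the figure
    theta = np.zeros(2)             # start from (0, 0)
    for _ in range(20):
        theta = theta - eta * grad(theta)
    print(f"eta={eta}: theta={theta}, f={f(theta):.4f}")
\end{verbatim}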
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.12}{\ignorespaces Illustration of the effect of condition number $\kappa $ on the convergence speed of steepest descent with exact line searches. (a) Large $\kappa $. (b) Small $\kappa $. Generated by \href {https://probml.github.io/notebooks\#lineSearchConditionNum.ipynb}{lineSearchConditionNum.ipynb}. \relax }}{281}{figure.caption.154}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.13}{\ignorespaces Illustration of the Nesterov update. Adapted from Figure 11.6 of \citep {Geron2019}. \relax }}{282}{figure.caption.155}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.14}{\ignorespaces Illustration of Newton's method for minimizing a 1d function. (a) The solid curve is the function $\mathcal {L}(x)$. The dotted line $\mathcal {L}_{\mathrm {quad}}(\theta )$ is its second-order approximation at $\theta _t$. The Newton step $d_t$ is what must be added to $\theta _t$ to get to the minimum of $\mathcal {L}_{\mathrm {quad}}(\theta )$. Adapted from Figure 13.4 of \citep {Vandenberghe06}. Generated by \href {https://probml.github.io/notebooks\#newtonsMethodMinQuad.ipynb}{newtonsMethodMinQuad.ipynb}. (b) Illustration of Newton's method applied to a nonconvex function. We fit a quadratic function around the current point $\theta _t$ and move to its stationary point, $\theta _{t+1} = \theta _t+ d_t$. Unfortunately, this takes us near a local maximum of $f$, not a minimum. This means we need to be careful about the extent of our quadratic approximation. Adapted from Figure 13.11 of \citep {Vandenberghe06}. Generated by \href {https://probml.github.io/notebooks\#newtonsMethodNonConvex.ipynb}{newtonsMethodNonConvex.ipynb}. \relax }}{284}{figure.caption.157}%
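Panel (a) of Figure 8.14 corresponds to repeatedly applying the 1d Newton update $\theta _{t+1} = \theta _t - \mathcal {L}'(\theta _t)/\mathcal {L}''(\theta _t)$; the sketch below uses an assumed smooth convex test function rather than the notebook's.
\begin{verbatim}
# Sketch of Newton's method for 1d minimization (assumed test function).
import numpy as np

def L(t):        return np.log(np.exp(t) + np.exp(-t))   # smooth convex, min at 0
def L_prime(t):  return np.tanh(t)
def L_second(t): return 1.0 / np.cosh(t)**2

theta = 1.0
for t in range(5):
    d = -L_prime(theta) / L_second(theta)   # step to the minimum of the local quadratic
    theta = theta + d
    print(f"iter {t}: theta={theta:.6f}, L={L(theta):.6f}")
\end{verbatim}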
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.15}{\ignorespaces Illustration of the trust region approach. The dashed lines represents contours of the original nonconvex objective. The circles represent successive quadratic approximations. From Figure 4.2 of \citep {Pascanu2014Thesis}. Used with kind permission of Razvan Pascanu. \relax }}{285}{figure.caption.158}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.16}{\ignorespaces Illustration of the LMS algorithm. Left: we start from ${\bm {\theta }}=(-0.5,2)$ and slowly converge to the least squares solution of $\cc@accent {"705E}{{\bm {\theta }}}=(1.45, 0.93)$ (red cross). Right: plot of objective function over time. Note that it does not decrease monotonically. Generated by \href {https://probml.github.io/notebooks\#lms\_demo.ipynb}{lms\_demo.ipynb}. \relax }}{288}{figure.caption.159}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.17}{\ignorespaces Training loss vs learning rate (horizontal axis) for a small MLP fit to FashionMNIST using vanilla SGD. (Raw loss in blue, EWMA-smoothed version in orange.) Generated by \href {https://probml.github.io/notebooks\#lrschedule\_tf.ipynb}{lrschedule\_tf.ipynb}. \relax }}{289}{figure.caption.160}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.18}{\ignorespaces Illustration of some common learning rate schedules. (a) Piecewise constant. (b) Exponential decay. (c) Polynomial decay. Generated by \href {https://probml.github.io/notebooks\#learning\_rate\_plot.ipynb}{learning\_rate\_plot.ipynb}. \relax }}{289}{figure.caption.161}% 
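The schedules in Figure 8.18 have simple closed forms; the sketch below uses hypothetical constants for the initial rate, decay rates and switch points.
\begin{verbatim}
# Sketch of common learning rate schedules (hypothetical constants).
import numpy as np

eta0 = 0.1   # initial learning rate

def piecewise_constant(t):
    return eta0 * (1.0 if t < 50 else 0.1 if t < 80 else 0.01)

def exponential_decay(t, gamma=0.05):
    return eta0 * np.exp(-gamma * t)

def polynomial_decay(t, beta=1.0, alpha=0.5):
    return eta0 * (beta * t + 1) ** (-alpha)

for t in [0, 25, 50, 75, 100]:
    print(t, piecewise_constant(t), exponential_decay(t), polynomial_decay(t))
\end{verbatim}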
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.19}{\ignorespaces (a) Linear warm-up followed by cosine cool-down. (b) Cyclical learning rate schedule. \relax }}{290}{figure.caption.162}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.20}{\ignorespaces Illustration of some constrained optimization problems. Red contours are the level sets of the objective function $\mathcal {L}({\bm {\theta }})$. The optimal constrained solution is the black dot. (a) The blue line is the equality constraint $h({\bm {\theta }})=0$. (b) The blue lines denote the inequality constraints $|\theta _1| + |\theta _2| \leq 1$. (Compare to \cref {fig:L2L1contours} (left).) \relax }}{296}{figure.caption.163}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.21}{\ignorespaces (a) A convex polytope in 2d defined by the intersection of linear constraints. (b) Depiction of the feasible set as well as the linear objective function. The red line is a level set of the objective, and the arrow indicates the direction in which it is improving. We see that the optimal solution lies at a vertex of the polytope. \relax }}{299}{figure.caption.164}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.22}{\ignorespaces Illustration of projected gradient descent. ${\bm {w}}$ is the current parameter estimate, ${\bm {w}}'$ is the update after a gradient step, and $P_{\mathcal {C}}({\bm {w}}')$ projects this onto the constraint set $\mathcal {C}$. From \url {https://bit.ly/3eJ3BhZ} Used with kind permission of Martin Jaggi. \relax }}{303}{figure.caption.165}% 
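Figure 8.22 shows a single step of projected gradient descent; the sketch below (assumed quadratic objective, box-shaped constraint set $\mathcal {C}$, fixed step size) alternates a gradient step with a Euclidean projection.
\begin{verbatim}
# Sketch of projected gradient descent onto a box constraint set.
# Assumptions: quadratic objective, C = [-1, 1]^2, fixed step size.
import numpy as np

def grad(w):                      # gradient of 0.5 * ||w - (2, 0.5)||^2
    return w - np.array([2.0, 0.5])

def project_onto_box(w, lo=-1.0, hi=1.0):
    return np.clip(w, lo, hi)     # Euclidean projection onto the box

w, eta = np.zeros(2), 0.5
for _ in range(20):
    w_prime = w - eta * grad(w)   # unconstrained gradient step
    w = project_onto_box(w_prime) # project back onto C
print(w)                          # -> approx [1.0, 0.5]
\end{verbatim}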
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.23}{\ignorespaces Illustration of a bound optimization algorithm. Adapted from Figure 9.14 of \citep {BishopBook}. Generated by \href {https://probml.github.io/notebooks\#emLogLikelihoodMax.ipynb}{emLogLikelihoodMax.ipynb}. \relax }}{306}{figure.caption.166}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.24}{\ignorespaces The quadratic lower bound of an MM algorithm (solid) and the quadratic approximation of Newton's method (dashed) superimposed on an empirical density estimate (dotted). The starting point of both algorithms is the circle. The square denotes the outcome of one MM update. The diamond denotes the outcome of one Newton update. (a) Newton's method overshoots the global maximum. (b) Newton's method results in a reduction of the objective. From Figure 4 of \citep {Fashing2005}. Used with kind permission of Carlo Tomasi. \relax }}{307}{figure.caption.167}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.25}{\ignorespaces Illustration of the EM for a GMM applied to the Old Faithful data. The degree of redness indicates the degree to which the point belongs to the red cluster, and similarly for blue; thus purple points have a roughly 50/50 split in their responsibilities to the two clusters. Adapted from \citep {BishopBook} Figure 9.8. Generated by \href {https://probml.github.io/notebooks\#mix\_gauss\_demo\_faithful.ipynb}{mix\_gauss\_demo\_faithful.ipynb}. \relax }}{311}{figure.caption.168}% 
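The kind of fit shown in Figure 8.25 can be approximated by running EM via scikit-learn's GaussianMixture; the sketch below uses synthetic 2d data in place of the Old Faithful dataset.
\begin{verbatim}
# Sketch of fitting a 2-component GMM with EM (assumes scikit-learn;
# synthetic data stands in for the Old Faithful dataset).
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
X = np.vstack([rng.normal([0, 0], 0.5, size=(100, 2)),
               rng.normal([3, 3], 0.5, size=(100, 2))])

gmm = GaussianMixture(n_components=2, covariance_type='full').fit(X)
resp = gmm.predict_proba(X)       # "responsibilities", as in the E step
print(gmm.means_)                 # cluster means after EM converges
print(resp[:5].round(2))          # soft assignments for the first 5 points
\end{verbatim}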
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.26}{\ignorespaces (a) Illustration of how singularities can arise in the likelihood function of GMMs. Here $K=2$, but the first mixture component is a narrow spike (with $\sigma _1 \approx 0$) centered on a single data point $x_1$. Adapted from Figure 9.7 of \citep {BishopBook}. Generated by \href {https://probml.github.io/notebooks\#mix\_gauss\_singularity.ipynb}{mix\_gauss\_singularity.ipynb}. (b) Illustration of the benefit of MAP estimation vs ML estimation when fitting a Gaussian mixture model. We plot the fraction of times (out of 5 random trials) each method encounters numerical problems vs the dimensionality of the problem, for $N=100$ samples. Solid red (upper curve): MLE. Dotted black (lower curve): MAP. Generated by \href {https://probml.github.io/notebooks\#mix\_gauss\_mle\_vs\_map.ipynb}{mix\_gauss\_mle\_vs\_map.ipynb}. \relax }}{312}{figure.caption.169}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {8.27}{\ignorespaces Left: $N=200$ data points sampled from a mixture of 2 Gaussians in 1d, with $\pi _k=0.5$, $\sigma _k=5$, $\mu _1=-10$ and $\mu _2=10$. Right: Likelihood surface $p({\mathcal {D}}|\mu _1,\mu _2)$, with all other parameters set to their true values. We see the two symmetric modes, reflecting the unidentifiability of the parameters. Generated by \href {https://probml.github.io/notebooks\#gmm\_lik\_surface\_plot.ipynb}{gmm\_lik\_surface\_plot.ipynb}. \relax }}{313}{figure.caption.170}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {9.1}{\ignorespaces (a) Some 2d data from 3 different classes. (b) Fitting 2d Gaussians to each class. Generated by \href {https://probml.github.io/notebooks\#discrim\_analysis\_dboundaries\_plot2.ipynb}{discrim\_analysis\_dboundaries\_plot2.ipynb}. \relax }}{318}{figure.caption.171}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {9.2}{\ignorespaces Gaussian discriminant analysis fit to data in \cref {fig:gda2d}. (a) Unconstrained covariances induce quadratic decision boundaries. (b) Tied covariances induce linear decision boundaries. Generated by \href {https://probml.github.io/notebooks\#discrim\_analysis\_dboundaries\_plot2.ipynb}{discrim\_analysis\_dboundaries\_plot2.ipynb}. \relax }}{318}{figure.caption.172}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {9.3}{\ignorespaces Geometry of LDA in the 2 class case where $\boldsymbol {\Sigma }_1=\boldsymbol {\Sigma }_2=\mathbf {I}$. \relax }}{319}{figure.caption.173}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {9.4}{\ignorespaces Linear discriminant analysis applied to a two-class dataset in 2d, representing (standardized) height and weight for male and female adults. (a) PCA direction. (b) FLDA direction. (c) Projection onto PCA direction shows poor class separation. (d) Projection onto FLDA direction shows good class separation. Generated by \href {https://probml.github.io/notebooks\#fisher\_lda\_demo.ipynb}{fisher\_lda\_demo.ipynb}. \relax }}{323}{figure.caption.174}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {9.5}{\ignorespaces (a) PCA projection of vowel data to 2d. (b) FLDA projection of vowel data to 2d. We see there is better class separation in the FLDA case. Adapted from Figure 4.11 of \citep {HastieBook}. Generated by \href {https://probml.github.io/notebooks\#fisher\_discrim\_vowel.ipynb}{fisher\_discrim\_vowel.ipynb}. \relax }}{325}{figure.caption.175}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {9.6}{\ignorespaces Visualization of the Bernoulli class conditional densities for a naive Bayes classifier fit to a binarized version of the MNIST dataset. Generated by \href {https://probml.github.io/notebooks\#naive\_bayes\_mnist\_jax.ipynb}{naive\_bayes\_mnist\_jax.ipynb}. \relax }}{327}{figure.caption.176}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {9.7}{\ignorespaces Visualization of the predictions made by the model in \cref {fig:nbc_mnist_centroids} when applied to some binarized MNIST test images. The title shows the most probable predicted class. Generated by \href {https://probml.github.io/notebooks\#naive\_bayes\_mnist\_jax.ipynb}{naive\_bayes\_mnist\_jax.ipynb}. \relax }}{327}{figure.caption.177}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {9.8}{\ignorespaces The class-conditional densities $p(x|y=c)$ (left) may be more complex than the class posteriors $p(y=c|x)$ (right). Adapted from Figure 1.27 of \citep {BishopBook}. Generated by \href {https://probml.github.io/notebooks\#generativeVsDiscrim.ipynb}{generativeVsDiscrim.ipynb}. \relax }}{330}{figure.caption.178}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.1}{\ignorespaces (a) Visualization of a 2d plane in a 3d space with surface normal ${\bm {w}}$ going through point ${\bm {x}}_0=(x_0,y_0,z_0)$. See text for details. (b) Visualization of optimal linear decision boundary induced by logistic regression on a 2-class, 2-feature version of the iris dataset. Generated by \href {https://probml.github.io/notebooks\#iris\_logreg.ipynb}{iris\_logreg.ipynb}. Adapted from Figure 4.24 of \citep {Geron2019}. \relax }}{334}{figure.caption.179}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.2}{\ignorespaces Plots of $\sigma (w_1 x_1 + w_2 x_2)$. Here ${\bm {w}}= (w_1,w_2)$ defines the normal to the decision boundary. Points to the right of this have $\sigma ({\bm {w}}^{\mkern -1.5mu\mathsf {T}}{\bm {x}})>0.5$, and points to the left have $\sigma ({\bm {w}}^{\mkern -1.5mu\mathsf {T}}{\bm {x}}) < 0.5$. Adapted from Figure 39.3 of \citep {MacKay03}. Generated by \href {https://probml.github.io/notebooks\#sigmoid\_2d\_plot.ipynb}{sigmoid\_2d\_plot.ipynb}. \relax }}{335}{figure.caption.180}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.3}{\ignorespaces Illustration of how we can transform a quadratic decision boundary into a linear one by transforming the features from ${\bm {x}}=(x_1,x_2)$ to $\boldsymbol {\phi }({\bm {x}})=(x_1^2,x_2^2)$. Used with kind permission of Jean-Philippe Vert. \relax }}{335}{figure.caption.181}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.4}{\ignorespaces Polynomial feature expansion applied to a two-class, two-dimensional logistic regression problem. (a) Degree $K=1$. (b) Degree $K=2$. (c) Degree $K=4$. (d) Train and test error vs degree. Generated by \href {https://probml.github.io/notebooks\#logreg\_poly\_demo.ipynb}{logreg\_poly\_demo.ipynb}. \relax }}{337}{figure.caption.182}% 
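The models in Figure 10.4 amount to a polynomial feature map composed with a linear logistic regression; a minimal scikit-learn sketch (synthetic two-moons data and illustrative settings, not the notebook's data) is:
\begin{verbatim}
# Sketch of logistic regression with polynomial feature expansion
# (assumes scikit-learn; data and degrees are illustrative).
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

X, y = make_moons(n_samples=200, noise=0.2, random_state=0)
for degree in [1, 2, 4]:
    model = make_pipeline(PolynomialFeatures(degree),
                          LogisticRegression(max_iter=1000))
    model.fit(X, y)
    print(degree, model.score(X, y))   # training accuracy vs degree
\end{verbatim}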
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.5}{\ignorespaces NLL loss surface for binary logistic regression applied to Iris dataset with 1 feature and 1 bias term. The goal is to minimize the function. Generated by \href {https://probml.github.io/notebooks\#iris\_logreg\_loss\_surface.ipynb}{iris\_logreg\_loss\_surface.ipynb}. \relax }}{339}{figure.caption.183}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.6}{\ignorespaces Weight decay with variance $C$ applied to the two-class, two-dimensional logistic regression problem with a degree 4 polynomial. (a) $C=1$. (b) $C=316$. (c) $C=100,000$. (d) Train and test error vs $C$. Generated by \href {https://probml.github.io/notebooks\#logreg\_poly\_demo.ipynb}{logreg\_poly\_demo.ipynb}. \relax }}{342}{figure.caption.185}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.7}{\ignorespaces Example of 3-class logistic regression with 2d inputs. (a) Original features. (b) Quadratic features. Generated by \href {https://probml.github.io/notebooks\#logreg\_multiclass\_demo.ipynb}{logreg\_multiclass\_demo.ipynb}. \relax }}{345}{figure.caption.186}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.8}{\ignorespaces A simple example of a label hierarchy. Nodes within the same ellipse have a mutual exclusion relationship between them. \relax }}{351}{figure.caption.187}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.9}{\ignorespaces A flat and hierarchical softmax model $p(w|C)$, where $C$ are the input features (context) and $w$ is the output label (word). Adapted from \url {https://www.quora.com/What-is-hierarchical-softmax}. \relax }}{352}{figure.caption.188}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.10}{\ignorespaces (a) Logistic regression on some data with outliers (denoted by x). Training points have been (vertically) jittered to avoid overlapping too much. The vertical line is the decision boundary, shown together with its posterior credible interval. (b) Same as (a) but using a robust model, with a mixture likelihood. Adapted from Figure 4.13 of \citep {Martin2018}. Generated by \href {https://probml.github.io/notebooks\#logreg\_iris\_bayes\_robust\_1d\_pymc3.ipynb}{logreg\_iris\_bayes\_robust\_1d\_pymc3.ipynb}. \relax }}{353}{figure.caption.189}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.11}{\ignorespaces (a) Illustration of logistic and tempered logistic loss with $t_1=0.8$. (b) Illustration of sigmoid and tempered sigmoid transfer function with $t_2=2.0$. From \url {https://ai.googleblog.com/2019/08/bi-tempered-logistic-loss-for-training.html}. Used with kind permission of Ehsan Amid. \relax }}{355}{figure.caption.190}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.12}{\ignorespaces Illustration of standard and bi-tempered logistic regression on data with label noise. From \url {https://ai.googleblog.com/2019/08/bi-tempered-logistic-loss-for-training.html}. Used with kind permission of Ehsan Amid. \relax }}{356}{figure.caption.192}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.13}{\ignorespaces (a) Illustration of the data. (b) Log-likelihood for a logistic regression model. The line is drawn from the origin in the direction of the MLE (which is at infinity). The numbers correspond to 4 points in parameter space, corresponding to the lines in (a). (c) Unnormalized log posterior (assuming vague spherical prior). (d) Laplace approximation to posterior. Adapted from a figure by Mark Girolami. Generated by \href {https://probml.github.io/notebooks\#logreg\_laplace\_demo.ipynb}{logreg\_laplace\_demo.ipynb}. \relax }}{357}{figure.caption.193}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.14}{\ignorespaces Posterior predictive distribution for a logistic regression model in 2d. (a) Contours of $p(y=1|{\bm {x}},\cc@accent {"705E}{{\bm {w}}}_{map})$. (b) Samples from the posterior predictive distribution. (c) Averaging over these samples. (d) Moderated output (probit approximation). Adapted from a figure by Mark Girolami. Generated by \href {https://probml.github.io/notebooks\#logreg\_laplace\_demo.ipynb}{logreg\_laplace\_demo.ipynb}. \relax }}{358}{figure.caption.194}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {10.15}{\ignorespaces (a) Data for logistic regression question. (b) Plot of $\cc@accent {"705E}{w}_k$ vs amount of correlation $c_k$ for three different estimators. \relax }}{362}{figure.caption.195}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.1}{\ignorespaces Polynomials of degrees 1 and 2 fit to 21 datapoints. Generated by \href {https://probml.github.io/notebooks\#linreg\_poly\_vs\_degree.ipynb}{linreg\_poly\_vs\_degree.ipynb}. \relax }}{364}{figure.caption.196}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.2}{\ignorespaces (a) Contours of the RSS error surface for the example in \cref {fig:linregPolyDegree1}. The blue cross represents the MLE. (b) Corresponding surface plot. Generated by \href {https://probml.github.io/notebooks\#linreg\_contours\_sse\_plot.ipynb}{linreg\_contours\_sse\_plot.ipynb}. \relax }}{366}{figure.caption.197}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.3}{\ignorespaces Graphical interpretation of least squares for $m=3$ equations and $n=2$ unknowns when solving the system $\mathbf {A}{\bm {x}}= {\bm {b}}$. ${\bm {a}}_1$ and ${\bm {a}}_2$ are the columns of $\mathbf {A}$, which define a 2d linear subspace embedded in $\mathbb {R}^3$. The target vector ${\bm {b}}$ is a vector in $\mathbb {R}^3$; its orthogonal projection onto the linear subspace is denoted $\cc@accent {"705E}{{\bm {b}}}$. The line from ${\bm {b}}$ to $\cc@accent {"705E}{{\bm {b}}}$ is the vector of residual errors, whose norm we want to minimize. \relax }}{366}{figure.caption.198}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.4}{\ignorespaces Regression coefficients over time for the 1d model in \cref {fig:linregPoly2}(a). Generated by \href {https://probml.github.io/notebooks\#linregOnlineDemo.ipynb}{linregOnlineDemo.ipynb}. \relax }}{371}{figure.caption.199}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.5}{\ignorespaces Residual plot for polynomial regression of degree 1 and 2 for the functions in \cref {fig:linregPoly2}(a-b). Generated by \href {https://probml.github.io/notebooks\#linreg\_poly\_vs\_degree.ipynb}{linreg\_poly\_vs\_degree.ipynb}. \relax }}{372}{figure.caption.200}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.6}{\ignorespaces Fit vs actual plots for polynomial regression of degree 1 and 2 for the functions in \cref {fig:linregPoly2}(a-b). Generated by \href {https://probml.github.io/notebooks\#linreg\_poly\_vs\_degree.ipynb}{linreg\_poly\_vs\_degree.ipynb}. \relax }}{372}{figure.caption.201}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.7}{\ignorespaces Geometry of ridge regression. The likelihood is shown as an ellipse, and the prior is shown as a circle centered on the origin. Adapted from Figure 3.15 of \citep {BishopBook}. Generated by \href {https://probml.github.io/notebooks\#geom\_ridge.ipynb}{geom\_ridge.ipynb}. \relax }}{376}{figure.caption.202}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.8}{\ignorespaces Illustration of $\ell _1$ (left) vs $\ell _2$ (right) regularization of a least squares problem. Adapted from Figure 3.12 of \citep {Hastie01}. \relax }}{378}{figure.caption.203}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.9}{\ignorespaces Left: soft thresholding. Right: hard thresholding. In both cases, the horizontal axis is the residual error incurred by making predictions using all the coefficients except for $w_k$, and the vertical axis is the estimated coefficient $\cc@accent {"705E}{w}_k$ that minimizes this penalized residual. The flat region in the middle is the interval $[-\lambda ,+\lambda ]$. \relax }}{380}{figure.caption.204}% 
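The soft-thresholding operator plotted in Figure 11.9 has the closed form $\mathrm {sign}(c)\max (|c|-\lambda ,0)$; the sketch below (an added illustration) contrasts it with hard thresholding on a grid of inputs.
\begin{verbatim}
# Sketch of soft vs. hard thresholding (assumes NumPy).
import numpy as np

def soft_threshold(c, lam):
    # Exactly zero on the interval [-lam, +lam], shrunk towards zero outside it.
    return np.sign(c) * np.maximum(np.abs(c) - lam, 0.0)

def hard_threshold(c, lam):
    # Unchanged outside [-lam, +lam], zero inside it.
    return np.where(np.abs(c) > lam, c, 0.0)

c = np.linspace(-3, 3, 7)
print(soft_threshold(c, 1.0))
print(hard_threshold(c, 1.0))
\end{verbatim}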
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.10}{\ignorespaces (a) Profiles of ridge coefficients for the prostate cancer example vs bound $B$ on $\ell _2$ norm of ${\bm {w}}$, so small $B$ (large $\lambda $) is on the left. The vertical line is the value chosen by 5-fold CV using the 1 standard error rule. Adapted from Figure 3.8 of \citep {HastieBook}. Generated by \href {https://probml.github.io/notebooks\#ridgePathProstate.ipynb}{ridgePathProstate.ipynb}. (b) Same as (a) but using $\ell _1$ norm of ${\bm {w}}$. The x-axis shows the critical values of $\lambda =1/B$, where the regularization path is discontinuous. Adapted from Figure 3.10 of \citep {HastieBook}. Generated by \href {https://probml.github.io/notebooks\#lassoPathProstate.ipynb}{lassoPathProstate.ipynb}. \relax }}{381}{figure.caption.205}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.11}{\ignorespaces Results of different methods on the prostate cancer data, which has 8 features and 67 training cases. Methods are: OLS = ordinary least squares, Subset = best subset regression, Ridge, Lasso. Rows represent the coefficients; we see that subset regression and lasso give sparse solutions. Bottom row is the mean squared error on the test set (30 cases). Adapted from Table 3.3 of \citep {HastieBook}. Generated by \href {https://probml.github.io/notebooks\#prostate\_comparison.ipynb}{prostate\_comparison.ipynb}. \relax }}{383}{figure.caption.207}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.12}{\ignorespaces Boxplot displaying (absolute value of) prediction errors on the prostate cancer test set for different regression methods. Generated by \href {https://probml.github.io/notebooks\#prostate\_comparison.ipynb}{prostate\_comparison.ipynb}. \relax }}{384}{figure.caption.208}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.13}{\ignorespaces Example of recovering a sparse signal using lasso. See text for details. Adapted from Figure 1 of \citep {Figueiredo07}. Generated by \href {https://probml.github.io/notebooks\#sparse\_sensing\_demo.ipynb}{sparse\_sensing\_demo.ipynb}. \relax }}{384}{figure.caption.209}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.14}{\ignorespaces Illustration of group lasso where the original signal is piecewise Gaussian. (a) Original signal. (b) Vanilla lasso estimate. (c) Group lasso estimate using an $\ell _2$ norm on the blocks. (d) Group lasso estimate using an $\ell _{\infty }$ norm on the blocks. Adapted from Figures 3-4 of \citep {Wright09}. Generated by \href {https://probml.github.io/notebooks\#groupLassoDemo.ipynb}{groupLassoDemo.ipynb}. \relax }}{387}{figure.caption.210}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.15}{\ignorespaces Same as \cref {fig:groupLassoGauss}, except the original signal is piecewise constant. Generated by \href {https://probml.github.io/notebooks\#groupLassoDemo.ipynb}{groupLassoDemo.ipynb}. \relax }}{388}{figure.caption.211}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.16}{\ignorespaces Illustration of B-splines of degree 0, 1 and 3. Top row: unweighted basis functions. Dots mark the locations of the 3 internal knots at $[0.25, 0.5, 0.75]$. Bottom row: weighted combination of basis functions using random weights. Generated by \href {https://probml.github.io/notebooks\#splines\_basis\_weighted.ipynb}{splines\_basis\_weighted.ipynb}. Adapted from Figure 5.4 of \citep {BMC}. Used with kind permission of Osvaldo Martin. \relax }}{392}{figure.caption.213}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.17}{\ignorespaces Design matrix for B-splines of degree (a) 0, (b) 1 and (c) 3. We evaluate the splines on 20 inputs ranging from 0 to 1. Generated by \href {https://probml.github.io/notebooks\#splines\_basis\_heatmap.ipynb}{splines\_basis\_heatmap.ipynb}. Adapted from Figure 5.6 of \citep {BMC}. Used with kind permission of Osvaldo Martin. \relax }}{392}{figure.caption.214}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.18}{\ignorespaces Fitting a cubic spline regression model with 15 knots to a 1d dataset. Generated by \href {https://probml.github.io/notebooks\#splines\_cherry\_blossoms.ipynb}{splines\_cherry\_blossoms.ipynb}. Adapted from Figure 5.3 of \citep {rethinking2}. \relax }}{393}{figure.caption.215}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.19}{\ignorespaces (a) Illustration of robust linear regression. Generated by \href {https://probml.github.io/notebooks\#linregRobustDemoCombined.ipynb}{linregRobustDemoCombined.ipynb}. (b) Illustration of $\ell _2$, $\ell _1$, and Huber loss functions with $\delta =1.5$. Generated by \href {https://probml.github.io/notebooks\#huberLossPlot.ipynb}{huberLossPlot.ipynb}. \relax }}{394}{figure.caption.216}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.20}{\ignorespaces Sequential Bayesian inference of the parameters of a linear regression model $p(y|{\bm {x}}) = \mathcal {N}(y | w_0 + w_1 x_1, \sigma ^2)$. Left column: likelihood function for current data point. Middle column: posterior given first $N$ data points, $p(w_0,w_1|{\bm {x}}_{1:N},y_{1:N},\sigma ^2)$. Right column: samples from the current posterior predictive distribution. Row 1: prior distribution ($N=0$). Row 2: after 1 data point. Row 3: after 2 data points. Row 4: after 100 data points. The white cross in columns 1 and 2 represents the true parameter value; we see that the mode of the posterior rapidly converges to this point. The blue circles in column 3 are the observed data points. Adapted from Figure 3.7 of \citep {BishopBook}. Generated by \href {https://probml.github.io/notebooks\#linreg\_2d\_bayes\_demo.ipynb}{linreg\_2d\_bayes\_demo.ipynb}. \relax }}{398}{figure.caption.218}% 
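The sequential updates in Figure 11.20 follow the conjugate Gaussian formulas for linear regression with known noise variance; the sketch below (assumed prior scale, noise level and true weights) processes one observation at a time.
\begin{verbatim}
# Sketch of sequential (one-point-at-a-time) Bayesian updating for
# p(y|x) = N(w0 + w1*x, sigma^2) with a Gaussian prior on w.
# Prior scale, noise variance, and true weights are assumptions.
import numpy as np

sigma2 = 0.25                              # observation noise variance
m = np.zeros(2)                            # prior mean of (w0, w1)
S = 2.0 * np.eye(2)                        # prior covariance

rng = np.random.default_rng(0)
w_true = np.array([-0.3, 0.5])
for n in range(100):
    x = rng.uniform(-1, 1)
    y = w_true[0] + w_true[1] * x + rng.normal(0.0, np.sqrt(sigma2))
    phi = np.array([1.0, x])               # features for this data point
    S_inv = np.linalg.inv(S)               # previous posterior becomes the prior
    S = np.linalg.inv(S_inv + np.outer(phi, phi) / sigma2)
    m = S @ (S_inv @ m + phi * y / sigma2)

print(m)   # posterior mean; approaches w_true as N grows
\end{verbatim}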
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.21}{\ignorespaces (a) Plugin approximation to predictive density (we plug in the MLE of the parameters) when fitting a second degree polynomial to some 1d data. (b) Posterior predictive density, obtained by integrating out the parameters. Black curve is posterior mean, error bars are 2 standard deviations of the posterior predictive density. (c) 10 samples from the plugin approximation to posterior predictive distribution. (d) 10 samples from the true posterior predictive distribution. Generated by \href {https://probml.github.io/notebooks\#linreg\_post\_pred\_plot.ipynb}{linreg\_post\_pred\_plot.ipynb}. \relax }}{400}{figure.caption.219}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.22}{\ignorespaces Posterior samples of $p(w_0,w_1|{\mathcal {D}})$ for 1d linear regression model $p(y|x,{\bm {\theta }})=\mathcal {N}(y|w_0 + w_1 x, \sigma ^2)$ with a Gaussian prior. (a) Original data. (b) Centered data. Generated by \href {https://probml.github.io/notebooks\#linreg\_2d\_bayes\_centering\_pymc3.ipynb}{linreg\_2d\_bayes\_centering\_pymc3.ipynb}. \relax }}{400}{figure.caption.220}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.23}{\ignorespaces Posterior marginals for the parameters in the multi-leg example. Generated by \href {https://probml.github.io/notebooks\#multi\_collinear\_legs\_numpyro.ipynb}{multi\_collinear\_legs\_numpyro.ipynb}. \relax }}{401}{figure.caption.221}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {11.24}{\ignorespaces Posteriors for the multi-leg example. (a) Joint posterior $p(\beta _l,\beta _r|{\mathcal {D}})$. (b) Posterior $p(\beta _l + \beta _r | {\mathcal {D}})$. Generated by \href {https://probml.github.io/notebooks\#multi\_collinear\_legs\_numpyro.ipynb}{multi\_collinear\_legs\_numpyro.ipynb}. \relax }}{401}{figure.caption.222}%
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {12.1}{\ignorespaces Predictions of insurance claim rates on the test set. (a) Data. (b) Constant predictor. (c) Linear regression. (d) Poisson regression. Generated by \href {https://probml.github.io/notebooks\#poisson\_regression\_insurance.ipynb}{poisson\_regression\_insurance.ipynb}. \relax }}{409}{figure.caption.223}% 
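A rough analogue of the Poisson-regression panel in Figure 12.1 can be built with scikit-learn's PoissonRegressor (available in recent versions); the data below is synthetic, not the insurance dataset.
\begin{verbatim}
# Sketch of Poisson regression on synthetic count data
# (assumes scikit-learn >= 0.23 for PoissonRegressor).
import numpy as np
from sklearn.linear_model import LinearRegression, PoissonRegressor

rng = np.random.default_rng(0)
X = rng.uniform(0, 2, size=(500, 1))
rate = np.exp(0.3 + 1.2 * X[:, 0])           # true log-linear rate
y = rng.poisson(rate)                        # count outcomes (e.g., claim counts)

lin = LinearRegression().fit(X, y)           # ordinary linear regression baseline
pois = PoissonRegressor(alpha=0.0).fit(X, y) # GLM with log link, so rates stay positive
print(lin.predict([[0.0]]), pois.predict([[0.0]]))
\end{verbatim}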
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {12.2}{\ignorespaces Calibration plot for insurance claims prediction. Generated by \href {https://probml.github.io/notebooks\#poisson\_regression\_insurance.ipynb}{poisson\_regression\_insurance.ipynb}. \relax }}{411}{figure.caption.225}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.1}{\ignorespaces (a) Illustration of the fact that the XOR function is not linearly separable, but can be separated by the two layer model using Heaviside activation functions. Adapted from Figure 10.6 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#xor\_heaviside.ipynb}{xor\_heaviside.ipynb}. (b) A neural net with one hidden layer, whose weights have been manually constructed to implement the XOR function. $h_1$ is the AND function and $h_2$ is the OR function. The bias terms are implemented using weights from constant nodes with the value 1. \relax }}{417}{figure.caption.227}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.2}{\ignorespaces (a) Illustration of how the sigmoid function is linear for inputs near 0, but saturates for large positive and negative inputs. Adapted from Figure 11.1 of \citep {Geron2019}. (b) Plots of some neural network activation functions. Generated by \href {https://probml.github.io/notebooks\#activation\_fun\_plot.ipynb}{activation\_fun\_plot.ipynb}. \relax }}{419}{figure.caption.228}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.3}{\ignorespaces An MLP with 2 hidden layers applied to a set of 2d points from 2 classes, shown in the top left corner. The visualizations associated with each hidden unit show the decision boundary at that part of the network. The final output is shown on the right. The input is ${\bm {x}}\in \mathbb {R}^2$, the first layer activations are ${\bm {z}}_1 \in \mathbb {R}^4$, the second layer activations are ${\bm {z}}_2 \in \mathbb {R}^2$, and the final logit is $a_3 \in \mathbb {R}$, which is converted to a probability using the sigmoid function. This is a screenshot from the interactive demo at \url {http://playground.tensorflow.org}. \relax }}{420}{figure.caption.229}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.4}{\ignorespaces Results of applying an MLP (with 2 hidden layers with 128 units and 1 output layer with 10 units) to some MNIST images (cherry picked to include some errors). Red is incorrect, blue is correct. (a) After 1 epoch of training. (b) After 2 epochs. Generated by \href {https://probml.github.io/notebooks\#mlp\_mnist\_tf.ipynb}{mlp\_mnist\_tf.ipynb}. \relax }}{422}{figure.caption.231}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.5}{\ignorespaces Illustration of an MLP with a shared ``backbone'' and two output ``heads'', one for predicting the mean and one for predicting the variance. From \url {https://brendanhasz.github.io/2019/07/23/bayesian-density-net.html}. Used with kind permission of Brendan Hasz. \relax }}{423}{figure.caption.233}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.6}{\ignorespaces Illustration of predictions from an MLP fit using MLE to a 1d regression dataset with growing noise. (a) Output variance is input-dependent, as in \cref {fig:twoHeaded}. (b) Mean is computed using same model as in (a), but output variance is treated as a fixed parameter $\sigma ^2$, which is estimated by MLE after training, as in \cref {sec:linregSigmaMLE}. Generated by \href {https://probml.github.io/notebooks\#mlp\_1d\_regression\_hetero\_tfp.ipynb}{mlp\_1d\_regression\_hetero\_tfp.ipynb}. \relax }}{424}{figure.caption.234}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.7}{\ignorespaces A decomposition of $\mathbb {R}^2$ into a finite set of linear decision regions produced by an MLP with \ensuremath {\mathrm {ReLU}}\xspace activations with (a) one hidden layer of 25 hidden units and (b) two hidden layers. From Figure 1 of \citep {Hein2019}. Used with kind permission of Maksym Andriuschenko. \relax }}{424}{figure.caption.235}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.8}{\ignorespaces Illustration of two neurons connected together in a ``circuit''. The output axon of the left neuron makes a synaptic connection with the dendrites of the cell on the right. Electrical charges, in the form of ion flows, allow the cells to communicate. From \url {https://en.wikipedia.org/wiki/Neuron}. Used with kind permission of Wikipedia author BruceBlaus. \relax }}{426}{figure.caption.236}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.9}{\ignorespaces Plot of neural network sizes over time. Models 1, 2, 3 and 4 correspond to the perceptron \citep {Rosenblatt58}, the adaptive linear unit \citep {Widrow1960}, the neocognitron \citep {Fukushima1980}, and the first MLP trained by backprop \citep {Rumelhart86}. Approximate numbers of neurons for some living organisms are shown on the right scale (the sponge has 0 neurons), based on \url {https://en.wikipedia.org/wiki/List_of_animals_by_number_of_neurons}. From Figure 1.11 of \citep {GoodfellowBook}. Used with kind permission of Ian Goodfellow. \relax }}{427}{figure.caption.237}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.10}{\ignorespaces A simple linear-chain feedforward model with 4 layers. Here ${\bm {x}}$ is the input and ${\bm {o}}$ is the output. From \citep {Blondel2020}. \relax }}{429}{figure.caption.238}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.11}{\ignorespaces An example of a computation graph with 2 (scalar) inputs and 1 (scalar) output. From \citep {Blondel2020}. \relax }}{435}{figure.caption.242}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.12}{\ignorespaces Notation for automatic differentiation at node $j$ in a computation graph. From \citep {Blondel2020}. \relax }}{436}{figure.caption.243}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.13}{\ignorespaces Computation graph for an MLP with input ${\bm {x}}$, hidden layer ${\bm {h}}$, output ${\bm {o}}$, loss function $L=\ell ({\bm {o}},y)$, an $\ell _2$ regularizer $s$ on the weights, and total loss $J=L+s$. From Figure 4.7.1 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{436}{figure.caption.244}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.14}{\ignorespaces (a) Some popular activation functions. (b) Plot of their gradients. Generated by \href {https://probml.github.io/notebooks\#activation\_fun\_deriv\_jax.ipynb}{activation\_fun\_deriv\_jax.ipynb}. \relax }}{438}{figure.caption.246}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.15}{\ignorespaces (a) Illustration of a residual block. (b) Illustration of why adding residual connections can help when training a very deep model. Adapted from Figure 14.16 of \citep {Geron2019}. \relax }}{441}{figure.caption.247}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.16}{\ignorespaces Calculation of minibatch stochastic gradient using data parallelism and two GPUs. From Figure 12.5.2 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{444}{figure.caption.248}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.17}{\ignorespaces (a) A deep but sparse neural network. The connections are pruned using $\ell _1$ regularization. At each level, nodes numbered 0 are clamped to 1, so their outgoing weights correspond to the offset/bias terms. (b) Predictions made by the model on the training set. Generated by \href {https://probml.github.io/notebooks\#sparse\_mlp.ipynb}{sparse\_mlp.ipynb}. \relax }}{445}{figure.caption.249}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.18}{\ignorespaces Illustration of dropout. (a) A standard neural net with 2 hidden layers. (b) An example of a thinned net produced by applying dropout with $p_0=0.5$. Units that have been dropped out are marked with an x. From Figure 1 of \citep {Srivastava2014}. Used with kind permission of Geoff Hinton. \relax }}{446}{figure.caption.250}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.19}{\ignorespaces Flat vs sharp minima. From Figures 1 and 2 of \citep {Hochreiter1997}. Used with kind permission of Jürgen Schmidhuber. \relax }}{447}{figure.caption.251}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.20}{\ignorespaces Each curve shows how the loss varies across parameter values for a given minibatch. (a) A stable local minimum. (b) An unstable local minimum. Generated by \href {https://probml.github.io/notebooks\#sgd\_minima\_variance.ipynb}{sgd\_minima\_variance.ipynb}. Adapted from \url {https://bit.ly/3wTc1L6}. \relax }}{448}{figure.caption.252}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.21}{\ignorespaces (a) xor truth table. (b) Fitting a linear logistic regression classifier using degree 10 polynomial expansion. (c) Same model, but using an RBF kernel with centroids specified by the 4 black crosses. Generated by \href {https://probml.github.io/notebooks\#logregXorDemo.ipynb}{logregXorDemo.ipynb}. \relax }}{450}{figure.caption.253}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.22}{\ignorespaces Linear regression using 10 equally spaced RBF basis functions in 1d. Left column: fitted function. Middle column: basis functions evaluated on a grid. Right column: design matrix. Top to bottom we show different bandwidths for the kernel function: $\sigma =0.5, 10, 50$. Generated by \href {https://probml.github.io/notebooks\#linregRbfDemo.ipynb}{linregRbfDemo.ipynb}. \relax }}{451}{figure.caption.254}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.23}{\ignorespaces (a) Some data from a one-to-many function. Horizontal axis is the input $x$, vertical axis is the target $y=f(x)$. (b) The responsibilities of each expert for the input domain. (c) Prediction of each expert (colored lines) superimposed on the training data. (d) Overall prediction. Mean is red cross, mode is black square. Adapted from Figures 5.20 and 5.21 of \citep {BishopBook}. Generated by \href {https://probml.github.io/notebooks\#mixexpDemoOneToMany.ipynb}{mixexpDemoOneToMany.ipynb}. \relax }}{452}{figure.caption.255}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.24}{\ignorespaces Deep MOE with $m$ experts, represented as a neural network. From Figure 1 of \citep {Chazan2017}. Used with kind permission of Jacob Goldberger. \relax }}{453}{figure.caption.256}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {13.25}{\ignorespaces A 2-level hierarchical mixture of experts as a neural network. The top gating network chooses between the left and right expert, shown by the large boxes; the left and right experts themselves choose between their left and right sub-experts. \relax }}{454}{figure.caption.257}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.1}{\ignorespaces Detecting patterns in 2d images using unstructured MLPs does not work well, because the method is not translation invariant. We can design a weight vector to act as a {\bf matched filter} for detecting the desired cross-shape. This will give a strong response of 5 if the object is on the left, but a weak response of 1 if the object is shifted over to the right. Adapted from Figure 7.16 of \citep {Stevens2020}. \relax }}{458}{figure.caption.258}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.2}{\ignorespaces We can classify a digit by looking for certain discriminative features (image templates) occurring in the correct (relative) locations. From Figure 5.1 of \citep {kerasBook}. Used with kind permission of Francois Chollet. \relax }}{458}{figure.caption.259}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.3}{\ignorespaces Discrete convolution of ${\bm {x}}=[1,2,3,4]$ with ${\bm {w}}=[5,6,7]$ to yield ${\bm {z}}=[5,16,34,52,45,28]$. We see that this operation consists of ``flipping'' ${\bm {w}}$ and then ``dragging'' it over ${\bm {x}}$, multiplying elementwise, and adding up the results. \relax }}{459}{figure.caption.260}% 
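The numbers in Figure 14.3 can be checked directly with NumPy's full (``flip and drag'') convolution:
\begin{verbatim}
# Checking the discrete convolution from the figure (assumes NumPy).
import numpy as np

x = np.array([1, 2, 3, 4])
w = np.array([5, 6, 7])
z = np.convolve(x, w)        # "full" convolution: flip w, drag it over x
print(z)                     # -> [ 5 16 34 52 45 28]
\end{verbatim}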
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.4}{\ignorespaces 1d cross correlation. From Figure 15.3.2 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{459}{figure.caption.261}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.5}{\ignorespaces Illustration of 2d cross correlation. Generated by \href {https://probml.github.io/notebooks\#conv2d\_jax.ipynb}{conv2d\_jax.ipynb}. Adapted from Figure 6.2.1 of \citep {dive}. \relax }}{460}{figure.caption.262}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.6}{\ignorespaces Convolving a 2d image (left) with a $3 \times 3$ filter (middle) produces a 2d response map (right). The bright spots of the response map correspond to locations in the image which contain diagonal lines sloping down and to the right. From Figure 5.3 of \citep {kerasBook}. Used with kind permission of Francois Chollet. \relax }}{460}{figure.caption.263}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.7}{\ignorespaces Same-convolution (using zero-padding) ensures the output is the same size as the input. Adapted from Figure 8.3 of \citep {Stevens2020}. \relax }}{462}{figure.caption.264}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.8}{\ignorespaces Illustration of padding and strides in 2d convolution. (a) We apply ``same convolution'' to a $5 \times 7$ input (with zero padding) using a $3 \times 3$ filter to create a $5 \times 7$ output. (b) Now we use a stride of 2, so the output has size $3 \times 4$. Adapted from Figures 14.3--14.4 of \citep {Geron2019}. \relax }}{462}{figure.caption.265}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.9}{\ignorespaces Illustration of 2d convolution applied to an input with 2 channels. Generated by \href {https://probml.github.io/notebooks\#conv2d\_jax.ipynb}{conv2d\_jax.ipynb}. Adapted from Figure 6.4.1 of \citep {dive}. \relax }}{463}{figure.caption.266}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.10}{\ignorespaces Illustration of a CNN with 2 convolutional layers. The input has 3 color channels. The feature maps at internal layers have multiple channels. The cylinders correspond to hypercolumns, which are feature vectors at a certain location. Adapted from Figure 14.6 of \citep {Geron2019}. \relax }}{464}{figure.caption.267}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.11}{\ignorespaces Mapping 3 channels to 2 using convolution with a filter of size $1 \times 1 \times 3 \times 2$. Adapted from Figure 6.4.2 of \citep {dive}. \relax }}{465}{figure.caption.268}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.12}{\ignorespaces Illustration of maxpooling with a 2x2 filter and a stride of 1. Adapted from Figure 6.5.1 of \citep {dive}. \relax }}{465}{figure.caption.269}% 
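Max pooling with a $2\times 2$ window and stride 1, as in Figure 14.12, maps an $H \times W$ input to an $(H-1) \times (W-1)$ output; a direct NumPy sketch (not the book's code) is:
\begin{verbatim}
# Sketch of 2x2 max pooling with stride 1 (assumes NumPy).
import numpy as np

def maxpool2x2_stride1(X):
    H, W = X.shape
    out = np.empty((H - 1, W - 1))
    for i in range(H - 1):
        for j in range(W - 1):
            out[i, j] = X[i:i + 2, j:j + 2].max()   # max over each 2x2 window
    return out

X = np.arange(9).reshape(3, 3)        # [[0,1,2],[3,4,5],[6,7,8]]
print(maxpool2x2_stride1(X))          # -> [[4. 5.] [7. 8.]]
\end{verbatim}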
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.13}{\ignorespaces A simple CNN for classifying images. Adapted from \url {https://blog.floydhub.com/building-your-first-convnet/}. \relax }}{466}{figure.caption.270}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.14}{\ignorespaces Illustration of different activation normalization methods for a CNN. Each subplot shows a feature map tensor, with N as the batch axis, C as the channel axis, and (H, W) as the spatial axes. The pixels in blue are normalized by the same mean and variance, computed by aggregating the values of these pixels. Left to right: batch norm, layer norm, instance norm, and group norm (with 2 groups of 3 channels). From Figure 2 of \citep {Wu2018GN}. Used with kind permission of Kaiming He. \relax }}{468}{figure.caption.271}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.15}{\ignorespaces LeNet5, a convolutional neural net for classifying handwritten digits. From Figure 6.6.1 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{470}{figure.caption.272}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.16}{\ignorespaces (a) LeNet5. We assume the input has size $1 \times 28 \times 28$, as is the case for MNIST. From Figure 6.6.2 of \citep {dive}. Used with kind permission of Aston Zhang. (b) AlexNet. We assume the input has size $3 \times 224 \times 224$, as is the case for (cropped and rescaled) images from ImageNet. From Figure 7.1.2 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{470}{figure.caption.273}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.17}{\ignorespaces Results of applying a CNN to some MNIST images (cherry picked to include some errors). Red is incorrect, blue is correct. (a) After 1 epoch of training. (b) After 2 epochs. Generated by \href {https://probml.github.io/notebooks\#cnn\_mnist\_tf.ipynb}{cnn\_mnist\_tf.ipynb}. \relax }}{471}{figure.caption.274}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.18}{\ignorespaces Inception module. The $1 \times 1$ convolutional layers reduce the number of channels, keeping the spatial dimensions the same. The parallel pathways through convolutions of different sizes allow the model to learn which filter size to use for each layer. The final depth concatenation block combines the outputs of all the different pathways (which all have the same spatial size). From Figure 7.4.1 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{472}{figure.caption.275}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.19}{\ignorespaces GoogLeNet (slightly simplified from the original). Input is on the left. From Figure 7.4.2 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{472}{figure.caption.276}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.20}{\ignorespaces A residual block for a CNN. Left: standard version. Right: version with 1x1 convolution, to allow a change in the number of channels between the input to the block and the output. From Figure 7.6.3 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{473}{figure.caption.277}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.21}{\ignorespaces The ResNet-18 architecture. Each dotted module is a residual block shown in \cref {fig:resnetBlock}. From Figure 7.6.4 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{473}{figure.caption.278}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.22}{\ignorespaces (a) Left: a residual block adds the output to the input. Right: a densenet block concatenates the output with the input. (b) Illustration of a densenet. From Figures 7.7.1--7.7.2 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{475}{figure.caption.279}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.23}{\ignorespaces Dilated convolution with a 3x3 filter using rate 1, 2 and 3. From Figure 1 of \citep {Cui2019cnn}. Used with kind permission of Ximin Cui. \relax }}{476}{figure.caption.280}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.24}{\ignorespaces Transposed convolution with 2x2 kernel. From Figure 13.10.1 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{477}{figure.caption.281}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.25}{\ignorespaces Convolution, deconvolution and transposed convolution. Here $s$ is the stride and $p$ is the padding. From \url {https://tinyurl.com/ynxcxsut}. Used with kind permission of Aqeel Anwar. \relax }}{477}{figure.caption.282}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.26}{\ignorespaces Depthwise separable convolutions: each of the $C$ input channels undergoes a 2d convolution to produce $C$ output channels, which get combined pointwise (via 1x1 convolution) to produce $D$ output channels. From \url {https://bit.ly/2L9fm2o}. Used with kind permission of Eugenio Culurciello. \relax }}{478}{figure.caption.283}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.27}{\ignorespaces (a) Illustration of face detection, a special case of object detection. (Photo of author and his wife Margaret, taken at Filoli in California in February 2018. Image processed by Jonathan Huang using SSD face model.) (b) Illustration of anchor boxes. Adapted from \citep [Sec 12.5]{dive}. \relax }}{479}{figure.caption.284}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.28}{\ignorespaces Illustration of object detection and instance segmentation using Mask R-CNN. From \url {https://github.com/matterport/Mask_RCNN}. Used with kind permission of Waleed Abdulla. \relax }}{481}{figure.caption.285}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.29}{\ignorespaces Illustration of an {\bf encoder-decoder} (aka {\bf U-net}) CNN for semantic segmentation. The encoder uses convolution (which downsamples), and the decoder uses transposed convolution (which upsamples). From Figure 1 of \citep {segnet}. Used with kind permission of Alex Kendall. \relax }}{481}{figure.caption.286}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.30}{\ignorespaces Illustration of the U-Net model for semantic segmentation. Each blue box corresponds to a multi-channel feature map. The number of channels is shown on the top of the box, and the height/width is shown in the bottom left. White boxes denote copied feature maps. The different colored arrows correspond to different operations. From Figure 1 of \citep {Ronneberger2015}. Used with kind permission of Olaf Ronneberger. \relax }}{482}{figure.caption.287}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.31}{\ignorespaces Illustration of a multi-task dense prediction problem. From Figure 1 of \citep {Eigen2015}. Used with kind permission of Rob Fergus. \relax }}{482}{figure.caption.288}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.32}{\ignorespaces Illustration of keypoint detection for body, hands and face using the OpenPose system. From Figure 8 of \citep {openPose}. Used with kind permission of Yaser Sheikh. \relax }}{483}{figure.caption.289}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.33}{\ignorespaces Images that maximize the probability of ImageNet classes ``goose'' and ``ostrich'' under a simple Gaussian prior. From \url {http://yosinski.com/deepvis}. Used with kind permission of Jeff Clune. \relax }}{484}{figure.caption.290}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.34}{\ignorespaces Illustration of total variation norm. (a) Input image: a green sea turtle (Used with kind permission of Wikimedia author P. Lindgren). (b) Horizontal deltas. (c) Vertical deltas. Adapted from \url {https://www.tensorflow.org/tutorials/generative/style_transfer}. \relax }}{485}{figure.caption.291}% 
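For reference, a common (anisotropic) form of the total variation penalty simply sums the absolute horizontal and vertical deltas shown in panels (b) and (c):
\[
\mathrm{TV}({\bm {x}}) = \sum_{i,j} \big( |x_{i+1,j} - x_{i,j}| + |x_{i,j+1} - x_{i,j}| \big).
\]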
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.35}{\ignorespaces Images that maximize the probability of certain ImageNet classes under a TV prior. From \url {https://research.googleblog.com/2015/06/inceptionism-going-deeper-into-neural.html}. Used with kind permission of Alexander Mordvintsev. \relax }}{485}{figure.caption.292}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.36}{\ignorespaces We visualize ``optimal stimuli'' for neurons in layers Conv 1, 3, 5 and fc8 in the AlexNet architecture, trained on the ImageNet dataset. For Conv5, we also show retrieved real images (under the column ``data driven'') that produce similar activations. Based on the method in \citep {Mahendran16ijcv}. Used with kind permission of Donglai Wei. \relax }}{486}{figure.caption.293}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.37}{\ignorespaces Illustration of DeepDream. The CNN is an Inception classifier trained on ImageNet. (a) Starting image of an Aurelia aurita (also called moon jelly). (b) Image generated after 10 iterations. (c) Image generated after 50 iterations. From \url {https://en.wikipedia.org/wiki/DeepDream}. Used with kind permission of Wikipedia author Martin Thoma. \relax }}{487}{figure.caption.294}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.38}{\ignorespaces Example output from a neural style transfer system. (a) Content image: a green sea turtle (Used with kind permission of Wikimedia author P. Lindgren). (b) Style image: a painting by Wassily Kandinsky called ``Composition 7''. (c) Output of neural style generation. Adapted from \url {https://www.tensorflow.org/tutorials/generative/style_transfer}. \relax }}{488}{figure.caption.295}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.39}{\ignorespaces Neural style transfer applied to photos of the ``production team'', who helped create code and demos for this book and its sequel. From top to bottom, left to right: Kevin Murphy (the author), Mahmoud Soliman, Aleyna Kara, Srikar Jilugu, Drishti Patel, Ming Liang Ang, Gerardo Dur\IeC {\'a}n-Mart\IeC {\'\i }n, Coco (the team dog). Each content photo used a different artistic style. Adapted from \url {https://www.tensorflow.org/tutorials/generative/style_transfer}. \relax }}{488}{figure.caption.296}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.40}{\ignorespaces Illustration of how neural style transfer works. Adapted from Figure 12.12.2 of \citep {dive}. \relax }}{489}{figure.caption.297}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {14.41}{\ignorespaces Schematic representation of 3 kinds of feature maps for 3 different input images. Adapted from Figure 5.16 of \citep {Foster2019}. \relax }}{489}{figure.caption.298}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.1}{\ignorespaces Recurrent neural network (RNN) for generating a variable length output sequence ${\bm {y}}_{1:T}$ given an optional fixed length input vector ${\bm {x}}$. \relax }}{494}{figure.caption.299}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.2}{\ignorespaces Example output of length 500 generated from a character level RNN when given the prefix ``the''. We use greedy decoding, in which the most likely character at each step is computed, and then fed back into the model. The model is trained on the book {\em The Time Machine} by H. G. Wells. Generated by \href {https://probml.github.io/notebooks\#rnn\_jax.ipynb}{rnn\_jax.ipynb}. \relax }}{495}{figure.caption.300}% 
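A minimal sketch of the greedy decoding procedure described here; \verb|next_token_probs| is a hypothetical stand-in for the trained character-level RNN (this is not the code in the linked notebook).
\begin{verbatim}
# Minimal greedy-decoding sketch (illustrative; not the code in rnn_jax.ipynb).
# `next_token_probs` is a hypothetical stand-in for the trained character RNN:
# it takes the current prefix string and returns {char: p(char | prefix)}.
def greedy_decode(next_token_probs, prefix="the", max_len=500):
    out = list(prefix)
    for _ in range(max_len):
        probs = next_token_probs("".join(out))
        out.append(max(probs, key=probs.get))  # most likely next character, fed back in
    return "".join(out)
\end{verbatim}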
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.3}{\ignorespaces Illustration of a CNN-RNN model for image captioning. The pink boxes labeled ``LSTM'' refer to a specific kind of RNN that we discuss in \cref {sec:LSTM}. The pink boxes labeled $W_{\text {emb}}$ refer to embedding matrices for the (sampled) one-hot tokens, so that the input to the model is a real-valued vector. From \url {https://bit.ly/2FKnqHm}. Used with kind permission of Yunjey Choi. \relax }}{496}{figure.caption.301}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.4}{\ignorespaces (a) RNN for sequence classification. (b) Bi-directional RNN for sequence classification. \relax }}{496}{figure.caption.302}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.5}{\ignorespaces (a) RNN for transforming a sequence to another, aligned sequence. (b) Bi-directional RNN for the same task. \relax }}{497}{figure.caption.303}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.6}{\ignorespaces Illustration of a deep RNN. Adapted from Figure 9.3.1 of \citep {dive}. \relax }}{498}{figure.caption.304}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.7}{\ignorespaces Encoder-decoder RNN architecture for mapping sequence ${\bm {x}}_{1:T}$ to sequence ${\bm {y}}_{1:T'}$. \relax }}{498}{figure.caption.305}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.8}{\ignorespaces (a) Illustration of a seq2seq model for translating English to French. The - character represents the end of a sentence. From Figure 2.4 of \citep {Luong2016thesis}. Used with kind permission of Minh-Thang Luong. (b) Illustration of greedy decoding. The most likely French word at each step is highlighted in green, and then fed in as input to the next step of the decoder. From Figure 2.5 of \citep {Luong2016thesis}. Used with kind permission of Minh-Thang Luong. \relax }}{499}{figure.caption.306}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.9}{\ignorespaces An RNN unrolled (vertically) for 3 time steps, with the target output sequence and loss node shown explicitly. From Figure 8.7.2 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{500}{figure.caption.307}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.10}{\ignorespaces Illustration of a GRU. Adapted from Figure 9.1.3 of \citep {dive}. \relax }}{502}{figure.caption.308}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.11}{\ignorespaces Illustration of an LSTM. Adapted from Figure 9.2.4 of \citep {dive}. \relax }}{504}{figure.caption.309}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.12}{\ignorespaces Conditional probabilities of generating each token at each step for two different sequences. From Figures 9.8.1--9.8.2 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{505}{figure.caption.310}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.13}{\ignorespaces Illustration of beam search using a beam of size $K=2$. The vocabulary is $\mathcal {Y}= \{A,B,C,D,E\}$, with size $V=5$. We assume the top 2 symbols at step 1 are A,C. At step 2, we evaluate $p(y_1=A,y_2=y)$ and $p(y_1=C,y_2=y)$ for each $y \in \mathcal {Y}$. This takes $O(K V)$ time. We then pick the top 2 partial paths, which are $(y_1=A,y_2=B)$ and $(y_1=C,y_2=E)$, and continue in the obvious way. Adapted from Figure 9.8.3 of \citep {dive}. \relax }}{506}{figure.caption.311}% 
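A minimal beam-search sketch matching this description ($K$ hypotheses, $O(KV)$ expansions per step); \verb|log_prob_next| is a hypothetical scoring function, not code from the book.
\begin{verbatim}
import heapq

# Illustrative beam-search sketch. `log_prob_next` is a hypothetical function
# returning {token: log p(token | prefix)} over the vocabulary `vocab`.
def beam_search(log_prob_next, vocab, K=2, max_len=10):
    beams = [(0.0, [])]                     # (cumulative log prob, partial sequence)
    for _ in range(max_len):
        candidates = []
        for score, seq in beams:
            logp = log_prob_next(seq)
            for y in vocab:                 # O(K V) expansions per step
                candidates.append((score + logp[y], seq + [y]))
        beams = heapq.nlargest(K, candidates, key=lambda c: c[0])  # keep top K paths
    return beams
\end{verbatim}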
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.14}{\ignorespaces Illustration of the TextCNN model for binary sentiment classification. Adapted from Figure 15.3.5 of \citep {dive}. \relax }}{507}{figure.caption.312}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.15}{\ignorespaces Illustration of the wavenet model using dilated (atrous) convolutions, with dilation factors of 1, 2, 4 and 8. From Figure 3 of \citep {wavenet}. Used with kind permission of Aaron van den Oord. \relax }}{508}{figure.caption.313}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.16}{\ignorespaces Attention computes a weighted average of a set of values, where the weights are derived by comparing the query vector to a set of keys. From Figure 10.3.1 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{509}{figure.caption.314}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.17}{\ignorespaces Kernel regression in 1d. (a) Kernel weight matrix. (b) Resulting predictions on a dense grid of test points. Generated by \href {https://probml.github.io/notebooks\#kernel\_regression\_attention.ipynb}{kernel\_regression\_attention.ipynb}. \relax }}{511}{figure.caption.315}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.18}{\ignorespaces Illustration of seq2seq with attention for English to French translation. Used with kind permission of Minh-Thang Luong. \relax }}{512}{figure.caption.316}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.19}{\ignorespaces Illustration of the attention heatmaps generated while translating two sentences from Spanish to English. (a) Input is ``hace mucho frio aqui.'', output is ``it is very cold here.''. (b) Input is ``\IeC {\textquestiondown }todavia estan en casa?'', output is ``are you still at home?''. Note that when generating the output token ``home'', the model should attend to the input token ``casa'', but in fact it seems to attend to the input token ``?''. Adapted from \url {https://www.tensorflow.org/tutorials/text/nmt_with_attention}. \relax }}{512}{figure.caption.317}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.20}{\ignorespaces Example of an electronic health record. In this example, 24h after admission to the hospital, the RNN classifier predicts the risk of death as 19.9\%; the patient ultimately died 10 days after admission. The ``relevant'' keywords from the input clinical notes are shown in red, as identified by an attention mechanism. From Figure 3 of \citep {Rajkomar2018}. Used with kind permission of Alvin Rajkomar. \relax }}{513}{figure.caption.318}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.21}{\ignorespaces Illustration of sentence pair entailment classification using an MLP with attention to align the premise (``I do need sleep'') with the hypothesis (``I am tired''). White squares denote active attention weights, blue squares are inactive. (We are assuming hard 0/1 attention for simplicity.) From Figure 15.5.2 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{514}{figure.caption.319}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.22}{\ignorespaces Image captioning using attention. (a) Soft attention. Generates ``a woman is throwing a frisbee in a park''. (b) Hard attention. Generates ``a man and a woman playing frisbee in a field''. From Figure 6 of \citep {showAttendTell}. Used with kind permission of Kelvin Xu. \relax }}{516}{figure.caption.320}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.23}{\ignorespaces Illustration of how encoder self-attention for the word ``it'' differs depending on the input context. From \url {https://ai.googleblog.com/2017/08/transformer-novel-neural-network.html}. Used with kind permission of Jakob Uszkoreit. \relax }}{517}{figure.caption.321}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.24}{\ignorespaces Multi-head attention. Adapted from Figure 9.3.3 of \citep {dive}. \relax }}{517}{figure.caption.322}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.25}{\ignorespaces (a) Positional encoding matrix for a sequence of length $n=60$ and an embedding dimension of size $d=32$. (b) Basis functions for columns 6 to 9. Generated by \href {https://probml.github.io/notebooks\#positional\_encoding\_jax.ipynb}{positional\_encoding\_jax.ipynb}. \relax }}{519}{figure.caption.323}% 
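A small NumPy sketch of the standard sinusoidal positional encoding, producing a matrix of the same shape as panel (a); this is illustrative, not the code in the linked notebook.
\begin{verbatim}
import numpy as np

# Standard sinusoidal positional encoding (Vaswani et al. style); illustrative,
# not the code in positional_encoding_jax.ipynb.
def positional_encoding(n=60, d=32):
    pos = np.arange(n)[:, None]                  # positions 0..n-1
    i = np.arange(d // 2)[None, :]               # dimension-pair index
    angles = pos / np.power(10000.0, 2 * i / d)  # (n, d/2) matrix of angles
    P = np.zeros((n, d))
    P[:, 0::2] = np.sin(angles)                  # even columns: sine
    P[:, 1::2] = np.cos(angles)                  # odd columns: cosine
    return P
\end{verbatim}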
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.26}{\ignorespaces The transformer. From \citep {Weng2018attention}. Used with kind permission of Lilian Weng. Adapted from Figures 1--2 of \citep {Vaswani2017}. \relax }}{520}{figure.caption.324}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.27}{\ignorespaces Comparison of (1d) CNNs, RNNs and self-attention models. From Figure 10.6.1 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{521}{figure.caption.325}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.28}{\ignorespaces The Vision Transformer (ViT) model. This treats an image as a set of input patches. The input is prepended with the special CLASS embedding vector (denoted by *) in location 0. The class label for the image is derived by applying softmax to the final output encoding at location 0. From Figure 1 of \citep {ViT}. Used with kind permission of Alexey Dosovitskiy. \relax }}{522}{figure.caption.327}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.29}{\ignorespaces Venn diagram presenting the taxonomy of different efficient transformer architectures. From \citep {Tay2020transformers}. Used with kind permission of Yi Tay. \relax }}{524}{figure.caption.328}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.30}{\ignorespaces Attention matrix $\mathbf {A}$ rewritten as a product of two lower rank matrices $\mathbf {Q}^{\prime }$ and $(\mathbf {K}^{\prime })^{{\mkern -1.5mu\mathsf {T}}}$ with random feature maps $\boldsymbol {\phi }({\bm {q}}_i) \in \mathbb {R}^M$ and $\boldsymbol {\phi }({\bm {k}}_j) \in \mathbb {R}^M$ for the corresponding queries/keys stored in the rows/columns. Used with kind permission of Krzysztof Choromanski. \relax }}{526}{figure.caption.329}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.31}{\ignorespaces Decomposition of the attention matrix $\mathbf {A}$ can be leveraged to improve attention computations via the associativity of matrix multiplication. To compute $\mathbf {AV}$, we first calculate $\mathbf {G}=(\mathbf {K}^{\prime })^{{\mkern -1.5mu\mathsf {T}}}\mathbf {V}$ and then $\mathbf {Q}^{\prime }\mathbf {G}$, resulting in space and time complexity that is linear in $N$. Used with kind permission of Krzysztof Choromanski. \relax }}{526}{figure.caption.330}%
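An illustrative NumPy sketch of this associativity trick; the matrices below are random stand-ins for the feature maps, and the row normalization used in attention is omitted for brevity.
\begin{verbatim}
import numpy as np

# Associativity trick: Q'(K'^T V) avoids ever forming the N x N matrix Q'K'^T.
N, M, D = 1000, 16, 64
Qp = np.random.rand(N, M)   # stand-in for Q'
Kp = np.random.rand(N, M)   # stand-in for K'
V  = np.random.rand(N, D)

G = Kp.T @ V                # (M, D): costs O(N M D)
out_linear = Qp @ G         # (N, D): costs O(N M D), linear in N

out_quadratic = (Qp @ Kp.T) @ V   # same result, but O(N^2) time and memory
assert np.allclose(out_linear, out_quadratic)
\end{verbatim}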
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.32}{\ignorespaces Illustration of the ELMo bidirectional language model. Here $y_t=x_{t+1}$ when acting as the target for the forwards LSTM, and $y_t = x_{t-1}$ for the backwards LSTM. (We add \text {\em {bos}}\xspace and \text {\em {eos}}\xspace sentinels to handle the edge cases.) From \citep {Weng2019LM}. Used with kind permission of Lilian Weng. \relax }}{527}{figure.caption.331}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.33}{\ignorespaces Illustration of (a) BERT and (b) GPT. $E_t$ is the embedding vector for the input token at location $t$, and $T_t$ is the output target to be predicted. From Figure 3 of \citep {bert}. Used with kind permission of Ming-Wei Chang. \relax }}{529}{figure.caption.332}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.34}{\ignorespaces Illustration of how a pair of input sequences, denoted A and B, are encoded before feeding to BERT. From Figure 14.8.2 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{530}{figure.caption.333}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.35}{\ignorespaces Illustration of how BERT can be used for different kinds of supervised NLP tasks. (a) Single sentence classification (e.g., sentiment analysis); (b) Sentence-pair classification (e.g., textual entailment); (c) Single sentence tagging (e.g., shallow parsing); (d) Question answering. From Figure 4 of \citep {bert}. Used with kind permission of Ming-Wei Chang. \relax }}{531}{figure.caption.334}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {15.36}{\ignorespaces Illustration of how the T5 model (``Text-to-Text Transfer Transformer'') can be used to perform multiple NLP tasks, such as translating English to German; determining if a sentence is linguistically valid or not ({\bf CoLA} stands for ``Corpus of Linguistic Acceptability''); determining the degree of semantic similarity ({\bf STSB} stands for ``Semantic Textual Similarity Benchmark''); and abstractive summarization. From Figure 1 of \citep {T5}. Used with kind permission of Colin Raffel. \relax }}{533}{figure.caption.335}%
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {16.1}{\ignorespaces (a) Illustration of a $K$-nearest neighbors classifier in 2d for $K=5$. The nearest neighbors of test point ${\bm {x}}$ have labels $\{1, 1, 1, 0, 0\}$, so we predict $p(y=1|{\bm {x}},{\mathcal {D}}) = 3/5$. (b) Illustration of the Voronoi tessellation induced by 1-NN. Adapted from Figure 4.13 of \citep {Duda01}. Generated by \href {https://probml.github.io/notebooks\#knn\_voronoi\_plot.ipynb}{knn\_voronoi\_plot.ipynb}. \relax }}{538}{figure.caption.336}% 
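A minimal sketch of the $K$-nearest-neighbor prediction described in panel (a); the helper below is illustrative and not taken from the linked notebook.
\begin{verbatim}
import numpy as np

# K-nearest-neighbor sketch (illustrative; not knn_voronoi_plot.ipynb):
# p(y=1 | x) is the fraction of the K nearest training points with label 1.
def knn_predict_proba(X_train, y_train, x, K=5):
    dists = np.linalg.norm(X_train - x, axis=1)   # Euclidean distances to x
    nearest = np.argsort(dists)[:K]               # indices of the K closest points
    return np.mean(y_train[nearest] == 1)         # e.g. labels {1,1,1,0,0} -> 0.6
\end{verbatim}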
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {16.2}{\ignorespaces Decision boundaries induced by a KNN classifier. (a) $K=1$. (b) $K=2$. (c) $K=5$. (d) Train and test error vs $K$. Generated by \href {https://probml.github.io/notebooks\#knn\_classify\_demo.ipynb}{knn\_classify\_demo.ipynb}. \relax }}{539}{figure.caption.337}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {16.3}{\ignorespaces Illustration of the curse of dimensionality. (a) We embed a small cube of side $s$ inside a larger unit cube. (b) We plot the edge length of a cube needed to cover a given volume of the unit cube as a function of the number of dimensions. Adapted from Figure 2.6 from \citep {HastieBook}. Generated by \href {https://probml.github.io/notebooks\#curse\_dimensionality\_plot.ipynb}{curse\_dimensionality\_plot.ipynb}. \relax }}{539}{figure.caption.338}% 
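The curve in panel (b) follows from a one-line calculation: to capture a fraction $f$ of the unit cube's volume in $D$ dimensions, a sub-cube needs edge length
\[
e_D(f) = f^{1/D}, \qquad \text{e.g., } e_{10}(0.01) \approx 0.63, \quad e_{10}(0.1) \approx 0.80 .
\]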
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {16.4}{\ignorespaces Illustration of latent coincidence analysis (LCA) as a directed graphical model. The inputs ${\bm {x}}, {\bm {x}}' \in \mathbb {R}^D$ are mapped into Gaussian latent variables ${\bm {z}}, {\bm {z}}' \in \mathbb {R}^L$ via a linear mapping $\mathbf {W}$. If the two latent points coincide (within length scale $\kappa $) then we set the similarity label to $y=1$, otherwise we set it to $y=0$. From Figure 1 of \citep {Der2012}. Used with kind permission of Lawrence Saul. \relax }}{543}{figure.caption.339}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {16.5}{\ignorespaces Networks for deep metric learning. (a) Siamese network. (b) Triplet network. Adapted from Figure 5 of \citep {Kaya2019}. \relax }}{545}{figure.caption.340}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {16.6}{\ignorespaces Speeding up triplet loss minimization. (a) Illustration of hard vs easy negatives. Here $a$ is the anchor point, $p$ is a positive point, and $n_i$ are negative points. Adapted from Figure 4 of \citep {Kaya2019}. (b) Standard triplet loss would take $8 \times 3 \times 4 = 96$ calculations, whereas using a proxy loss (with one proxy per class) takes $8 \times 2 = 16$ calculations. From Figure 1 of \citep {Do2019cvpr}. Used with kind permission of Gustavo Carneiro. \relax }}{547}{figure.caption.341}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {16.7}{\ignorespaces Adding spherical embedding constraint to a deep metric learning method. Used with kind permission of Dingyi Zhang. \relax }}{549}{figure.caption.342}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {16.8}{\ignorespaces A comparison of some popular normalized kernels. Generated by \href {https://probml.github.io/notebooks\#smoothingKernelPlot.ipynb}{smoothingKernelPlot.ipynb}. \relax }}{551}{figure.caption.343}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {16.9}{\ignorespaces A nonparametric (Parzen) density estimator in 1d estimated from 6 data points, denoted by x. Top row: uniform kernel. Bottom row: Gaussian kernel. Left column: bandwidth parameter $h=1$. Right column: bandwidth parameter $h=2$. Adapted from \url {http://en.wikipedia.org/wiki/Kernel_density_estimation}. Generated by \href {https://probml.github.io/notebooks\#parzen\_window\_demo2.ipynb}{parzen\_window\_demo2.ipynb}. \relax }}{552}{figure.caption.345}% 
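A minimal sketch of the Gaussian-kernel Parzen estimator shown in the bottom row, $\hat p(x) = \frac{1}{N}\sum_n \mathcal{N}(x|x_n,h^2)$; the helper is illustrative, not the code in the linked notebook.
\begin{verbatim}
import numpy as np

# 1d Parzen density estimate with a Gaussian kernel (illustrative;
# not the code in parzen_window_demo2.ipynb):
#   p_hat(x) = (1/N) * sum_n N(x | x_n, h^2), with bandwidth h
def parzen_density(x, data, h=1.0):
    z = (x - np.asarray(data)) / h
    return np.mean(np.exp(-0.5 * z**2) / (h * np.sqrt(2.0 * np.pi)))
\end{verbatim}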
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {16.10}{\ignorespaces An example of kernel regression in 1d using a Gaussian kernel. Generated by \href {https://probml.github.io/notebooks\#kernelRegressionDemo.ipynb}{kernelRegressionDemo.ipynb}. \relax }}{554}{figure.caption.346}% 
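The underlying estimator is the Nadaraya-Watson kernel-weighted average
\[
\hat{f}(x) = \frac{\sum_{n=1}^N \mathcal{K}_h(x - x_n)\, y_n}{\sum_{n=1}^N \mathcal{K}_h(x - x_n)},
\]
where $\mathcal{K}_h$ is the smoothing kernel (here Gaussian) with bandwidth $h$.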
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.1}{\ignorespaces Function samples from a GP with an ARD kernel. (a) $\ell _1=\ell _2=1$. Both dimensions contribute to the response. (b) $\ell _1=1$, $\ell _2=5$. The second dimension is essentially ignored. Adapted from Figure 5.1 of \citep {Rasmussen06}. Generated by \href {https://probml.github.io/notebooks\#gprDemoArd.ipynb}{gprDemoArd.ipynb}. \relax }}{559}{figure.caption.348}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.2}{\ignorespaces Functions sampled from a GP with a Matern kernel. (a) $\nu =5/2$. (b) $\nu =1/2$. Generated by \href {https://probml.github.io/notebooks\#gpKernelPlot.ipynb}{gpKernelPlot.ipynb}. \relax }}{560}{figure.caption.350}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.3}{\ignorespaces Functions sampled from a GP using various stationary periodic kernels. Generated by \href {https://probml.github.io/notebooks\#gpKernelPlot.ipynb}{gpKernelPlot.ipynb}. \relax }}{561}{figure.caption.352}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.4}{\ignorespaces Examples of 1d structures obtained by multiplying elementary kernels. Top row shows $\mathcal {K}(x,x'=1)$. Bottom row shows some functions sampled from $GP(f|0,\mathcal {K})$. From Figure 2.2 of \citep {duvenaud-thesis-2014}. Used with kind permission of David Duvenaud. \relax }}{563}{figure.caption.353}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.5}{\ignorespaces Examples of 1d structures obtained by adding elementary kernels. Here $\mathrm {SE}^{(\mathrm {short})}$ and $\mathrm {SE}^{(\mathrm {long})}$ are two SE kernels with different length scales. From Figure 2.4 of \citep {duvenaud-thesis-2014}. Used with kind permission of David Duvenaud. \relax }}{563}{figure.caption.354}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.6}{\ignorespaces A Gaussian process for 2 training points, ${\bm {x}}_1$ and ${\bm {x}}_2$, and 1 testing point, ${\bm {x}}_{*}$, represented as a graphical model representing $p({\bm {y}},{\bm {f}}_{X}|\mathbf {X}) = \mathcal {N}({\bm {f}}_{X}|m(\mathbf {X}), \mathcal {K}(\mathbf {X})) \DOTSB \prod@ \slimits@ _i p(y_i|f_i)$. The hidden nodes $f_i=f({\bm {x}}_i)$ represent the value of the function at each of the data points. These hidden nodes are fully interconnected by undirected edges, forming a Gaussian graphical model; the edge strengths represent the covariance terms $\Sigma _{ij}=\mathcal {K}({\bm {x}}_i,{\bm {x}}_j)$. If the test point ${\bm {x}}_{*}$ is similar to the training points ${\bm {x}}_1$ and ${\bm {x}}_2$, then the value of the hidden function $f_{*}$ will be similar to $f_1$ and $f_2$, and hence the predicted output $y_*$ will be similar to the training values $y_1$ and $y_2$. \relax }}{564}{figure.caption.355}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.7}{\ignorespaces (a) Some functions sampled from a GP prior with a squared exponential kernel. (b-d) Some samples from a GP posterior, after conditioning on 1, 2, and 4 noise-free observations. The shaded area represents $\mathbb {E}\left [{f({\bm {x}})}\right ] \pm 2 \mathrm {std}\left [{f({\bm {x}})}\right ]$. Adapted from Figure 2.2 of \citep {Rasmussen06}. Generated by \href {https://probml.github.io/notebooks\#gprDemoNoiseFree.ipynb}{gprDemoNoiseFree.ipynb}. \relax }}{565}{figure.caption.356}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.8}{\ignorespaces Some 1d GPs with SE kernels but different hyper-parameters fit to 20 noisy observations. The hyper-parameters $(\ell ,\sigma _f,\sigma _y)$ are as follows: (a) (1,1,0.1) (b) (3.0, 1.16, 0.89). Adapted from Figure 2.5 of \citep {Rasmussen06}. Generated by \href {https://probml.github.io/notebooks\#gprDemoChangeHparams.ipynb}{gprDemoChangeHparams.ipynb}. \relax }}{568}{figure.caption.357}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.9}{\ignorespaces Illustration of local minima in the marginal likelihood surface. (a) We plot the log marginal likelihood vs kernel length scale $\ell $ and observation noise $\sigma _y$, for fixed signal level $\sigma _f=1$, using the 7 data points shown in panels b and c. (b) The function corresponding to the lower left local minimum, $(\ell ,\sigma _y) \approx (1,0.2)$. This is quite ``wiggly'' and has low noise. (c) The function corresponding to the top right local minimum, $(\ell ,\sigma _y) \approx (10,0.8)$. This is quite smooth and has high noise. The data was generated using $(\ell ,\sigma _f,\sigma _y)=(1,1,0.1)$. Adapted from Figure 5.5 of \citep {Rasmussen06}. Generated by \href {https://probml.github.io/notebooks\#gpr\_demo\_marglik.ipynb}{gpr\_demo\_marglik.ipynb}. \relax }}{570}{figure.caption.358}% 
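The surface in panel (a) is the standard log marginal likelihood of a zero-mean GP regression model,
\[
\log p({\bm {y}} \,|\, \mathbf{X}, \ell, \sigma_f, \sigma_y)
 = -\tfrac{1}{2} {\bm {y}}^{\top} \mathbf{K}_y^{-1} {\bm {y}}
   - \tfrac{1}{2} \log |\mathbf{K}_y|
   - \tfrac{N}{2} \log (2\pi),
\qquad \mathbf{K}_y = \mathbf{K} + \sigma_y^2 \mathbf{I},
\]
evaluated on a grid of $(\ell, \sigma_y)$ values with $\sigma_f$ held fixed.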
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.10}{\ignorespaces GP classifier for a binary classification problem on Iris flowers (Setosa vs Versicolor) using a single input feature (sepal length). The fat vertical line is the credible interval for the decision boundary. (a) SE kernel. (b) SE plus linear kernel. Adapted from Figures 7.11--7.12 of \citep {Martin2018}. Generated by \href {https://probml.github.io/notebooks\#gp\_classify\_iris\_1d\_pymc3.ipynb}{gp\_classify\_iris\_1d\_pymc3.ipynb}. \relax }}{571}{figure.caption.359}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.11}{\ignorespaces (a) Fictitious ``space flu'' binary classification problem. (b) Fit from a GP with SE kernel. Adapted from Figures 7.13--7.14 of \citep {Martin2018}. Generated by \href {https://probml.github.io/notebooks\#gp\_classify\_spaceflu\_1d\_pymc3.ipynb}{gp\_classify\_spaceflu\_1d\_pymc3.ipynb}. \relax }}{572}{figure.caption.360}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.12}{\ignorespaces Illustration of the large margin principle. Left: a separating hyper-plane with large margin. Right: a separating hyper-plane with small margin. \relax }}{575}{figure.caption.364}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.13}{\ignorespaces (a) Illustration of the geometry of a linear decision boundary in 2d. A point ${\bm {x}}$ is classified as belonging in decision region $\mathcal {R}_1$ if $f({\bm {x}})>0$, otherwise it belongs in decision region $\mathcal {R}_0$; ${\bm {w}}$ is a vector which is perpendicular to the decision boundary. The term $w_0$ controls the distance of the decision boundary from the origin. ${\bm {x}}_{\perp }$ is the orthogonal projection of ${\bm {x}}$ onto the boundary. The signed distance of ${\bm {x}}$ from the boundary is given by $f({\bm {x}})/||{\bm {w}}||$. Adapted from Figure 4.1 of \citep {BishopBook}. (b) Points with circles around them are support vectors, and have dual variables $\alpha _n >0$. In the soft margin case, we associate a slack variable $\xi _n$ with each example. If $0 < \xi _n < 1$, the point is inside the margin, but on the correct side of the decision boundary. If $\xi _n>1$, the point is on the wrong side of the boundary. Adapted from Figure 7.3 of \citep {BishopBook}. \relax }}{576}{figure.caption.365}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.14}{\ignorespaces Illustration of the benefits of scaling the input features before computing a max margin classifier. Adapted from Figure 5.2 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#svm\_classifier\_feature\_scaling.ipynb}{svm\_classifier\_feature\_scaling.ipynb}. \relax }}{577}{figure.caption.366}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.15}{\ignorespaces Log-odds vs $x$ for 3 different methods. Adapted from Figure 10 of \citep {Tipping01}. Used with kind permission of Mike Tipping. \relax }}{581}{figure.caption.367}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.16}{\ignorespaces (a) The one-versus-rest approach. The green region is predicted to be both class 1 and class 2. (b) The one-versus-one approach. The label of the green region is ambiguous. Adapted from Figure 4.2 of \citep {BishopBook}. \relax }}{582}{figure.caption.368}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.17}{\ignorespaces SVM classifier with RBF kernel with precision $\gamma $ and regularizer $C$ applied to two moons data. Adapted from Figure 5.9 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#svm\_classifier\_2d.ipynb}{svm\_classifier\_2d.ipynb}. \relax }}{583}{figure.caption.369}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.18}{\ignorespaces (a) A cross validation estimate of the 0-1 error for an SVM classifier with RBF kernel with different precisions $\gamma =1/(2\sigma ^2)$ and different regularizers $\lambda =1/C$, applied to a synthetic data set drawn from a mixture of 2 Gaussians. (b) A slice through this surface for $\gamma =5$. The red dotted line is the Bayes optimal error, computed using Bayes rule applied to the model used to generate the data. Adapted from Figure 12.6 of \citep {HastieBook}. Generated by \href {https://probml.github.io/notebooks\#svmCgammaDemo.ipynb}{svmCgammaDemo.ipynb}. \relax }}{584}{figure.caption.370}%
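An illustrative scikit-learn sketch of the kind of cross-validated sweep behind such a plot; the grid values are hypothetical, and \verb|X|, \verb|y| stand for the synthetic mixture-of-Gaussians data.
\begin{verbatim}
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Hypothetical grid over the RBF precision gamma and the regularizer C = 1/lambda.
param_grid = {"gamma": [0.5, 1, 2, 5, 10], "C": [0.1, 1, 10, 100]}
search = GridSearchCV(SVC(kernel="rbf"), param_grid, cv=5)
# search.fit(X, y); search.cv_results_ then gives the error surface over (gamma, C)
\end{verbatim}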
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.19}{\ignorespaces (a) Illustration of $\ell _2$, Huber and $\epsilon $-insensitive loss functions, where $\epsilon =1.5$. Generated by \href {https://probml.github.io/notebooks\#huberLossPlot.ipynb}{huberLossPlot.ipynb}. (b) Illustration of the $\epsilon $-tube used in SVM regression. Points above the tube have $\xi _i^+>0$ and $\xi _i^-=0$. Points below the tube have $\xi _i^+=0$ and $\xi _i^->0$. Points inside the tube have $\xi _i^+=\xi _i^-=0$. Adapted from Figure 7.7 of \citep {BishopBook}. \relax }}{586}{figure.caption.371}% 
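For reference, the $\epsilon$-insensitive loss plotted in panel (a) is
\[
L_{\epsilon}(y, \hat{y}) =
\begin{cases}
0 & \text{if } |y - \hat{y}| \le \epsilon, \\
|y - \hat{y}| - \epsilon & \text{otherwise.}
\end{cases}
\]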
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.20}{\ignorespaces Illustration of support vector regression. Adapted from Figure 5.11 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#svm\_regression\_1d.ipynb}{svm\_regression\_1d.ipynb}. \relax }}{588}{figure.caption.372}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.21}{\ignorespaces Example of non-linear binary classification using an RBF kernel with bandwidth $\sigma =0.3$. (a) L2VM. (b) L1VM. (c) RVM. (d) SVM. Black circles denote the support vectors. Generated by \href {https://probml.github.io/notebooks\#kernelBinaryClassifDemo.ipynb}{kernelBinaryClassifDemo.ipynb}. \relax }}{589}{figure.caption.373}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.22}{\ignorespaces Model fits for kernel-based regression on the noisy sinc function using an RBF kernel with bandwidth $\sigma =0.3$. (a) L2VM with $\lambda =0.5$. (b) L1VM with $\lambda =0.5$. (c) RVM. (d) SVM regression with $C=1/\lambda $ chosen by cross validation. Red circles denote the retained training exemplars. Generated by \href {https://probml.github.io/notebooks\#rvm\_regression\_1d.ipynb}{rvm\_regression\_1d.ipynb}. \relax }}{590}{figure.caption.374}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {17.23}{\ignorespaces Estimated coefficients for the models in \cref {fig:kernelRegrDemoData}. Generated by \href {https://probml.github.io/notebooks\#rvm\_regression\_1d.ipynb}{rvm\_regression\_1d.ipynb}. \relax }}{591}{figure.caption.375}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {18.1}{\ignorespaces (a) A regression tree on two inputs. (b) Corresponding piecewise constant surface. Adapted from Figure 9.2 of \citep {HastieBook}. Generated by \href {https://probml.github.io/notebooks\#regtreeSurfaceDemo.ipynb}{regtreeSurfaceDemo.ipynb}. \relax }}{594}{figure.caption.377}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {18.2}{\ignorespaces (a) A set of shapes with corresponding binary labels. The features are: color (values ``blue'', ``red'', ``other''), shape (values ``ellipse'', ``other''), and size (real-valued). (b) A hypothetical classification tree fitted to this data. A leaf labeled as $(n_1,n_0)$ means that there are $n_1$ positive examples that fall into this partition, and $n_0$ negative examples. \relax }}{594}{figure.caption.378}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {18.3}{\ignorespaces (a) A decision tree of depth 2 fit to the iris data, using just the petal length and petal width features. Leaf nodes are color coded according to the majority class. The number of training samples that pass from the root to each node is shown inside each box, as well as how many of these values fall into each class. This can be normalized to get a distribution over class labels for each node. (b) Decision surface induced by (a). (c) Fit to data where we omit a single data point (shown by red star). (d) Ensemble of the two models in (b) and (c). Generated by \href {https://probml.github.io/notebooks\#dtree\_sensitivity.ipynb}{dtree\_sensitivity.ipynb}. \relax }}{597}{figure.caption.379}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {18.4}{\ignorespaces (a) A single decision tree. (b-c) Bagging ensemble of 10 and 50 trees. (d) Random forest of 50 trees. Adapted from Figure 7.5 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#bagging\_trees.ipynb}{bagging\_trees.ipynb} and \href {https://probml.github.io/notebooks\#rf\_demo\_2d.ipynb}{rf\_demo\_2d.ipynb}. \relax }}{600}{figure.caption.380}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {18.5}{\ignorespaces Predictive accuracy vs size of tree ensemble for bagging, random forests and gradient boosting with log loss. Adapted from Figure 15.1 of \citep {HastieBook}. Generated by \href {https://probml.github.io/notebooks\#spam\_tree\_ensemble\_compare.ipynb}{spam\_tree\_ensemble\_compare.ipynb}. \relax }}{601}{figure.caption.381}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {18.6}{\ignorespaces Illustration of boosting using a regression tree of depth 2 applied to a 1d dataset. Adapted from Figure 7.9 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#boosted\_regr\_trees.ipynb}{boosted\_regr\_trees.ipynb}. \relax }}{603}{figure.caption.382}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {18.7}{\ignorespaces Illustration of various loss functions for binary classification. The horizontal axis is the margin $m({\bm {x}}) = \cc@accent {"707E}{y}F({\bm {x}})$, the vertical axis is the loss. The log loss uses log base 2. Generated by \href {https://probml.github.io/notebooks\#hinge\_loss\_plot.ipynb}{hinge\_loss\_plot.ipynb}. \relax }}{604}{figure.caption.383}% 
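Typical margin-based losses of this kind, written as functions of the margin $m$, are
\[
L_{01}(m) = \mathbb{I}\left(m < 0\right), \quad
L_{\mathrm{hinge}}(m) = \max(0, 1-m), \quad
L_{\mathrm{log}}(m) = \log_2(1 + e^{-m}), \quad
L_{\mathrm{exp}}(m) = e^{-m}.
\]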
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {18.8}{\ignorespaces Feature importance of a random forest classifier trained to distinguish MNIST digits from classes 0 and 8. Adapted from Figure 7.6 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#rf\_feature\_importance\_mnist.ipynb}{rf\_feature\_importance\_mnist.ipynb}. \relax }}{611}{figure.caption.388}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {18.9}{\ignorespaces Feature importance of a gradient boosted classifier trained to distinguish spam from non-spam email. The dataset has X training examples with Y features, corresponding to token frequency. Adapted from Figure 10.6 of \citep {HastieBook}. Generated by \href {https://probml.github.io/notebooks\#spam\_tree\_ensemble\_interpret.ipynb}{spam\_tree\_ensemble\_interpret.ipynb}. \relax }}{612}{figure.caption.389}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {18.10}{\ignorespaces (a) Partial dependence of log-odds of the spam class for 4 important predictors. The red ticks at the base of each plot are deciles of the empirical distribution for that feature. (b) Joint partial dependence of log-odds on the features hp and !. Adapted from Figures 10.6--10.8 of \citep {HastieBook}. Generated by \href {https://probml.github.io/notebooks\#spam\_tree\_ensemble\_interpret.ipynb}{spam\_tree\_ensemble\_interpret.ipynb}. \relax }}{612}{figure.caption.390}%
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.1}{\ignorespaces Illustration of random crops and zooms of an image. Generated by \href {https://probml.github.io/notebooks\#image\_augmentation\_jax.ipynb}{image\_augmentation\_jax.ipynb}. \relax }}{616}{figure.caption.391}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.2}{\ignorespaces Illustration of fine-tuning a model on a new dataset. The final output layer is trained from scratch, since it might correspond to a different label set. The other layers are initialized at their previous parameters, and then optionally updated using a small learning rate. From Figure 13.2.1 of \citep {dive}. Used with kind permission of Aston Zhang. \relax }}{617}{figure.caption.392}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.3}{\ignorespaces (a) Adding adapter layers to a transformer. From Figure 2 of \citep {Houlsby2019}. Used with kind permission of Neil Houlsby. (b) Adding adapter layers to a resnet. From Figure 2 of \citep {Rebuffi2018}. Used with kind permission of Sylvestre-Alvise Rebuffi. \relax }}{618}{figure.caption.393}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.4}{\ignorespaces (a) Context encoder for self-supervised learning. From \citep {Pathak2016}. Used with kind permission of Deepak Pathak. (b) Some other proxy tasks for self-supervised learning. From \citep {LeCunSSL2018}. Used with kind permission of Yann LeCun. \relax }}{620}{figure.caption.394}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.5}{\ignorespaces (a) Illustration of SimCLR training. $\mathcal {T}$ is a set of stochastic semantics-preserving transformations (data augmentations). (b-c) Illustration of the benefit of random crops. Solid rectangles represent the original image, dashed rectangles are random crops. In (b), the model is forced to predict the local view A from the global view B (and vice versa). In (c), the model is forced to predict the appearance of adjacent views (C,D). From Figures 2--3 of \citep {chen2020simple}. Used with kind permission of Ting Chen. \relax }}{622}{figure.caption.395}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.6}{\ignorespaces Visualization of SimCLR training. Each input image in the minibatch is randomly modified in two different ways (using cropping (followed by resize), flipping, and color distortion), and then fed into a Siamese network. The embeddings (final layer) for each pair derived from the same image are forced to be close, whereas the embeddings for all other pairs are forced to be far apart. From \url {https://ai.googleblog.com/2020/04/advancing-self-supervised-and-semi.html}. Used with kind permission of Ting Chen. \relax }}{622}{figure.caption.396}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.7}{\ignorespaces Illustration of the CLIP model. From Figure 1 of \citep {CLIP}. Used with kind permission of Alec Radford. \relax }}{624}{figure.caption.397}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.8}{\ignorespaces Illustration of the benefits of semi-supervised learning for a binary classification problem. Labeled points from each class are shown as black and white circles respectively. (a) Decision boundary we might learn given only labeled data. (b) Decision boundary we might learn if we also had a lot of unlabeled data points, shown as smaller grey circles. \relax }}{626}{figure.caption.398}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.9}{\ignorespaces Comparison of the entropy minimization, self-training, and ``sharpened'' entropy minimization loss functions for a binary classification problem. \relax }}{628}{figure.caption.399}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.10}{\ignorespaces Visualization demonstrating how entropy minimization enforces the cluster assumption. The classifier assigns a higher probability to class 1 (black dots) or 2 (white dots) in red or blue regions respectively. The predicted class probabilities for one particular unlabeled datapoint are shown in the bar plot. In (a), the decision boundary passes through high-density regions of data, so the classifier is forced to output high-entropy predictions. In (b), the classifier avoids high-density regions and is able to assign low-entropy predictions to most of the unlabeled data. \relax }}{629}{figure.caption.400}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.11}{\ignorespaces Comparison of the squared error and KL divergence losses for consistency regularization. This visualization is for a binary classification problem where it is assumed that the model's output for the unperturbed input is 1. The figure plots the loss incurred for a particular value of the logit (i.e.\ the pre-activation fed into the output sigmoid nonlinearity) for the perturbed input. As the logit grows towards infinity, the model predicts a class label of 1 (in agreement with the prediction for the unperturbed input); as it grows towards negative infinity, the model predicts class 0. The squared error loss saturates (and has zero gradients) when the model predicts one class or the other with high probability, but the KL divergence grows without bound as the model predicts class 0 with more and more confidence. \relax }}{633}{figure.caption.401}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.12}{\ignorespaces Diagram of the semi-supervised GAN framework. The discriminator is trained to output the class of labeled datapoints (red), a ``fake'' label for outputs from the generator (yellow), and any label for unlabeled data (green). \relax }}{636}{figure.caption.402}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.13}{\ignorespaces Combining self-supervised learning on unlabeled data (left), supervised fine-tuning (middle), and self-training on pseudo-labeled data (right). From Figure 3 of \citep {Chen2020nips}. Used with kind permission of Ting Chen. \relax }}{637}{figure.caption.403}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.14}{\ignorespaces Illustration of a hierarchical Bayesian model for meta-learning. Generated by \href {https://probml.github.io/notebooks\#hbayes\_maml.ipynb}{hbayes\_maml.ipynb}. \relax }}{640}{figure.caption.404}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.15}{\ignorespaces Illustration of meta-learning for few-shot learning. Here, each task is a 3-way-2-shot classification problem because each training task contains a support set with three classes, each with two examples. From \url {https://bit.ly/3rrvSjw}. Copyright (2019) Borealis AI. Used with kind permission of Simon Prince and April Cooper. \relax }}{641}{figure.caption.405}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {19.16}{\ignorespaces Illustration of a matching network for one-shot learning. From Figure 1 of \citep {Vinyals2016}. Used with kind permission of Oriol Vinyals. \relax }}{642}{figure.caption.406}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.1}{\ignorespaces An illustration of PCA where we project from 2d to 1d. Red circles are the original data points, blue circles are the reconstructions. The red dot is the data mean. Generated by \href {https://probml.github.io/notebooks\#pcaDemo2d.ipynb}{pcaDemo2d.ipynb}. \relax }}{646}{figure.caption.407}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.2}{\ignorespaces An illustration of PCA applied to MNIST digits from class 9. Grid points are at the 5, 25, 50, 75, 95 \% quantiles of the data distribution along each dimension. The circled points are the closest projected images to the vertices of the grid. Adapted from Figure 14.23 of \citep {HastieBook}. Generated by \href {https://probml.github.io/notebooks\#pca\_digits.ipynb}{pca\_digits.ipynb}. \relax }}{646}{figure.caption.408}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.3}{\ignorespaces (a) Some randomly chosen $64 \times 64$ pixel images from the Olivetti face database. (b) The mean and the first three PCA components represented as images. Generated by \href {https://probml.github.io/notebooks\#pcaImageDemo.ipynb}{pcaImageDemo.ipynb}. \relax }}{647}{figure.caption.409}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.4}{\ignorespaces Illustration of the variance of the points projected onto different 1d vectors. $v_1$ is the first principal component, which maximizes the variance of the projection. $v_2$ is the second principal component, which is the direction orthogonal to $v_1$. Finally, $v'$ is some other vector in between $v_1$ and $v_2$. Adapted from Figure 8.7 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#pca\_projected\_variance.ipynb}{pca\_projected\_variance.ipynb}. \relax }}{649}{figure.caption.410}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.5}{\ignorespaces Effect of standardization on PCA applied to the height/weight dataset. (Red=female, blue=male.) Left: PCA of raw data. Right: PCA of standardized data. Generated by \href {https://probml.github.io/notebooks\#pcaStandardization.ipynb}{pcaStandardization.ipynb}. \relax }}{650}{figure.caption.411}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.6}{\ignorespaces Reconstruction error on MNIST vs number of latent dimensions used by PCA. (a) Training set. (b) Test set. Generated by \href {https://probml.github.io/notebooks\#pcaOverfitDemo.ipynb}{pcaOverfitDemo.ipynb}. \relax }}{652}{figure.caption.412}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.7}{\ignorespaces (a) Scree plot for training set, corresponding to \cref {fig:pcaErr}(a). (b) Fraction of variance explained. Generated by \href {https://probml.github.io/notebooks\#pcaOverfitDemo.ipynb}{pcaOverfitDemo.ipynb}. \relax }}{653}{figure.caption.413}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.8}{\ignorespaces Profile likelihood corresponding to PCA model in \cref {fig:pcaErr}(a). Generated by \href {https://probml.github.io/notebooks\#pcaOverfitDemo.ipynb}{pcaOverfitDemo.ipynb}. \relax }}{654}{figure.caption.414}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.9}{\ignorespaces Illustration of the FA generative process, where we have $L=1$ latent dimension generating $D=2$ observed dimensions; we assume $\boldsymbol {\Psi }=\sigma ^2 \mathbf {I}$. The latent factor has value $z \in \mathbb {R}$, sampled from $p(z)$; this gets mapped to a 2d offset $\boldsymbol {\delta }= z {\bm {w}}$, where ${\bm {w}}\in \mathbb {R}^2$, which gets added to $\boldsymbol {\mu }$ to define a Gaussian $p({\bm {x}}|z) = \mathcal {N}({\bm {x}}|\boldsymbol {\mu }+ \boldsymbol {\delta },\sigma ^2 \mathbf {I})$. By integrating over $z$, we ``slide'' this circular Gaussian ``spray can'' along the principal component axis ${\bm {w}}$, which induces elliptical Gaussian contours in ${\bm {x}}$ space centered on $\boldsymbol {\mu }$. Adapted from Figure 12.9 of \citep {BishopBook}. \relax }}{655}{figure.caption.415}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.10}{\ignorespaces Illustration of EM for PCA when $D=2$ and $L=1$. Green stars are the original data points, black circles are their reconstructions. The weight vector ${\bm {w}}$ is represented by the blue line. (a) We start with a random initial guess of ${\bm {w}}$. The E step is represented by the orthogonal projections. (b) We update the rod ${\bm {w}}$ in the M step, keeping the projections onto the rod (black circles) fixed. (c) Another E step. The black circles can ``slide'' along the rod, but the rod stays fixed. (d) Another M step. Adapted from Figure 12.12 of \citep {BishopBook}. Generated by \href {https://probml.github.io/notebooks\#pcaEmStepByStep.ipynb}{pcaEmStepByStep.ipynb}. \relax }}{660}{figure.caption.416}%
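For reference, a sketch of the zero-noise (Roweis-style) EM updates being visualized, assuming centered data $\tilde{\mathbf{X}} \in \mathbb{R}^{D \times N}$ and weights $\mathbf{W} \in \mathbb{R}^{D \times L}$:
\[
\text{E step: } \tilde{\mathbf{Z}} = (\mathbf{W}^{\top}\mathbf{W})^{-1} \mathbf{W}^{\top} \tilde{\mathbf{X}}
\;\; \text{(orthogonal projections onto the rod)}, \qquad
\text{M step: } \mathbf{W} = \tilde{\mathbf{X}} \tilde{\mathbf{Z}}^{\top} (\tilde{\mathbf{Z}} \tilde{\mathbf{Z}}^{\top})^{-1}
\;\; \text{(move the rod, projections fixed)}.
\]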
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.11}{\ignorespaces Mixture of factor analyzers as a PGM. \relax }}{662}{figure.caption.417}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.12}{\ignorespaces Mixture of PPCA models fit to a 2d dataset, using $L=1$ latent dimensions. (a) $K=1$ mixture components. (b) $K=10$ mixture components. Generated by \href {https://probml.github.io/notebooks\#mixPpcaDemo.ipynb}{mixPpcaDemo.ipynb}. \relax }}{663}{figure.caption.418}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.13}{\ignorespaces (a) 150 synthetic 16 dimensional bit vectors. (b) The 2d embedding learned by binary PCA, fit using variational EM. We have color coded points by the identity of the true ``prototype'' that generated them. (c) Predicted probability of being on. (d) Thresholded predictions. Generated by \href {https://probml.github.io/notebooks\#binary\_fa\_demo.ipynb}{binary\_fa\_demo.ipynb}. \relax }}{665}{figure.caption.419}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.14}{\ignorespaces Gaussian latent factor models for paired data. (a) Supervised PCA. (b) Partial least squares. \relax }}{665}{figure.caption.420}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.15}{\ignorespaces Canonical correlation analysis as a PGM. \relax }}{667}{figure.caption.421}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.16}{\ignorespaces An autoencoder with one hidden layer. \relax }}{668}{figure.caption.422}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.17}{\ignorespaces Results of applying an autoencoder to the Fashion MNIST data. Top row: the first 5 images from the validation set. Bottom row: their reconstructions. (a) MLP model (trained for 20 epochs). The encoder is an MLP with architecture 784-100-30. The decoder is the mirror image of this. (b) CNN model (trained for 5 epochs). The encoder is a CNN model with architecture Conv2D(16, $3 \times 3$, same, selu), MaxPool2D($2 \times 2$), Conv2D(32, $3 \times 3$, same, selu), MaxPool2D($2 \times 2$), Conv2D(64, $3 \times 3$, same, selu), MaxPool2D($2 \times 2$). The decoder is the mirror image of this, using transposed convolution and without the max pooling layers. Adapted from Figure 17.4 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#ae\_mnist\_tf.ipynb}{ae\_mnist\_tf.ipynb}. \relax }}{669}{figure.caption.423}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.18}{\ignorespaces tSNE plot of the first 2 latent dimensions of the Fashion MNIST validation set using an autoencoder. (a) MLP. (b) CNN. Adapted from Figure 17.5 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#ae\_mnist\_tf.ipynb}{ae\_mnist\_tf.ipynb}. \relax }}{669}{figure.caption.424}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.19}{\ignorespaces Denoising autoencoder (MLP architecture) applied to some noisy Fashion MNIST images from the validation set. (a) Gaussian noise. (b) Bernoulli dropout noise. Top row: input. Bottom row: output. Adapted from Figure 17.9 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#ae\_mnist\_tf.ipynb}{ae\_mnist\_tf.ipynb}. \relax }}{670}{figure.caption.425}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.20}{\ignorespaces The residual error from a DAE, ${\bm {e}}({\bm {x}})=r({\cc@accent {"707E}{{\bm {x}}}})-{\bm {x}}$, can learn a vector field corresponding to the score function. Arrows point towards higher probability regions. The length of the arrow is proportional to $||{\bm {e}}({\bm {x}})||$, so points near the 1d data manifold (represented by the curved line) have smaller arrows. From Figure 5 of \citep {Alain2014}. Used with kind permission of Guillaume Alain. \relax }}{670}{figure.caption.426}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.21}{\ignorespaces Neuron activity (in the bottleneck layer) for an autoencoder applied to Fashion MNIST. We show results for three models, with different kinds of sparsity penalty: no penalty (left column), $\ell _1$ penalty (middle column), KL penalty (right column). Top row: Heatmap of 300 neuron activations (columns) across 100 examples (rows). Middle row: Histogram of activation levels derived from this heatmap. Bottom row: Histogram of the mean activation per neuron, averaged over all examples in the validation set. Adapted from Figure 17.11 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#ae\_mnist\_tf.ipynb}{ae\_mnist\_tf.ipynb}. \relax }}{672}{figure.caption.427}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.22}{\ignorespaces Schematic illustration of a VAE. From a figure from \url {http://krasserm.github.io/2018/07/27/dfc-vae/}. Used with kind permission of Martin Krasser. \relax }}{673}{figure.caption.428}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.23}{\ignorespaces Computation graph for VAEs, where $p({\bm {z}})=\mathcal {N}({\bm {z}}|\boldsymbol {0},\mathbf {I})$, $p({\bm {x}}|{\bm {z}},{\bm {\theta }}) = \mathcal {N}({\bm {x}}| f({\bm {z}}), \sigma ^2 \mathbf {I})$, and $q({\bm {z}}|{\bm {x}},\boldsymbol {\phi }) = \mathcal {N}({\bm {z}}|\mu ({\bm {x}}), \Sigma ({\bm {x}}))$. Red boxes show sampling operations which are not differentiable. Blue boxes show loss layers (we assume Gaussian likelihoods and priors). (a) Without the reparameterization trick. (b) With the reparameterization trick. Gradients can flow from the output loss, back through the decoder and into the encoder. From Figure 4 of \citep {Doersch2016vae}. Used with kind permission of Carl Doersch. \relax }}{675}{figure.caption.429}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.24}{\ignorespaces Reconstructing MNIST digits using a 20 dimensional latent space. Top row: input images. Bottom row: reconstructions. (a) VAE. Generated by \href {https://probml.github.io/notebooks\#vae\_mnist\_conv\_lightning.ipynb}{vae\_mnist\_conv\_lightning.ipynb}. (b) Deterministic AE. Generated by \href {https://probml.github.io/notebooks\#ae\_mnist\_conv.ipynb}{ae\_mnist\_conv.ipynb}. \relax }}{676}{figure.caption.430}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.25}{\ignorespaces Sampling MNIST digits using a 20 dimensional latent space. (a) VAE. Generated by \href {https://probml.github.io/notebooks\#vae\_mnist\_conv\_lightning.ipynb}{vae\_mnist\_conv\_lightning.ipynb}. (b) Deterministic AE. Generated by \href {https://probml.github.io/notebooks\#ae\_mnist\_conv.ipynb}{ae\_mnist\_conv.ipynb}. \relax }}{676}{figure.caption.431}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.26}{\ignorespaces tSNE projection of a 20 dimensional latent space. (a) VAE. Generated by \href {https://probml.github.io/notebooks\#vae\_mnist\_conv\_lightning.ipynb}{vae\_mnist\_conv\_lightning.ipynb}. (b) Deterministic AE. Generated by \href {https://probml.github.io/notebooks\#ae\_mnist\_conv.ipynb}{ae\_mnist\_conv.ipynb}. \relax }}{677}{figure.caption.432}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.27}{\ignorespaces Linear interpolation between the left and right images in a 20 dimensional latent space. (a) VAE. (b) Deterministic AE. Generated by \href {https://probml.github.io/notebooks\#vae\_mnist\_conv\_lightning.ipynb}{vae\_mnist\_conv\_lightning.ipynb}. \relax }}{677}{figure.caption.433}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.28}{\ignorespaces Illustration of the tangent space and tangent vectors at two different points on a 2d curved manifold. From Figure 1 of \citep {Bronstein2017}. Used with kind permission of Michael Bronstein. \relax }}{677}{figure.caption.434}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.29}{\ignorespaces Illustration of the image manifold. (a) An image of the digit 6 from the USPS dataset, of size $64 \times 57 = 3,648$. (b) A random sample from the space $\{0,1\}^{3648}$ reshaped as an image. (c) A dataset created by rotating the original image by one degree 360 times. We project this data onto its first two principal components, to reveal the underlying 2d circular manifold. From Figure 1 of \citep {Lawrence2012}. Used with kind permission of Neil Lawrence. \relax }}{678}{figure.caption.435}% 
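The rotation experiment can be reproduced approximately as follows; as an assumption we rotate an $8 \times 8$ sklearn digit rather than the USPS image used in the figure.
\begin{verbatim}
# Minimal sketch of the rotation-manifold experiment (data is an assumption).
import numpy as np
from scipy.ndimage import rotate
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA

img = load_digits().images[0]                            # an 8x8 digit
X = np.stack([rotate(img, angle, reshape=False, order=1).ravel()
              for angle in range(360)])                  # 360 rotated copies

Z = PCA(n_components=2).fit_transform(X)                 # ~circular 2d manifold
\end{verbatim}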
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.30}{\ignorespaces Illustration of some data generated from low-dimensional manifolds. (a) The 2d Swiss-roll manifold embedded into 3d. Generated by \href {https://probml.github.io/notebooks\#manifold\_swiss\_sklearn.ipynb}{manifold\_swiss\_sklearn.ipynb}. (b) Sample of some UCI digits, which have size $8 \times 8 = 64$. Generated by \href {https://probml.github.io/notebooks\#manifold\_digits\_sklearn.ipynb}{manifold\_digits\_sklearn.ipynb}. \relax }}{679}{figure.caption.437}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.31}{\ignorespaces Metric MDS applied to (a) Swiss roll. Generated by \href {https://probml.github.io/notebooks\#manifold\_swiss\_sklearn.ipynb}{manifold\_swiss\_sklearn.ipynb}. (b) UCI digits. Generated by \href {https://probml.github.io/notebooks\#manifold\_digits\_sklearn.ipynb}{manifold\_digits\_sklearn.ipynb}. \relax }}{681}{figure.caption.438}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.32}{\ignorespaces (a) If we measure distances along the manifold, we find $d(1,6) > d(1,4)$, whereas if we measure in ambient space, we find $d(1,6) < d(1,4)$. The plot at the bottom shows the underlying 1d manifold. (b) The $K$-nearest neighbors graph for some datapoints; the red path is the shortest path between A and B on this graph. From \citep {HintonEmbedding}. Used with kind permission of Geoff Hinton. \relax }}{682}{figure.caption.439}% 
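A minimal sketch of approximating geodesic (along-manifold) distances by shortest paths on a $K$-nearest-neighbor graph, the construction used by Isomap; the dataset and $K$ are assumptions.
\begin{verbatim}
# Minimal sketch: geodesic distances as shortest paths on a K-NN graph.
from scipy.sparse.csgraph import shortest_path
from sklearn.datasets import make_swiss_roll
from sklearn.neighbors import kneighbors_graph

X, _ = make_swiss_roll(n_samples=1000, random_state=0)
G = kneighbors_graph(X, n_neighbors=10, mode='distance')   # weighted K-NN graph
D_geo = shortest_path(G, method='D', directed=False)       # graph geodesics
\end{verbatim}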
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.33}{\ignorespaces Isomap applied to (a) Swiss roll. Generated by \href {https://probml.github.io/notebooks\#manifold\_swiss\_sklearn.ipynb}{manifold\_swiss\_sklearn.ipynb}. (b) UCI digits. Generated by \href {https://probml.github.io/notebooks\#manifold\_digits\_sklearn.ipynb}{manifold\_digits\_sklearn.ipynb}. \relax }}{683}{figure.caption.440}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.34}{\ignorespaces (a) Noisy version of Swiss roll data. We perturb each point by adding $\mathcal {N}(0, 0.5^2)$ noise. (b) Results of Isomap applied to this data. Generated by \href {https://probml.github.io/notebooks\#manifold\_swiss\_sklearn.ipynb}{manifold\_swiss\_sklearn.ipynb}. \relax }}{683}{figure.caption.441}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.35}{\ignorespaces Visualization of the first 8 kernel principal component basis functions derived from some 2d data. We use an RBF kernel with $\sigma ^2=0.1$. Generated by \href {https://probml.github.io/notebooks\#kpcaScholkopf.ipynb}{kpcaScholkopf.ipynb}. \relax }}{684}{figure.caption.442}% 
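A minimal sketch of kernel PCA with an RBF kernel using sklearn; assuming the kernel is $\exp(-\|x-x'\|^2/(2\sigma^2))$, $\sigma^2=0.1$ corresponds to sklearn's \texttt{gamma}$=5$. The toy 2d data below is an assumption, not the figure's dataset.
\begin{verbatim}
# Minimal sketch of kernel PCA with an RBF kernel (gamma = 1 / (2 * sigma^2)).
import numpy as np
from sklearn.decomposition import KernelPCA

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 2)) * [1.0, 0.3]        # some 2d data
kpca = KernelPCA(n_components=8, kernel='rbf', gamma=5.0)
Z = kpca.fit_transform(X)                             # first 8 kernel PC scores
\end{verbatim}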
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.36}{\ignorespaces Kernel PCA applied to (a) Swiss roll. Generated by \href {https://probml.github.io/notebooks\#manifold\_swiss\_sklearn.ipynb}{manifold\_swiss\_sklearn.ipynb}. (b) UCI digits. Generated by \href {https://probml.github.io/notebooks\#manifold\_digits\_sklearn.ipynb}{manifold\_digits\_sklearn.ipynb}. \relax }}{685}{figure.caption.443}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.37}{\ignorespaces LLE applied to (a) Swiss roll. Generated by \href {https://probml.github.io/notebooks\#manifold\_swiss\_sklearn.ipynb}{manifold\_swiss\_sklearn.ipynb}. (b) UCI digits. Generated by \href {https://probml.github.io/notebooks\#manifold\_digits\_sklearn.ipynb}{manifold\_digits\_sklearn.ipynb}. \relax }}{686}{figure.caption.444}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.38}{\ignorespaces Laplacian eigenmaps applied to (a) Swiss roll. Generated by \href {https://probml.github.io/notebooks\#manifold\_swiss\_sklearn.ipynb}{manifold\_swiss\_sklearn.ipynb}. (b) UCI digits. Generated by \href {https://probml.github.io/notebooks\#manifold\_digits\_sklearn.ipynb}{manifold\_digits\_sklearn.ipynb}. \relax }}{687}{figure.caption.445}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.39}{\ignorespaces Illustration of the Laplacian matrix derived from an undirected graph. From \url {https://en.wikipedia.org/wiki/Laplacian_matrix}. Used with kind permission of Wikipedia author AzaToth. \relax }}{688}{figure.caption.446}% 
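For reference, the unnormalized graph Laplacian of an undirected graph is $L = D - A$, where $A$ is the adjacency matrix and $D$ the diagonal degree matrix; below is a minimal sketch on a made-up edge list, not the graph in the figure.
\begin{verbatim}
# Minimal sketch of the (unnormalized) graph Laplacian L = D - A.
import numpy as np

edges = [(0, 1), (0, 2), (1, 2), (2, 3)]   # a small made-up undirected graph
n = 4
A = np.zeros((n, n))
for i, j in edges:
    A[i, j] = A[j, i] = 1                  # adjacency matrix
D = np.diag(A.sum(axis=1))                 # degree matrix
L = D - A                                  # Laplacian: rows sum to zero, L is PSD
\end{verbatim}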
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.40}{\ignorespaces Illustration of a (positive) function defined on a graph. From Figure 1 of \citep {Shuman2013}. Used with kind permission of Pascal Frossard. \relax }}{688}{figure.caption.447}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.41}{\ignorespaces tSNE applied to (a) Swiss roll. Generated by \href {https://probml.github.io/notebooks\#manifold\_swiss\_sklearn.ipynb}{manifold\_swiss\_sklearn.ipynb}. (b) UCI digits. Generated by \href {https://probml.github.io/notebooks\#manifold\_digits\_sklearn.ipynb}{manifold\_digits\_sklearn.ipynb}. \relax }}{692}{figure.caption.448}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.42}{\ignorespaces Illustration of the effect of changing the perplexity parameter when t-SNE is applied to some 2d data. From \citep {Wattenberg2016how}. See \url {http://distill.pub/2016/misread-tsne} for an animated version of these figures. Used with kind permission of Martin Wattenberg. \relax }}{692}{figure.caption.449}% 
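A minimal sketch of varying the perplexity with sklearn's TSNE on synthetic blobs; the data and perplexity values are assumptions, not those used in the figure.
\begin{verbatim}
# Minimal sketch: small perplexity focuses on local structure, large on global.
from sklearn.datasets import make_blobs
from sklearn.manifold import TSNE

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
embeddings = {p: TSNE(perplexity=p, random_state=0).fit_transform(X)
              for p in (5, 30, 100)}       # small vs large neighbourhoods
\end{verbatim}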
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.43}{\ignorespaces Illustration of the cosine similarity between a query vector ${\bm {q}}$ and two document vectors ${\bm {d}}_1$ and ${\bm {d}}_2$. Since angle $\alpha $ is less than angle $\theta $, we see that the query is more similar to document 1. From \url {https://en.wikipedia.org/wiki/Vector_space_model}. Used with kind permission of Wikipedia author Riclas. \relax }}{694}{figure.caption.450}% 
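A minimal sketch of the cosine-similarity comparison in the figure; the query and document vectors below are made-up examples.
\begin{verbatim}
# Minimal sketch: cosine similarity between a query q and two documents.
import numpy as np

def cosine(a, b):
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))

q  = np.array([1.0, 1.0, 0.0])
d1 = np.array([2.0, 1.5, 0.1])
d2 = np.array([0.1, 0.2, 3.0])
print(cosine(q, d1), cosine(q, d2))   # the larger value is the more similar doc
\end{verbatim}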
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.44}{\ignorespaces Illustration of the word2vec model with a window size of 2. (a) CBOW version. (b) Skip-gram version. \relax }}{696}{figure.caption.451}% 
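A minimal sketch of how skip-gram training pairs are generated with a window of size 2; the toy sentence is an assumption.
\begin{verbatim}
# Minimal sketch: (target, context) pairs for the skip-gram objective.
tokens = "the quick brown fox jumps over the lazy dog".split()
window = 2
pairs = []
for i, target in enumerate(tokens):
    for j in range(max(0, i - window), min(len(tokens), i + window + 1)):
        if j != i:
            pairs.append((target, tokens[j]))   # predict context from target
# CBOW instead predicts the target from the averaged context words.
\end{verbatim}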
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {20.45}{\ignorespaces Visualization of arithmetic operations in word2vec embedding space. From \url {https://www.tensorflow.org/tutorials/representation/word2vec}. \relax }}{699}{figure.caption.452}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.1}{\ignorespaces Three clusters with labeled objects inside. \relax }}{704}{figure.caption.453}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.2}{\ignorespaces (a) An example of single link clustering using city block distance. Pairs (1,3) and (4,5) are both distance 1 apart, so they get merged first. (b) The resulting dendrogram. Adapted from Figure 7.5 of \citep {Alpaydin04}. Generated by \href {https://probml.github.io/notebooks\#agglomDemo.ipynb}{agglomDemo.ipynb}. \relax }}{706}{figure.caption.454}% 
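A minimal scipy sketch (not agglomDemo.ipynb) of single-link clustering with city-block distance; the five points are made up so that, in 1-based indexing, pairs (1,3) and (4,5) are distance 1 apart and hence merge first.
\begin{verbatim}
# Minimal sketch: single-link agglomerative clustering + dendrogram.
import matplotlib.pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage

X = np.array([[0.0, 0.0], [1.0, 2.0], [1.0, 0.0], [4.0, 4.0], [4.0, 5.0]])
Z = linkage(X, method='single', metric='cityblock')   # closest pairs merge first
dendrogram(Z)                                         # plot the merge tree
plt.show()
\end{verbatim}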
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.3}{\ignorespaces Illustration of (a) Single linkage. (b) Complete linkage. (c) Average linkage. \relax }}{706}{figure.caption.456}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.4}{\ignorespaces Hierarchical clustering of yeast gene expression data. (a) Single linkage. (b) Complete linkage. (c) Average linkage. Generated by \href {https://probml.github.io/notebooks\#hclust\_yeast\_demo.ipynb}{hclust\_yeast\_demo.ipynb}. \relax }}{707}{figure.caption.457}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.5}{\ignorespaces (a) Some yeast gene expression data plotted as a heat map. (b) Same data plotted as a time series. Generated by \href {https://probml.github.io/notebooks\#yeast\_data\_viz.ipynb}{yeast\_data\_viz.ipynb}. \relax }}{709}{figure.caption.458}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.6}{\ignorespaces Hierarchical clustering applied to the yeast gene expression data. (a) The rows are permuted according to a hierarchical clustering scheme (average link agglomerative clustering), in order to bring similar rows close together. (b) 16 clusters induced by cutting the average linkage tree at a certain height. Generated by \href {https://probml.github.io/notebooks\#hclust\_yeast\_demo.ipynb}{hclust\_yeast\_demo.ipynb}. \relax }}{709}{figure.caption.459}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.7}{\ignorespaces Illustration of K-means clustering in 2d. We show the result of using two different random seeds. Adapted from Figure 9.5 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#kmeans\_voronoi.ipynb}{kmeans\_voronoi.ipynb}. \relax }}{711}{figure.caption.460}% 
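A minimal sketch of the seed sensitivity of K-means using sklearn on synthetic blobs (not the kmeans\_voronoi.ipynb data); with a single initialization, different seeds can converge to different local optima.
\begin{verbatim}
# Minimal sketch: K-means run with two different random seeds.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=5, random_state=0)
for seed in (0, 1):
    km = KMeans(n_clusters=5, n_init=1, random_state=seed).fit(X)
    print(seed, km.inertia_)      # different seeds can reach different optima
\end{verbatim}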
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.8}{\ignorespaces Clustering the yeast data from \cref {fig:yeast} using K-means clustering with $K=16$. (a) Visualizing all the time series assigned to each cluster. (b) Visualizing the 16 cluster centers as prototypical time series. Generated by \href {https://probml.github.io/notebooks\#kmeans\_yeast\_demo.ipynb}{kmeans\_yeast\_demo.ipynb}. \relax }}{711}{figure.caption.461}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.9}{\ignorespaces An image compressed using vector quantization with a codebook of size $K$. (a) $K=2$. (b) $K=4$. (c) Original uncompressed image. Generated by \href {https://probml.github.io/notebooks\#vqDemo.ipynb}{vqDemo.ipynb}. \relax }}{712}{figure.caption.462}% 
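A minimal sketch of vector quantization for image compression: cluster the pixel values with K-means and replace each pixel by its codebook entry. As an assumption, we use an sklearn sample image as a stand-in for the book's photo.
\begin{verbatim}
# Minimal sketch: compress an RGB image with a K-entry color codebook.
from sklearn.cluster import KMeans
from sklearn.datasets import load_sample_image

img = load_sample_image('china.jpg') / 255.0          # H x W x 3 in [0, 1]
pixels = img.reshape(-1, 3)
K = 4
km = KMeans(n_clusters=K, n_init=4, random_state=0).fit(pixels)
compressed = km.cluster_centers_[km.labels_].reshape(img.shape)
\end{verbatim}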
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.10}{\ignorespaces Illustration of batch vs mini-batch K-means clustering on the 2d data from \cref {fig:kmeansVoronoi}. Left: distortion vs $K$. Right: training time vs $K$. Adapted from Figure 9.6 of \citep {Geron2019}. Generated by \href {https://probml.github.io/notebooks\#kmeans\_minibatch.ipynb}{kmeans\_minibatch.ipynb}. \relax }}{714}{figure.caption.464}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.11}{\ignorespaces Performance of K-means and GMM vs $K$ on the 2d dataset from \cref {fig:kmeansVoronoi}. (a) Distortion on validation set vs $K$. Generated by \href {https://probml.github.io/notebooks\#kmeans\_silhouette.ipynb}{kmeans\_silhouette.ipynb}. (b) BIC vs $K$. Generated by \href {https://probml.github.io/notebooks\#gmm\_2d.ipynb}{gmm\_2d.ipynb}. (c) Silhouette score vs $K$. Generated by \href {https://probml.github.io/notebooks\#kmeans\_silhouette.ipynb}{kmeans\_silhouette.ipynb}. \relax }}{715}{figure.caption.465}% 
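A minimal sketch of computing these three model-selection signals on synthetic data (the figure's dataset is different): the K-means distortion is the fitted model's inertia, the BIC comes from a GMM, and the silhouette score from sklearn.metrics.
\begin{verbatim}
# Minimal sketch: distortion, BIC, and silhouette score as a function of K.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=500, centers=5, random_state=0)
for K in range(2, 9):
    km = KMeans(n_clusters=K, random_state=0).fit(X)
    bic = GaussianMixture(n_components=K, random_state=0).fit(X).bic(X)
    sil = silhouette_score(X, km.labels_)
    print(K, round(km.inertia_, 1), round(bic, 1), round(sil, 3))
\end{verbatim}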
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.12}{\ignorespaces Voronoi diagrams for K-means for different $K$ on the 2d dataset from \cref {fig:kmeansVoronoi}. Generated by \href {https://probml.github.io/notebooks\#kmeans\_silhouette.ipynb}{kmeans\_silhouette.ipynb}. \relax }}{716}{figure.caption.466}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.13}{\ignorespaces Silhouette diagrams for K-means for different $K$ on the 2d dataset from \cref {fig:kmeansVoronoi}. Generated by \href {https://probml.github.io/notebooks\#kmeans\_silhouette.ipynb}{kmeans\_silhouette.ipynb}. \relax }}{717}{figure.caption.467}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.14}{\ignorespaces Some data in 2d fit using a GMM with $K=5$ components. Left column: marginal distribution $p({\bm {x}})$. Right column: visualization of each mixture distribution, and the hard assignment of points to their most likely cluster. (a-b) Full covariance. (c-d) Tied full covariance. (e-f) Diagonal covariance. (g-h) Spherical covariance. Color coding is arbitrary. Generated by \href {https://probml.github.io/notebooks\#gmm\_2d.ipynb}{gmm\_2d.ipynb}. \relax }}{719}{figure.caption.468}% 
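A minimal sketch of fitting the four covariance structures via sklearn's \texttt{covariance\_type} argument; the synthetic blobs stand in for the figure's data.
\begin{verbatim}
# Minimal sketch: GMMs with full, tied, diagonal, and spherical covariances.
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=500, centers=5, random_state=0)
for cov in ('full', 'tied', 'diag', 'spherical'):
    gmm = GaussianMixture(n_components=5, covariance_type=cov,
                          random_state=0).fit(X)
    print(cov, gmm.predict(X)[:10])   # hard assignments to most likely cluster
\end{verbatim}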
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.15}{\ignorespaces Some 1d data, with a kernel density estimate superimposed. Adapted from Figure 6.2 of \citep {Martin2018}. Generated by \href {https://probml.github.io/notebooks\#gmm\_identifiability\_pymc3.ipynb}{gmm\_identifiability\_pymc3.ipynb}. \relax }}{720}{figure.caption.469}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.16}{\ignorespaces Illustration of the label switching problem when performing posterior inference for the parameters of a GMM. We show a KDE estimate of the posterior marginals derived from 1000 samples from 4 HMC chains. (a) Unconstrained model. Posterior is symmetric. (b) Constrained model, where we add a penalty to ensure $\mu _0 < \mu _1$. Adapted from Figure 6.6-6.7 of \citep {Martin2018}. Generated by \href {https://probml.github.io/notebooks\#gmm\_identifiability\_pymc3.ipynb}{gmm\_identifiability\_pymc3.ipynb}. \relax }}{720}{figure.caption.470}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.17}{\ignorespaces Fitting GMMs with different numbers of clusters $K$ to the data in \cref {fig:gmmIdentifiabilityData}. Black solid line is KDE fit. Solid blue line is posterior mean; faint blue lines are posterior samples. Dotted lines show the individual Gaussian mixture components, evaluated by plugging in their posterior mean parameters. Adapted from Figure 6.8 of \citep {Martin2018}. Generated by \href {https://probml.github.io/notebooks\#gmm\_chooseK\_pymc3.ipynb}{gmm\_chooseK\_pymc3.ipynb}. \relax }}{721}{figure.caption.471}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.18}{\ignorespaces WAIC scores for the different GMMs. The empty circle is the posterior mean WAIC score for each model, and the black lines represent the standard error of the mean. The solid circle is the in-sample deviance of each model, i.e., the unpenalized log-likelihood. The dashed vertical line corresponds to the maximum WAIC value. The gray triangle is the difference in WAIC score for that model compared to the best model. Adapted from Figure 6.10 of \citep {Martin2018}. Generated by \href {https://probml.github.io/notebooks\#gmm\_chooseK\_pymc3.ipynb}{gmm\_chooseK\_pymc3.ipynb}. \relax }}{722}{figure.caption.472}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.19}{\ignorespaces Results of clustering some data. (a) K-means. (b) Spectral clustering. Generated by \href {https://probml.github.io/notebooks\#spectral\_clustering\_demo.ipynb}{spectral\_clustering\_demo.ipynb}. \relax }}{725}{figure.caption.473}% 
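A minimal sketch contrasting K-means with spectral clustering on concentric circles, where only the graph-based method recovers the two rings; the dataset is an assumption, not the one in the figure.
\begin{verbatim}
# Minimal sketch: K-means vs spectral clustering on non-convex clusters.
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.datasets import make_circles

X, _ = make_circles(n_samples=400, factor=0.4, noise=0.05, random_state=0)
labels_km = KMeans(n_clusters=2, random_state=0).fit_predict(X)
labels_sc = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                               random_state=0).fit_predict(X)
\end{verbatim}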
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.20}{\ignorespaces Illustration of biclustering. We show 5 of the 12 organism clusters, and 6 of the 33 feature clusters. The original data matrix is shown, partitioned according to the discovered clusters. From Figure 3 of \citep {Kemp06}. Used with kind permission of Charles Kemp. \relax }}{726}{figure.caption.474}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.21}{\ignorespaces (a) Example of biclustering. Each row is assigned to a unique cluster, and each column is assigned to a unique cluster. (b) Example of multi-clustering using a nested partition model. The rows can belong to different clusters depending on which subset of column features we are looking at. \relax }}{727}{figure.caption.475}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {21.22}{\ignorespaces MAP estimate produced by the crosscat system when applied to a binary data matrix of animals (rows) by features (columns). See text for details. From Figure 7 of \citep {Shafto06}. Used with kind permission of Vikash Mansingkha. \relax }}{728}{figure.caption.476}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {22.1}{\ignorespaces Example of a relational dataset represented as a sparse matrix (left) or a sparse bipartite graph (right). Values corresponding to empty cells (missing edges) are unknown. Rows 3 and 4 are similar to each other, indicating that users 3 and 4 might have similar preferences, so we can use the data from user 3 to predict user 4's preferences. However, user 1 appears to have quite different preferences, giving low ratings to all items. For user 2, we have very little observed data, so it is hard to make reliable predictions. \relax }}{730}{figure.caption.477}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {22.2}{\ignorespaces Visualization of the first two latent movie factors estimated from the Netflix challenge data. Each movie $j$ is plotted at the location specified by ${\bm {v}}_j$. See text for details. From Figure 3 of \citep {Koren09mf}. Used with kind permission of Yehuda Koren. \relax }}{732}{figure.caption.478}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {22.3}{\ignorespaces (a) A fragment of the observed ratings matrix from the MovieLens-1M dataset. (b) Predictions using SVD with 50 latent components. Generated by \href {https://probml.github.io/notebooks\#matrix\_factorization\_recommender.ipynb}{matrix\_factorization\_recommender.ipynb}. \relax }}{733}{figure.caption.479}% 
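A minimal sketch of the rank-50 SVD reconstruction idea on a dense random stand-in for the ratings matrix; the real MovieLens data is sparse, and the notebook handles missing entries differently, so this only illustrates the low-rank prediction step.
\begin{verbatim}
# Minimal sketch: rank-50 reconstruction of a (toy, dense) ratings matrix.
import numpy as np

rng = np.random.default_rng(0)
R = rng.integers(1, 6, size=(600, 400)).astype(float)   # users x movies (toy)
U, s, Vt = np.linalg.svd(R - R.mean(), full_matrices=False)
k = 50
R_hat = R.mean() + U[:, :k] @ np.diag(s[:k]) @ Vt[:k]   # rank-50 predictions
\end{verbatim}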
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {22.4}{\ignorespaces (a) Top 10 movies (from a list of 69) that user ``837'' has already highly rated. (b) Top 10 predictions (from a list of 3637) from the algorithm. Generated by \href {https://probml.github.io/notebooks\#matrix\_factorization\_recommender.ipynb}{matrix\_factorization\_recommender.ipynb}. \relax }}{734}{figure.caption.480}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {22.5}{\ignorespaces Illustration of the neural matrix factorization model. From Figure 2 of \citep {NCF}. Used with kind permission of Xiangnan He. \relax }}{737}{figure.caption.481}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {22.6}{\ignorespaces Illustration of a design matrix for a movie recommender system, where we show the ids of the user and movie, as well as other side information. From Figure 1 of \citep {Rendle12}. Used with kind permission of Steffen Rendle. \relax }}{738}{figure.caption.482}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {23.1}{\ignorespaces An illustration of Euclidean vs. non-Euclidean graphs. Used with permission from \cite {chami2020machine}.\relax }}{742}{figure.caption.483}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {23.2}{\ignorespaces Illustration of the \textsc {GraphEDM} framework from \citet {chami2020machine}. Depending on the supervision available, methods use some or all of the branches. In particular, unsupervised methods do not leverage label decoding for training and only optimize the similarity decoder (lower branch). On the other hand, semi-supervised and supervised methods leverage the additional supervision to learn model parameters (upper branch). Reprinted with permission from \cite {chami2020machine}. \relax }}{743}{figure.caption.484}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {23.3}{\ignorespaces Shallow embedding methods. The encoder is a simple embedding look-up and the graph structure is only used in the loss function. Reprinted with permission from \cite {chami2020machine}. \relax }}{744}{figure.caption.485}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {23.4}{\ignorespaces An overview of the pipeline for random-walk graph embedding methods. Reprinted with permission from \citep {godec_2018}.\relax }}{747}{figure.caption.486}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {23.5}{\ignorespaces Illustration of the GraphSAGE model. Reprinted with permission from \citep {hamilton2017inductive}.\relax }}{752}{figure.caption.487}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {23.6}{\ignorespaces Euclidean (left) and hyperbolic (right) embeddings of a tree graph. Hyperbolic embeddings learn natural hierarchies in the embedding space (depth indicated by color). Reprinted with permission from \citep {chami2019hyperbolic}.\relax }}{754}{figure.caption.488}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {23.7}{\ignorespaces Unsupervised graph neural networks. Graph structure and input features are mapped to low-dimensional embeddings using a graph neural network encoder. Embeddings are then decoded to compute a graph regularization loss (unsupervised). Reprinted with permission from \cite {chami2020machine}. \relax }}{754}{figure.caption.489}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {23.8}{\ignorespaces A graph representation of some financial transactions. Adapted from \url {http://pgql-lang.org/spec/1.2/}. \relax }}{758}{figure.caption.490}% 
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {23.9}{\ignorespaces Structurally similar molecules do not necessarily have similar odor descriptors. (A) Lyral, the reference molecule. (B) Molecules with similar structure can share similar odor descriptors. (C) However, a small structural change can render the molecule odorless. (D) Further, large structural changes can leave the odor of the molecule largely unchanged. From Figure 1 of \citep {SanchezLengeling2019}, originally from \citep {Ohloff2012}. Used with kind permission of Benjamin Sanchez-Lengeling. \relax }}{760}{figure.caption.491}% 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }