In [ ]:
# Author: Meduri Venkata Shivaditya
import math

import matplotlib.pyplot as plt
import numpy as np
from scipy.special import logsumexp

"""
z = W x + µ + E

The equation above is the latent variable model relating a d-dimensional data
vector z to a corresponding q-dimensional latent variable x (q < d), with
isotropic noise E ∼ N(0, σ2 I).

z      : observed data vector
x      : latent variable
W      : latent-to-observation matrix
µ      : centre of a cluster
E      : isotropic observation noise

This code implements the generative model of a mixture of PPCA. Given the
number of clusters, the data dimension D and the latent dimension L, we
generate the data; for every data point n we sample the latent xn from a
Gaussian prior, pass it through the Wk matrix of its cluster (which maps the
L-dimensional subspace to the D-dimensional visible space) and add noise.
Using the expectation-maximization (EM) algorithm we estimate the parameters
and then plot the principal-component vectors.
"""


def mixture_ppca_parameter_initialization(data, n_clusters, latent_dim, n_iterations):
    """
    The k-means algorithm is used to determine the centres. The priors are
    computed from the proportion of examples belonging to each cluster. The
    covariance matrices are calculated as the sample covariance of the points
    associated with (i.e. closest to) the corresponding centres. For a mixture
    of PPCA model, the PPCA decomposition is calculated for the points closest
    to a given centre. This initialisation can be used as the starting point
    for training the model using the EM algorithm.

    W      : latent-to-observation matrices
    µ/mu   : centres of the clusters
    pi     : proportion of data in each cluster
    sigma2 : noise variance of each cluster
    covars : covariance of the points associated with (i.e. closest to) the
             corresponding centres
    """
    n_datapts, data_dim = data.shape

    # randomly choose distinct initial centres for the clusters
    init_centers = np.random.randint(0, n_datapts, n_clusters)
    while len(np.unique(init_centers)) != n_clusters:
        init_centers = np.random.randint(0, n_datapts, n_clusters)

    mu = data[init_centers, :]
    distance_square = np.zeros((n_datapts, n_clusters))
    clusters = np.zeros(n_datapts, dtype=np.int32)

    # run k-means iterations to assign points to cluster centres
    for k in range(n_iterations):
        # assign clusters
        for c in range(n_clusters):
            distance_square[:, c] = np.power(data - mu[c, :], 2).sum(1)
        clusters = np.argmin(distance_square, axis=1)
        # compute distortion
        distmin = distance_square[range(n_datapts), clusters]
        # compute new centres
        for c in range(n_clusters):
            mu[c, :] = data[clusters == c, :].mean(0)

    # parameter initialization
    pi = np.zeros(n_clusters)  # sums to 1
    W = np.zeros((n_clusters, data_dim, latent_dim))
    sigma2 = np.zeros(n_clusters)
    for c in range(n_clusters):
        W[c, :, :] = np.random.randn(data_dim, latent_dim)
        pi[c] = (clusters == c).sum() / n_datapts
        sigma2[c] = (distmin[clusters == c]).mean() / data_dim

    covars = np.zeros(n_clusters)
    for i in range(n_clusters):
        covars[i] = (np.var(data[clusters == i, 0]) + np.var(data[clusters == i, 1])) / 2

    return pi, mu, W, sigma2, covars, clusters
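

# --- Illustrative sketch (not part of the original demo) ---------------------
# The module docstring above describes the generative process of a mixture of
# PPCA: pick a cluster k with probability pi[k], draw a latent x ~ N(0, I_L),
# and emit z = W_k x + mu_k + eps with eps ~ N(0, sigma2[k] * I_D). The helper
# below is a minimal, hypothetical sampler for that process; it is not used by
# the EM code and exists only to make the generative story concrete.
def sample_mixture_ppca(pi, mu, W, sigma2, n_samples, seed=None):
    """Draw samples from a mixture-of-PPCA model (illustrative only)."""
    rng = np.random.default_rng(seed)
    n_clusters, data_dim, latent_dim = W.shape
    # choose a mixture component for every sample
    ks = rng.choice(n_clusters, size=n_samples, p=pi)
    # latent variables have a standard Gaussian prior
    x = rng.standard_normal((n_samples, latent_dim))
    z = np.empty((n_samples, data_dim))
    for n, k in enumerate(ks):
        noise = np.sqrt(sigma2[k]) * rng.standard_normal(data_dim)
        z[n] = W[k] @ x[n] + mu[k] + noise
    return z, ks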


def mixture_ppca_expectation_maximization(data, pi, mu, W, sigma2, niter):
    """
    We can find p(latent | data) under the assumption that the data are Gaussian.

    z          : observed data vector
    x          : latent variable
    W          : latent-to-observation matrix
    µ/mu       : centres of the clusters
    d          : data dimension
    q          : latent dimension
    σ2/sigma2  : noise variance
    π/pi       : cluster proportions

    p(z|x) = (2πσ2)^(−d/2) * exp(−||z − Wx − µ||² / (2σ2))
    p(z)   = ∫ p(z|x) p(x) dx

    Solving for p(z), and then using the result, we can find p(x|z), through
    which we obtain the log-likelihood function

        log_likelihood = −N/2 * (d ln(2π) + ln|C| + tr(C⁻¹ S))

    where C = W Wᵀ + σ2 I is the model covariance and S is the sample
    covariance of the data. We can develop an iterative EM algorithm to
    optimise all of the model parameters µ, W and σ2. If Rn,i = p(i|zn) is the
    posterior responsibility of mixture i for generating data point zn, given by

        Rn,i = p(zn|i) * πi / p(zn)

    then, using EM, the parameter estimates are

        µi = Σn Rn,i * zn / Σn Rn,i
        Si = 1/(πi*N) * Σn Rn,i * (zn − µi)(zn − µi)'

    and Si is used in turn to estimate W and σ2. For more information on the
    EM algorithm for a mixture of PPCA, see "Mixtures of Probabilistic
    Principal Component Analysers" by Michael E. Tipping and Christopher M.
    Bishop, pages 5-10: http://www.miketipping.com/papers/met-mppca.pdf
    """
    n_datapts, data_dim = data.shape
    n_clusters = len(sigma2)
    _, latent_dim = W[0].shape

    M = np.zeros((n_clusters, latent_dim, latent_dim))
    Minv = np.zeros((n_clusters, latent_dim, latent_dim))
    Cinv = np.zeros((n_clusters, data_dim, data_dim))
    logR = np.zeros((n_datapts, n_clusters))
    R = np.zeros((n_datapts, n_clusters))
    log_likelihood = np.zeros(niter)

    for i in range(niter):
        print(".", end="")
        for c in range(n_clusters):
            # M = σ2 I + Wᵀ W
            M[c, :, :] = sigma2[c] * np.eye(latent_dim) + np.dot(W[c, :, :].T, W[c, :, :])
            Minv[c, :, :] = np.linalg.inv(M[c, :, :])

            # C⁻¹ via the matrix-inversion lemma: C⁻¹ = (I − W M⁻¹ Wᵀ) / σ2
            Cinv[c, :, :] = (
                np.eye(data_dim) - np.dot(np.dot(W[c, :, :], Minv[c, :, :]), W[c, :, :].T)
            ) / sigma2[c]

            # R_ni (unnormalised log responsibilities)
            deviation_from_center = data - mu[c, :]
            logR[:, c] = (
                np.log(pi[c])
                + 0.5 * np.log(
                    np.linalg.det(np.eye(data_dim) - np.dot(np.dot(W[c, :, :], Minv[c, :, :]), W[c, :, :].T))
                )
                - 0.5 * data_dim * np.log(sigma2[c])
                - 0.5 * (deviation_from_center * np.dot(deviation_from_center, Cinv[c, :, :].T)).sum(1)
            )

        # Normalise the responsibilities in log space with the log-sum-exp trick
        # (see Section 2.5.4 of "Probabilistic Machine Learning: An Introduction"
        # by Kevin P. Murphy). scipy's logsumexp already subtracts the maximum
        # internally, so the explicit myMax = logR.max(...) shift used in some
        # implementations is not needed here.
        log_likelihood[i] = (
            (logsumexp(logR, axis=1)).sum(axis=0)
            - n_datapts * data_dim * np.log(2 * math.pi) / 2.0
        )

        logR = logR - np.reshape(logsumexp(logR, axis=1), (n_datapts, 1))
        logpi = logsumexp(logR, axis=0) - np.log(n_datapts)
        pi = np.exp(logpi)
        R = np.exp(logR)

        for c in range(n_clusters):
            mu[c, :] = (R[:, c].reshape((n_datapts, 1)) * data).sum(axis=0) / R[:, c].sum()
            deviation_from_center = data - mu[c, :].reshape((1, data_dim))

            # Si = 1/(πi*N) * Σn Rn,i (zn − µi)(zn − µi)'
            # Si is used to estimate W and σ2; the code computes Si @ W directly
            # (a D x L matrix), which is all that the updates below require.
            Si = (1 / (pi[c] * n_datapts)) * np.dot(
                (R[:, c].reshape((n_datapts, 1)) * deviation_from_center).T,
                np.dot(deviation_from_center, W[c, :, :]),
            )

            Wnew = np.dot(
                Si,
                np.linalg.inv(sigma2[c] * np.eye(latent_dim) + np.dot(np.dot(Minv[c, :, :], W[c, :, :].T), Si)),
            )
            sigma2[c] = (1 / data_dim) * (
                (R[:, c].reshape(n_datapts, 1) * np.power(deviation_from_center, 2)).sum() / (n_datapts * pi[c])
                - np.trace(np.dot(np.dot(Si, Minv[c, :, :]), Wnew.T))
            )
            W[c, :, :] = Wnew

    return pi, mu, W, sigma2, log_likelihood
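

# --- Illustrative sketch (not part of the original demo) ---------------------
# The EM docstring above defines the responsibility Rn,i = p(i | zn) =
# p(zn | i) * πi / p(zn). The hypothetical helper below evaluates these
# posteriors for a fitted model by forming each component covariance
# C_i = W_i W_i' + sigma2_i * I explicitly; the EM loop avoids this by working
# with C_i⁻¹ via the matrix-inversion lemma, so this version is only a
# didactic cross-check.
def mixture_ppca_responsibilities(data, pi, mu, W, sigma2):
    """Return R with R[n, c] = p(component c | data point n) (illustrative)."""
    n_datapts, data_dim = data.shape
    n_clusters = len(pi)
    logR = np.zeros((n_datapts, n_clusters))
    for c in range(n_clusters):
        C = W[c] @ W[c].T + sigma2[c] * np.eye(data_dim)
        dev = data - mu[c]
        _, logdetC = np.linalg.slogdet(C)
        mahalanobis = (dev * np.linalg.solve(C, dev.T).T).sum(axis=1)
        logR[:, c] = np.log(pi[c]) - 0.5 * (data_dim * np.log(2 * math.pi) + logdetC + mahalanobis)
    # normalise in log space (log-sum-exp trick)
    logR -= logsumexp(logR, axis=1, keepdims=True)
    return np.exp(logR)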


def generate_data():
    """Generate 500 points spread over a noisy ring (annulus) in 2D."""
    n = 500
    r = np.random.rand(1, n) + 1
    theta = np.random.rand(1, n) * (2 * math.pi)
    x1 = r * np.sin(theta)
    x2 = r * np.cos(theta)
    X = np.vstack((x1, x2))
    return np.transpose(X)


def mixppcademo(data, n_clusters):
    """
    W      : latent-to-observation matrices
    mu     : centres of the clusters
    pi     : proportion of data in each cluster
    sigma2 : noise variance of each cluster
    L      : log-likelihood after each EM iteration
    covars : covariance of the points associated with (i.e. closest to) the
             corresponding centres
    """
    plt.plot(data[:, 0], data[:, 1], "o", c="blue", mfc="none")
    pi, mu, W, sigma2, covars, clusters = mixture_ppca_parameter_initialization(
        data, n_clusters, latent_dim=1, n_iterations=10
    )
    pi, mu, W, sigma2, L = mixture_ppca_expectation_maximization(data, pi, mu, W, sigma2, 10)
    for i in range(n_clusters):
        v = W[i, :, :]
        # plot the principal-component vector, extended to 2 standard deviations
        start = mu[i].reshape((2, 1)) - (v * 2 * np.sqrt(sigma2[i]))
        endpt = mu[i].reshape((2, 1)) + (v * 2 * np.sqrt(sigma2[i]))
        linex = [start[0], endpt[0]]
        liney = [start[1], endpt[1]]
        plt.plot(linex, liney, linewidth=3, c="black")
        # plot the confidence-interval ellipse at 2 standard deviations
        theta = np.arange(0, 2 * math.pi, 0.02)
        x = 2 * np.sqrt(sigma2[i]) * np.cos(theta)
        y = np.sqrt(covars[i]) * np.sin(theta)
        rot_matrix = np.vstack((np.hstack((v[0], -v[1])), np.hstack((v[1], v[0]))))
        ellipse = np.dot(rot_matrix, np.vstack((x, y)))
        ellipse = np.transpose(ellipse)
        ellipse = ellipse + np.dot(np.ones((len(theta), 1)), mu[i, :].reshape((1, 2)))
        plt.plot(ellipse[:, 0], ellipse[:, 1], c="crimson")


def main():
    np.random.seed(61)
    data = generate_data()
    plt.figure(0)
    mixppcademo(data, n_clusters=1)
    plt.savefig("mixppca_k-1.png", dpi=300)

    np.random.seed(7)
    data = generate_data()
    plt.figure(1)
    mixppcademo(data, n_clusters=10)
    plt.savefig("mixppca_k-10.png", dpi=300)
    plt.show()


if __name__ == "__main__":
    main()
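
A minimal usage sketch (an addition, not part of the original script): it fits a small mixture of PPCA to the synthetic ring data using the functions defined above and prints the EM log-likelihood trace, which should be (approximately) non-decreasing. The cluster count and iteration numbers here are arbitrary choices made for illustration.
In [ ]:
# Illustrative usage (assumes the cell above has been run in the same session).
np.random.seed(0)
data = generate_data()
pi0, mu0, W0, sigma2_0, covars0, clusters0 = mixture_ppca_parameter_initialization(
    data, n_clusters=3, latent_dim=1, n_iterations=10
)
pi_hat, mu_hat, W_hat, sigma2_hat, loglik = mixture_ppca_expectation_maximization(
    data, pi0, mu0, W0, sigma2_0, niter=20
)
print()  # newline after the progress dots printed by the EM loop
print("log-likelihood per iteration:", np.round(loglik, 2))
print("smallest increase between iterations:", np.min(np.diff(loglik)))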