Understanding the Effect of Outliers on Different PDFs

In [1]:

import jax.numpy as jnp
from jax import random
import matplotlib.pyplot as plt
from scipy.stats import t, laplace, norm
import seaborn as sns
import numpy as np

try:
    from probml_utils import savefig, latexify
except ModuleNotFoundError:
    %pip install -qq git+https://github.com/probml/probml-utils.git
    from probml_utils import savefig, latexify

In [2]:

# Request probml_utils' LaTeX figure styling (half-width figure, height 2.0).
# As the warning in this cell's output shows, the call does not latexify
# unless the LATEXIFY environment variable is set.
latexify(width_scale_factor=2, fig_height=2.0)

Out[2]:

C:\Users\NITISH SHARMA\.pyenv\pyenv-win\versions\3.9.10\lib\site-packages\probml_utils\plotting.py:26: UserWarning: LATEXIFY environment variable not set, not latexifying
  warnings.warn("LATEXIFY environment variable not set, not latexifying")

In [3]:

def plot_outlier_effect(
    save_name,
    outlier_pos=0,
    outliers=[],
    bins=7,
    samples_norm_dist=30,
    samples_graph_xaxis=500,
    range_xaxis=[-5, 10],
    range_yaxis=[0, 0.60],
    fig=None,
    ax=None,
):
    """
    Fit a Gaussian, a Laplace and a Student-t distribution to samples drawn
    from a standard normal (optionally contaminated with outliers) and plot
    the three fitted PDFs on top of a histogram of the data.

    The normal samples are drawn with a fixed PRNG key (42), so every call
    uses the same base samples. Histogram bar heights are fractions of the
    total number of points (normal samples plus outliers), not densities.

    Args:
    ----------
    save_name : string
        Name passed to ``savefig``. An empty string skips saving.

    outlier_pos : int, default=0
        Offset added to every value in ``outliers``

    outliers : sequence of numbers, default=[]
        Outlier values appended to the normal samples before fitting.
        May be a list, tuple or 1-D array. Never mutated.

    bins : int, default=7
        Number of bins for the histogram of the normal samples

    samples_norm_dist : int, default=30
        Number of samples to be taken from the normal distribution

    samples_graph_xaxis : int, default=500
        Number of x-axis points at which the PDFs are evaluated

    range_xaxis : list, default=[-5, 10]
        The [min, max] range of values for the x-axis. Never mutated.

    range_yaxis : list, default=[0, 0.6]
        The [min, max] range of values for the y-axis. Never mutated.

    fig : matplotlib figure object or None, default=None
        Figure to draw on. If None, the figure owning ``ax`` is used, or a
        new figure is created when ``ax`` is also None.

    ax : matplotlib axis object or None, default=None
        Axes to draw on. If None, the current axes of ``fig`` is used, or a
        new axes is created when ``fig`` is also None.

    Returns:
    ----------
    fig : matplotlib figure object
        The figure that was drawn on

    ax : matplotlib axis object
        The axes that was drawn on
    """

    # len() rather than truthiness so that NumPy arrays are accepted too.
    n_outliers = len(outliers)

    # Generate samples from the normal distribution (fixed key => reproducible)
    norm_dist_sample = random.normal(random.PRNGKey(42), shape=(samples_norm_dist,))
    n_normal = norm_dist_sample.shape[0]

    # Generate values for x axis i.e. the values your random variable can take
    x_axis = np.asarray(jnp.linspace(range_xaxis[0], range_xaxis[1], samples_graph_xaxis))

    # Honour a caller-supplied figure/axes; only create new ones when needed.
    if ax is None:
        if fig is None:
            fig, ax = plt.subplots()
        else:
            ax = fig.gca()
    elif fig is None:
        fig = ax.figure

    # Every point (normal sample or outlier) contributes the same mass so the
    # bar heights over both histograms sum to one.
    point_weight = 1 / (n_normal + n_outliers)
    hist_style = dict(color="steelblue", ec="steelblue", rwidth=0.8)

    # Plot the data from normal distribution
    _, _, data_bars = ax.hist(
        np.array(norm_dist_sample),
        bins,
        weights=[point_weight] * n_normal,
        **hist_style,
    )

    if n_outliers > 0:
        shifted_outliers = np.asarray(outliers) + outlier_pos

        # Plot outlier data (one bin per outlier)
        ax.hist(
            shifted_outliers,
            n_outliers,
            weights=[point_weight] * n_outliers,
            **hist_style,
        )
        samples = jnp.hstack((norm_dist_sample, jnp.asarray(shifted_outliers)))
    else:
        samples = norm_dist_sample

    # scipy works on NumPy arrays; convert once instead of on every fit call.
    samples = np.asarray(samples)

    # Fit each distribution by maximum likelihood and evaluate its PDF
    norm_loc, norm_scale = norm.fit(samples)
    norm_pdf = norm.pdf(x_axis, loc=norm_loc, scale=norm_scale)

    laplace_loc, laplace_scale = laplace.fit(samples)
    laplace_pdf = laplace.pdf(x_axis, loc=laplace_loc, scale=laplace_scale)

    dof, t_loc, t_scale = t.fit(samples)
    studentT_pdf = t.pdf(x_axis, dof, loc=t_loc, scale=t_scale)

    # Update tick intervals for x-axis
    ax.set_xticks(np.arange(range_xaxis[0], range_xaxis[1] + 1, 5))

    # Update the tick intervals and limit for y-axis
    ax.set_ylim(range_yaxis)
    ax.set_yticks(np.linspace(range_yaxis[0], range_yaxis[1], 5))

    # Plot the different PDF's obtained
    (gaussian_line,) = ax.plot(x_axis, norm_pdf, "k-", linewidth=2.0)
    (studentT_line,) = ax.plot(x_axis, studentT_pdf, "r-.", linewidth=2.0)
    (laplace_line,) = ax.plot(x_axis, laplace_pdf, "b:", linewidth=2.0)

    # Pair handles with labels explicitly so the legend stays correct even
    # when the supplied axes already contains other artists.
    ax.legend(
        (gaussian_line, studentT_line, laplace_line, data_bars[0]),
        ("gaussian", "student T", "laplace", "data"),
    )
    ax.set_xlabel("$x$")
    ax.set_ylabel("$p(x)$")

    # Restrict despine to the axes we drew on rather than the current figure.
    sns.despine(ax=ax)

    # Save figure to files
    # NOTE(review): savefig comes from probml_utils; presumably it saves the
    # current pyplot figure, which may differ from a caller-supplied `fig` - confirm.
    if save_name:
        savefig(save_name)

    return fig, ax

In [4]:

# Baseline figure: fit the three distributions to the normal samples only
# (no outliers passed).
plot_outlier_effect(save_name="robust_pdf_plot")

Out[4]:

WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
C:\Users\NITISH SHARMA\.pyenv\pyenv-win\versions\3.9.10\lib\site-packages\probml_utils\plotting.py:79: UserWarning: set FIG_DIR environment variable to save figures
  warnings.warn("set FIG_DIR environment variable to save figures")

(<Figure size 432x288 with 1 Axes>,
 <AxesSubplot:xlabel='$x$', ylabel='$p(x)$'>)

In [5]:

# Same normal samples plus three outliers (8, 8.75, 9.5) appended before
# fitting, to compare how each fitted PDF reacts.
plot_outlier_effect(save_name="robust_pdf_plot_outliers", outliers=[8, 8.75, 9.5])

Out[5]:

C:\Users\NITISH SHARMA\.pyenv\pyenv-win\versions\3.9.10\lib\site-packages\probml_utils\plotting.py:79: UserWarning: set FIG_DIR environment variable to save figures
  warnings.warn("set FIG_DIR environment variable to save figures")

(<Figure size 432x288 with 1 Axes>,
 <AxesSubplot:xlabel='$x$', ylabel='$p(x)$'>)

In [6]:

from ipywidgets import interact


@interact(outlier_pos=(-5, 5))
def interactive_plot(outlier_pos):
    """Redraw the outlier figure with the three outliers shifted by `outlier_pos`."""
    demo_outliers = [8, 8.75, 9.5]
    # An empty save_name skips saving; the returned (fig, ax) pair is not
    # needed here, so it is discarded and the callback returns None.
    plot_outlier_effect(
        save_name="",
        outliers=demo_outliers,
        outlier_pos=outlier_pos,
    )

Out[6]:

interactive(children=(IntSlider(value=0, description='outlier_pos', max=5, min=-5), Output()), _dom_classes=('…

In [ ]:

Understanding the Effect of Outliers on Different PDFs

Product

Resources

Company