CoCalc -- active_learning_visualization

GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/book2/34/active_learning_visualization_class.ipynb
¹¹⁹³ views

Kernel: Python 3 (ipykernel)

In [1]:

# sklearn Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# sklearn Utils
from sklearn.base import BaseEstimator
from sklearn.base import clone
from sklearn.model_selection import train_test_split

import numpy as np
from numpy.random import default_rng
import random

np.random.seed(42)

from matplotlib.colors import ListedColormap
from matplotlib.legend_handler import HandlerTuple
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

try:
    from celluloid import Camera
except ModuleNotFoundError:
    %pip install -qq celluloid
    from celluloid import Camera

try:
    import probml_utils as pml
except ModuleNotFoundError:
    %pip install -qq git+https://github.com/probml/probml-utils.git
    import probml_utils as pml

from probml_utils import active_learn_utils as alu

try:
    import modAL
except ModuleNotFoundError:
    %pip install -qq modAL
    import modAL


from modAL.uncertainty import uncertainty_sampling
from modAL.uncertainty import entropy_sampling
from modAL.uncertainty import margin_sampling
from modAL.disagreement import max_std_sampling


from modAL.models import ActiveLearner
from modAL.utils.data import modALinput
from modAL.models import Committee


from typing import Callable
from typing import Tuple
from typing import Optional
import warnings
from typing import List

from IPython.display import HTML
from functools import reduce
import operator

In [2]:

# Default RC parms for grid and line plot
p = plt.rcParams

# Grid Setting
p["grid.color"] = "#999999"
p["grid.linestyle"] = "--"

# Markersize setting for scatter
if pml.is_latexify_enabled():
    p["lines.markersize"] = 3
    p["lines.markeredgewidth"] = 1
    p["lines.linewidth"] = 1.5
    p["grid.linewidth"] = 0.5
else:
    p["lines.markersize"] = 5
    p["lines.markeredgewidth"] = 1.5
    p["lines.linewidth"] = 2

Decision Contour Plots

In [3]:

def plot_countor(
    prob: np.ndarray,
    xx: np.ndarray,
    yy: np.ndarray,
    cmap: str,
    ax: mpl.axes.Axes,
    fig: mpl.figure.Figure,
):
    """
    Plot countors for the given meshgrid and probabilities

    Args:
    ----------
    prob :
        Probabilities to be displayed over the meshgrid

    xx :
        Meshgrid x-coords

    yy :
        Meshgrid y-coords

    cmap :
        Color for the countor

    ax :
        Matplotlib axes

    fig :
        Matplotlib figure

    Returns:
    ----------
    ax :
        Matplotlib axes

    fig :
        Matplotlib figure

    """

    # Mask probabilities low probabilities for a given class
    # since contours overlap, the colors look saturated
    prob = np.ma.masked_where(prob < 0.20, prob)

    # Add a countor lines and countor plot
    ax.contourf(
        xx,
        yy,
        prob,
        cmap=cmap,
        alpha=0.3,
        levels=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    )

    ax.contour(
        xx,
        yy,
        prob,
        cmap=cmap,
        alpha=0.4,
        linewidths=2,
        levels=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    )

    return fig, ax

In [4]:

def final_decision_countor(
    key: str,
    i: int,
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_train_qry: np.ndarray,
    y_train_qry: np.ndarray,
    probs: np.ndarray,
    xx: np.ndarray,
    yy: np.ndarray,
    cmaps: list,
    X_pool: np.ndarray,
    y_pool: np.ndarray,
    save_name: str,
) -> None:

    """
    At any given iteration save the countour plots

    Args:
    ----------
    key :
        Query Stratergy Name

    i:
        The current Iteration

    X_train_qry :
       Train+queried sample features

    y_train_qry :
        Train+queried sample labels

    X_train :
        Train data features

    y_train :
        Train data labels

    probs:
        Probabilities for surface plot

    xx :
        Meshgrid x-coords

    yy :
        Meshgrid y-coords

    cmap :
        Color for the countor

    X_pool :
        Features pool data

    y_pool :
        Labels pool data

    save_name:
        Fig name to save

    Returns:
    ----------
    None
    """

    # Create a figure
    pml.latexify(width_scale_factor=3)
    fig = plt.figure()
    ax = plt.gca()
    ax.grid(False)

    for prob_idx in range(probs.shape[1]):
        plot_countor(probs[:, prob_idx].reshape(xx.shape), xx, yy, cmaps[prob_idx], ax, fig)

    # Scatter plot of Pool data
    pool_pts = ax.scatter(X_pool[:, 0], X_pool[:, 1], c="darkgrey", label="Pool Points", alpha=0.75)

    # Scatter Plots for train data
    class_0_train = ax.scatter(X_train[:, 0][y_train == 0], X_train[:, 1][y_train == 0], c="blue", zorder=2)
    class_1_train = ax.scatter(X_train[:, 0][y_train == 1], X_train[:, 1][y_train == 1], c="purple", zorder=2)
    class_2_train = ax.scatter(X_train[:, 0][y_train == 2], X_train[:, 1][y_train == 2], c="green", zorder=2)

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_xlabel("Feature 1")

    # Get Queried Points excluding the train data
    X_only_qry = X_train_qry[-i:]
    y_only_qry = y_train_qry[-i:]

    # Setup legends and display for when queried points are shown
    if i != 0:
        class_0 = ax.scatter(
            X_only_qry[:, 0][y_only_qry == 0],
            X_only_qry[:, 1][y_only_qry == 0],
            linewidths=0.75,
            s=15,
            edgecolor="black",
            c="blue",
            zorder=3,
        )
        class_1 = ax.scatter(
            X_only_qry[:, 0][y_only_qry == 1],
            X_only_qry[:, 1][y_only_qry == 1],
            linewidths=0.75,
            s=15,
            edgecolor="black",
            c="purple",
            zorder=3,
        )
        class_2 = ax.scatter(
            X_only_qry[:, 0][y_only_qry == 2],
            X_only_qry[:, 1][y_only_qry == 2],
            linewidths=0.75,
            s=15,
            edgecolor="black",
            c="green",
            zorder=3,
        )
        ax.legend(
            [
                (class_0, class_1, class_2),
                (class_0_train, class_1_train, class_2_train),
                (pool_pts),
            ],
            ["Queried Data", "Train Data", "Pool Data"],
            loc="upper left",
            fontsize=5,
            framealpha=0.7,
            handler_map={tuple: HandlerTuple(ndivide=None)},
        )
    # Setup legends for only when train data is shown
    else:
        ax.set_ylabel("Feature 2")
        ax.legend(
            [(class_0_train, class_1_train, class_2_train), (pool_pts)],
            ["Train Data", "Pool Data"],
            loc="upper left",
            fontsize=5,
            framealpha=0.7,
            handler_map={tuple: HandlerTuple(ndivide=None)},
        )

    pml.savefig(save_name)

In [5]:

def final_decision_bound(
    key: str,
    i: int,
    X_all: np.ndarray,
    y_all: np.ndarray,
    X_train: np.ndarray,
    y_train: np.ndarray,
    predictions,
    query_x0,
    query_x1,
    xx: np.ndarray,
    yy: np.ndarray,
    cmap: list,
    save_name: str,
) -> None:
    """
    At any given iteration save the decision boundary plots

    Args:
    ----------
    key :
        Query Stratergy Name

    i:
        The current Iteration

    X_all :
       Train+pool sample features

    y_all :
        Train+pool sample labels

    X_train :
        Train data features

    y_train :
        Train data labels

    predictions:
        Predictions

    xx :
        Meshgrid x-coords

    yy :
        Meshgrid y-coords

    cmap :
        Color for the countor

    save_name:
        Fig name to save

    Returns:
    ----------
    None
    """

    # Setup Figure
    pml.latexify(width_scale_factor=2)
    fig = plt.figure()
    ax = plt.gca()
    ax.grid(False)

    # Create a Countorrf plot
    ax.contourf(xx, yy, predictions, cmap=cmap)

    # Display all data
    class_0 = ax.scatter(X_all[:, 0][y_all == 0], X_all[:, 1][y_all == 0], c="b")
    class_1 = ax.scatter(X_all[:, 0][y_all == 1], X_all[:, 1][y_all == 1], c="y")
    class_2 = ax.scatter(X_all[:, 0][y_all == 2], X_all[:, 1][y_all == 2], c="g")

    # Highlight train data
    train_pts = ax.scatter(
        X_train[:, 0],
        X_train[:, 1],
        c="red",
        zorder=2,
        s=25,
        edgecolor="black",
        label="Train Data",
    )

    # Setup legends and display for when queried points are shown
    if i != 0:
        query_pts = ax.scatter(
            query_x0,
            query_x1,
            c="darkorange",
            marker="o",
            zorder=2,
            s=25,
            edgecolor="black",
            label="Queried Data",
        )
        ax.legend(
            [(class_0, class_1, class_2), train_pts, query_pts],
            ["Pool Data", "Train Data", "Queried Data"],
            loc="lower left",
            fontsize=6,
            framealpha=0.4,
            handler_map={tuple: HandlerTuple(ndivide=None)},
        )
    # Setup legends for only when train data is shown
    else:
        ax.legend(
            [(class_0, class_1, class_2), train_pts],
            ["Pool Data", "Train Data"],
            loc="lower left",
            fontsize=6,
            framealpha=0.4,
            handler_map={tuple: HandlerTuple(ndivide=None)},
        )

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())

    ax.set_xlabel("Feature 1")
    ax.set_ylabel("Feature 2")

    pml.savefig(save_name)

In [6]:

def animate_classification(
    sampling_dict_class: dict,
    plot_list: list,
    grid_row: int = 3,
    grid_col: int = 3,
    n_queries=100,
):
    """
    Make animations for classification

    Args:
    ----------
    sampling_dict_class :
        Dictionary to store data for all different uncertanity techniques

    plot_list:
        Choose the plots you want to see

    grid_row:
        # of rows in plot

    grid_col:
        # of columns in plot

    n_queries:
        # of queries for uncertanity sampling

    Returns:
    ----------
    animation:
        An animation of all the plots

    """

    # Intialize Plots
    if len(plot_list) == 3:
        fig, ax = plt.subplots(grid_row, grid_col, figsize=(20, 15))
    else:
        fig, ax = plt.subplots(grid_row, grid_col, figsize=(16, 10))

    camera = Camera(fig)

    # Generate relevant data
    (
        x_min,
        x_max,
        y_min,
        y_max,
        xx,
        yy,
        X_train,
        X_pool,
        y_train,
        y_pool,
        X_test,
        y_test,
    ) = alu.make_data_class()

    # Create all data, needed for plotting later
    X_all = np.append(X_train, X_pool, axis=0)
    y_all = np.append(y_train, y_pool, axis=0)

    # Iterate over each sampling technique and store its results
    # in correspoding dictionary for later
    for uncert_key, uncert_val in sampling_dict_class.items():
        (
            uncert_val["queries_np_X"],
            uncert_val["queries_np_y"],
            uncert_val["score"],
            uncert_val["X_pool"],
            uncert_val["y_pool"],
        ) = alu.process_uncertainty_result(
            uncert_val["model"],
            uncert_val["query_strat"],
            "Classification",
            uncert_key,
            n_queries=n_queries,
            X_pool=X_pool,
            y_pool=y_pool,
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
        )

    # Loop over each iteration in each sampling stratergy
    for i in range(n_queries + 1):

        # Go over each sampling stratergy
        for col, (key, value) in enumerate(sampling_dict_class.items()):

            # If initial iteration only consider the train points else consider queried points+train points
            if i == 0:
                X_train_qry = X_train
                y_train_qry = y_train
            else:
                X_train_qry = np.append(X_train, value["queries_np_X"][:i, :], axis=0)
                y_train_qry = np.append(y_train, value["queries_np_y"][:i])

            # Fit the model
            model_qry = clone(uncert_val["model"]).fit(X_train_qry, y_train_qry)

            # Predict probabilities on meshgrid to display countors
            probs = model_qry.predict_proba(np.c_[xx.ravel(), yy.ravel()])

            # Make predictions on meshgrid to display decision bounds
            predictions = model_qry.predict(np.c_[xx.ravel(), yy.ravel()])
            predictions = predictions.reshape(xx.shape)

            # For each sampling type plot a column of 3 plots
            for row, graph_type in enumerate(plot_list):

                if graph_type == "decision_bound":

                    ax[row][col].grid(False)

                    # Make a custom cmap
                    custom_cmap = ListedColormap(["#9898ff", "#fafab0", "#a0faa0"])

                    # Plot contour
                    ax[row][col].contourf(xx, yy, predictions, cmap=custom_cmap)

                    # Plot all the data indicating different classes
                    ax[row][col].scatter(
                        X_all[:, 0],
                        X_all[:, 1],
                        c=np.array(["b", "y", "g"])[y_all],
                        s=30,
                    )

                    # Highlight train data
                    ax[row][col].scatter(
                        X_train[:, 0],
                        X_train[:, 1],
                        c=np.array(["b", "y", "g"])[y_train],
                        s=35,
                        edgecolor="black",
                    )

                    # Select all the points queried till now
                    qry_pt = value["queries_np_X"][:i, :]

                    # Only display queried points when query iteration>0
                    if i != 0:
                        ax[row][col].scatter(
                            qry_pt[:, 0],
                            qry_pt[:, 1],
                            c="darkorange",
                            marker="o",
                            s=35,
                            edgecolor="black",
                        )

                    # Graph Options
                    ax[row][col].set_xlim(xx.min(), xx.max())
                    ax[row][col].set_ylim(yy.min(), yy.max())
                    ax[row][col].set_xticks(())
                    ax[row][col].set_yticks(())

                    # Take a snap of graphs at any given iteration
                    if i in [] and pml.is_latexify_enabled() and value["save_dec"] and value["save_fig"]:
                        print(
                            f"Plotting Decision Boundary, the Current Stratergy is {key} and accuracy at iteration:{i} is {value['score'][i]}"
                        )
                        final_decision_bound(
                            key,
                            i,
                            X_all,
                            y_all,
                            X_train,
                            y_train,
                            predictions,
                            qry_pt[:, 0],
                            qry_pt[:, 1],
                            xx,
                            yy,
                            custom_cmap,
                            f"{value['save_dec']}_db_iter_{i}",
                        )

                elif graph_type == "decision_countour":

                    # Decision Boundary Plot using countours

                    # Color maps for countors
                    cmaps = ["Blues", "Purples", "Greens"]

                    # Plot contours for each class
                    for prob_idx in range(probs.shape[1]):
                        plot_countor(
                            probs[:, prob_idx].reshape(xx.shape),
                            xx,
                            yy,
                            cmaps[prob_idx],
                            ax[row][col],
                            fig,
                        )

                    # Plot all pool data
                    ax[row][col].scatter(
                        value["X_pool"][0][:, 0],
                        value["X_pool"][0][:, 1],
                        c="darkgrey",
                        alpha=0.3,
                    )

                    # Plot all queried data
                    ax[row][col].scatter(
                        X_train_qry[:, 0][-i:],
                        X_train_qry[:, 1][-i:],
                        c=np.array(["blue", "purple", "green"])[y_train_qry[-i:]],
                        edgecolor="black",
                        zorder=3,
                    )

                    # Plot train data
                    ax[row][col].scatter(
                        X_train[:, 0],
                        X_train[:, 1],
                        c=np.array(["blue", "purple", "green"])[y_train],
                        zorder=3,
                    )

                    # Plot the next queried point
                    if i != 0:
                        ax[row][col].scatter(
                            X_train_qry[:, 0][-1],
                            X_train_qry[:, 1][-1],
                            c=np.array(["blue", "purple", "green"])[y_train_qry][-1],
                            s=100,
                            edgecolor="black",
                            zorder=4,
                        )

                    # Graph options
                    ax[row][col].set_xlim(xx.min(), xx.max())
                    ax[row][col].set_ylim(yy.min(), yy.max())
                    ax[row][col].set_xticks(())
                    ax[row][col].set_yticks(())
                    ax[row][col].set_title(key, color=value["color_scheme"])

                    # Save at an any iteration
                    if (
                        i in value["save_iter_dc"]
                        and pml.is_latexify_enabled()
                        and value["save_dec"]
                        and value["save_fig"]
                    ):
                        print(
                            f"Plotting Decision Contour, the Current Stratergy is {key} and accuracy at iteration:{i} is {value['score'][i]}"
                        )
                        final_decision_countor(
                            key,
                            i,
                            X_train,
                            y_train,
                            X_train_qry,
                            y_train_qry,
                            probs,
                            xx,
                            yy,
                            cmaps,
                            value["X_pool"][0],
                            value["y_pool"][0],
                            f"{value['save_dec']}_dc_iter_{i}",
                        )

                elif graph_type == "accuracy_line":

                    # Accuracy Plot
                    ax[row][col].grid(True)

                    # Plot accuracy curves
                    if i == 0:
                        ax[row][col].scatter([i], value["score"][i], color=value["color_scheme"])
                    else:
                        ax[row][col].plot(
                            [itr for itr in range(i + 1)],
                            value["score"][: i + 1],
                            color=value["color_scheme"],
                        )
                        ax[row][col].set_xticks([i for i in range(0, i + 1, 2)])

                    ax[row][col].set_xlabel("Number of Iterations")

                    if col == 0:
                        ax[row][col].set_ylabel("Classification Accuracy")

        plt.subplots_adjust(hspace=0.2)

        camera.snap()

    animation = camera.animate(interval=500, blit=True)
    return animation

In [7]:

model_LR = LogisticRegression(max_iter=10000)

sampling_dict_class_animate = {
    "Least Confidence Sampling": {
        "model": clone(model_LR),
        "query_strat": uncertainty_sampling,
        "score": [],
        "marker": "o",
        "type": "uncertanity",
        "queries_np_X": None,
        "queries_np_y": None,
        "color_scheme": "blue",
        "save_dec": "least_confidence_decision",
        "save_iter_dc": [11],
        "save_fig": False,
        "X_pool": None,
        "y_pool": None,
    },
    "Entropy Sampling": {
        "model": clone(model_LR),
        "query_strat": entropy_sampling,
        "score": [],
        "marker": "^",
        "type": "uncertanity",
        "queries_np_X": None,
        "queries_np_y": None,
        "color_scheme": "darkorange",
        "save_dec": "entropy_sampling_decision",
        "save_iter_dc": [11],
        "save_fig": False,
        "X_pool": None,
        "y_pool": None,
    },
    "Margin Sampling": {
        "model": clone(model_LR),
        "query_strat": margin_sampling,
        "score": [],
        "marker": "s",
        "type": "uncertanity",
        "queries_np_X": None,
        "queries_np_y": None,
        "color_scheme": "green",
        "save_dec": "margin_sampling_decision",
        "save_iter_dc": [11],
        "save_fig": True,
        "X_pool": None,
        "y_pool": None,
    },
    "Random Sampling": {
        "model": clone(model_LR),
        "query_strat": alu.random_sampling,
        "score": [],
        "mean_score": [],
        "std_dev": [],
        "marker": "H",
        "type": "random",
        "n_iter": 50,
        "queries_np_X": None,
        "queries_np_y": None,
        "color_scheme": "red",
        "save_dec": "random_sampling_decision",
        "save_iter_dc": [0, 11],
        "save_fig": True,
        "X_pool": None,
        "y_pool": None,
    },
}


# All Plots
# plot_list = ["decision_countour", "decision_bound", "accuracy_line"]

plot_list = ["decision_countour", "accuracy_line"]

animate = animate_classification(sampling_dict_class_animate, plot_list, 2, 4, n_queries=12)
HTML(animate.to_jshtml())

Out[7]:

Decision Contour Plots

Product

Resources

Company