Path: blob/master/notebooks/book2/34/active_learning_compare_class.ipynb
Kernel: Python 3 (ipykernel)
In [1]:
# sklearn Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# sklearn Utils
from sklearn.base import BaseEstimator
from sklearn.base import clone
from sklearn.model_selection import train_test_split

import numpy as np
from numpy.random import default_rng
import random

np.random.seed(42)

from matplotlib.legend_handler import HandlerTuple
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

try:
    import probml_utils as pml
except ModuleNotFoundError:
    %pip install -qq git+https://github.com/probml/probml-utils.git
    import probml_utils as pml

from probml_utils import active_learn_utils as alu

try:
    import modAL
except ModuleNotFoundError:
    %pip install -qq modAL
    import modAL

from modAL.uncertainty import uncertainty_sampling
from modAL.uncertainty import entropy_sampling
from modAL.uncertainty import margin_sampling
from modAL.disagreement import max_std_sampling
from modAL.models import ActiveLearner
from modAL.utils.data import modALinput
from modAL.models import Committee

from typing import Callable
from typing import Tuple
from typing import Optional
from typing import List
from functools import reduce
import operator
import warnings
In [2]:
# Default rc params for grid and line plots
p = plt.rcParams

# Grid settings
p["grid.color"] = "#999999"
p["grid.linestyle"] = "--"

# Marker and line size settings
if pml.is_latexify_enabled():
    p["lines.markersize"] = 3
    p["lines.markeredgewidth"] = 1
    p["lines.linewidth"] = 1.5
    p["grid.linewidth"] = 0.5
else:
    p["lines.markersize"] = 5
    p["lines.markeredgewidth"] = 1.5
    p["lines.linewidth"] = 2
In [3]:
def plot_data(
    sampling_dict: dict,
    n_queries: int,
    problem_type: str,
    latexify_dict: dict = {"fig_name": None, "legend_pos": "best"},
    grid_flag: bool = True,
) -> None:
    """
    Make accuracy/RMSE vs number-of-queries plots

    Args:
    ----------
    sampling_dict : Dictionary with a log of all the points queried along with other info
    n_queries : The number of iterations that were run for each sampling technique
    problem_type : Classification or Regression problem
    latexify_dict : Options for latexified figures
    grid_flag : Toggle grid lines on the plots

    Returns:
    ----------
    None
    """
    # Initialize figure
    fig = plt.figure()
    ax = plt.gca()

    # Toggle grid
    ax.grid(grid_flag)

    # Number of iterations for the x-axis
    x_axis = np.array([i for i in range(n_queries + 1)])

    for key, value in sampling_dict.items():
        # If not random sampling, just plot the score
        if value["type"] != "random":
            plt.plot(x_axis, value["score"], label=key + " sampling", marker=value["marker"])
        # For the random query strategy, plot individual random curves along with the mean accuracy or RMSE
        elif value["type"] == "random":
            # Choose n random curves to display
            all_random_iter = len(value["score"]) - 1
            random_choice = np.random.choice(all_random_iter, size=value["disp_randoms"], replace=False)

            # Plot random curves
            for i in random_choice:
                scores = value["score"][i]
                plt.plot(
                    x_axis,
                    scores,
                    label=key + " samples",
                    marker=value["marker"],
                    color="blueviolet",
                    alpha=0.25,
                )

            # Plot mean curve
            plt.plot(
                x_axis,
                value["mean_score"],
                label=f"Mean of {key} sampling",
                marker=value["marker"],
                color="blueviolet",
            )

    sns.despine()
    plt.xticks([i for i in range(x_axis[0], x_axis[-1] + 1, 5)])
    plt.xlabel("Number of Points queried")

    # Remove duplicate legend entries created by the different random curves
    handles, labels = ax.get_legend_handles_labels()
    newLabels, newHandles = [], []
    for handle, label in zip(handles, labels):
        if label not in newLabels:
            newLabels.append(label)
            newHandles.append(handle)

    # Different y-axis label based on problem type
    if problem_type == "Classification":
        plt.ylabel("Accuracy")
    else:
        plt.ylabel("RMSE")

    # Custom plotting options
    if pml.is_latexify_enabled() and latexify_dict["fig_name"]:
        pml.latexify(**latexify_dict["latexify_parms"])
        plt.legend(handles=newHandles, labels=newLabels, **latexify_dict["legend_params"])
        pml.savefig(latexify_dict["fig_name"])
    else:
        plt.legend(
            handles=newHandles,
            labels=newLabels,
            loc="upper right",
            bbox_to_anchor=(1.55, 1),
        )
    plt.show()
In [4]:
def compare_sampling(
    sampling_dict_data: dict,
    problem_type: str,
    n_queries: int,
    X_pool: np.ndarray,
    y_pool: np.ndarray,
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    latexify_dict={"fig_name": None, "legend_pos": "best"},
) -> None:
    """
    Driver function to run the uncertainty, QBC, or random sampling approach

    Args:
    ----------
    sampling_dict_data : Dictionary with a log of all the points queried along with other info
    problem_type : Classification or Regression problem
    n_queries : The number of iterations to run for each sampling technique
    latexify_dict : Options for latexified figures
    X_pool : Features pool data
    y_pool : Labels pool data
    X_train : Features train data
    y_train : Labels train data
    X_test : Features test data
    y_test : Labels test data

    Returns:
    ----------
    None
    """
    # Loop over each sampling type
    for key, value in sampling_dict_data.items():
        # For uncertainty sampling
        if value["type"] == "uncertainty":
            uncertainty_info_dict = alu.uncertainty_sampling(
                value["query_strat"],
                value["model"],
                key,
                n_queries,
                problem_type=problem_type,
                X_pool=X_pool,
                y_pool=y_pool,
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
            )
            value["score"] = uncertainty_info_dict["score"]
        # For query by committee (QBC)
        elif value["type"] == "qbc":
            qbc_info_dict = alu.qbc(
                value["query_strat"],
                value["model"],
                problem_type,
                n_queries,
                X_pool=X_pool,
                y_pool=y_pool,
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
            )
            value["score"] = qbc_info_dict["score"]
        # For random sampling, repeat several independent runs
        else:
            for i in range(value["n_iter"]):
                uncertainty_info_dict = alu.uncertainty_sampling(
                    value["query_strat"],
                    value["model"],
                    key,
                    n_queries,
                    problem_type=problem_type,
                    X_pool=X_pool,
                    y_pool=y_pool,
                    X_train=X_train,
                    y_train=y_train,
                    X_test=X_test,
                    y_test=y_test,
                )
                value["score"].append(uncertainty_info_dict["score"])

            # Mean and standard deviation of the scores over the random-sampling runs
            err_arr = np.array(sampling_dict_data[key]["score"])
            mean_err_arr = np.mean(err_arr, axis=0)
            stddev_err_arr = np.std(err_arr, axis=0)
            sampling_dict_data[key]["mean_score"] = mean_err_arr
            sampling_dict_data[key]["std_dev"] = stddev_err_arr

    # Plot the data
    plot_data(sampling_dict_data, n_queries, problem_type, latexify_dict)
Comparison of uncertainty sampling, query by committee (QBC), and random sampling
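The comparison below uses the wrappers in probml_utils.active_learn_utils (alu.uncertainty_sampling, alu.qbc, alu.random_sampling). As a rough illustration of the idea behind margin-based uncertainty sampling, and not necessarily how those wrappers are implemented, a minimal loop with modAL could look like the sketch below (the function name margin_sampling_demo is just for this sketch): at each step the learner queries the pool point whose top two class probabilities are closest, is retrained on its label, and is re-scored on the test set.

# Minimal sketch of a margin-sampling loop with modAL (for illustration only;
# the notebook uses the probml_utils.active_learn_utils wrappers instead).
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from modAL.models import ActiveLearner
from modAL.uncertainty import margin_sampling


def margin_sampling_demo(X_train, y_train, X_pool, y_pool, X_test, y_test, n_queries=10):
    learner = ActiveLearner(
        estimator=RandomForestClassifier(random_state=42),
        query_strategy=margin_sampling,  # smallest gap between the top-two class probabilities
        X_training=X_train,
        y_training=y_train,
    )
    scores = [learner.score(X_test, y_test)]
    for _ in range(n_queries):
        query_idx, _ = learner.query(X_pool)  # pick the most ambiguous pool point
        learner.teach(X_pool[query_idx], y_pool[query_idx])  # retrain on its label
        X_pool = np.delete(X_pool, query_idx, axis=0)  # remove it from the pool
        y_pool = np.delete(y_pool, query_idx, axis=0)
        scores.append(learner.score(X_test, y_test))
    return scores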
In [5]:
# Get data for classification
(
    _,
    _,
    _,
    _,
    _,
    _,
    X_train,
    X_pool,
    y_train,
    y_pool,
    X_test,
    y_test,
) = alu.make_data_class()
In [6]:
# Display data plot
pml.latexify(width_scale_factor=2)
fig = plt.figure()
ax = plt.gca()

class_0_pool = ax.scatter(
    X_pool[:, 0][y_pool == 0],
    X_pool[:, 1][y_pool == 0],
    c="blue",
    label="Class 0",
    alpha=0.15,
)
class_1_pool = ax.scatter(
    X_pool[:, 0][y_pool == 1],
    X_pool[:, 1][y_pool == 1],
    c="purple",
    label="Class 1",
    alpha=0.15,
)
class_2_pool = ax.scatter(
    X_pool[:, 0][y_pool == 2],
    X_pool[:, 1][y_pool == 2],
    c="green",
    label="Class 2",
    alpha=0.15,
)

# Scatter plots for train data
class_0_train = ax.scatter(X_train[:, 0][y_train == 0], X_train[:, 1][y_train == 0], c="blue", zorder=2)
class_1_train = ax.scatter(X_train[:, 0][y_train == 1], X_train[:, 1][y_train == 1], c="purple", zorder=2)
class_2_train = ax.scatter(X_train[:, 0][y_train == 2], X_train[:, 1][y_train == 2], c="green", zorder=2)

ax.set_xlabel("$X_0$")
ax.set_ylabel("$X_1$")

if pml.is_latexify_enabled():
    legend_fontsize = 5
    ax.legend(
        [
            (class_0_train, class_1_train, class_2_train),
            (class_0_pool, class_1_pool, class_2_pool),
        ],
        ["Train Data", "Pool Data"],
        loc="lower left",
        fontsize=legend_fontsize,
        handler_map={tuple: HandlerTuple(ndivide=None)},
    )
    pml.savefig("1d_classification_dataset")
else:
    ax.legend(
        [
            (class_0_train, class_1_train, class_2_train),
            (class_0_pool, class_1_pool, class_2_pool),
        ],
        ["Train Data", "Pool Data"],
        loc="lower left",
        handler_map={tuple: HandlerTuple(ndivide=None)},
    )
Out[6]:
/home/nitish1295/.local/lib/python3.8/site-packages/probml_utils/plotting.py:25: UserWarning: LATEXIFY environment variable not set, not latexifying
warnings.warn("LATEXIFY environment variable not set, not latexifying")
In [7]:
committee_class_list = [RandomForestClassifier(random_state=42), LogisticRegression(random_state=42, max_iter=1000)]
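committee_class_list holds the two committee members that alu.qbc is given below. One way such a committee can be assembled with modAL directly (a sketch assuming the standard ActiveLearner/Committee API, not necessarily what alu.qbc does internally) is to wrap each estimator in an ActiveLearner and let the committee query the pool point on which its members disagree most:

# Sketch of a query-by-committee setup with modAL (illustrative; alu.qbc may differ).
from modAL.models import ActiveLearner, Committee
from modAL.disagreement import vote_entropy_sampling


def build_committee(estimators, X_train, y_train):
    learners = [
        ActiveLearner(estimator=clone(est), X_training=X_train, y_training=y_train)
        for est in estimators
    ]
    # vote_entropy_sampling queries the pool point with the highest vote entropy,
    # i.e. the point the committee members disagree on most
    return Committee(learner_list=learners, query_strategy=vote_entropy_sampling)


# Example usage:
# committee = build_committee(committee_class_list, X_train, y_train)
# query_idx, _ = committee.query(X_pool)
# committee.teach(X_pool[query_idx], y_pool[query_idx])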
In [8]:
sampling_dict_class = {
    "Uncertainty": {
        "model": clone(RandomForestClassifier(random_state=42)),
        "query_strat": margin_sampling,
        "score": [],
        "marker": None,
        "type": "uncertainty",
    },
    "QBC": {
        "model": committee_class_list,
        "query_strat": margin_sampling,
        "score": [],
        "marker": None,
        "type": "qbc",
    },
    "Random": {
        "model": clone(RandomForestClassifier(random_state=42)),
        "query_strat": alu.random_sampling,
        "score": [],
        "mean_score": [],
        "std_dev": [],
        "marker": None,
        "type": "random",
        "n_iter": 50,
        "disp_randoms": 4,
        "mode": "all_random",
    },
}

# Specify latexify options for the accuracy plot
latexify_dict = {
    "fig_name": "uncertanity_sampling_classification",
    "legend_params": {"loc": "lower right", "fontsize": 5, "framealpha": 0.5},
    "latexify_parms": {"width_scale_factor": 2},
}

compare_sampling(
    sampling_dict_class,
    "Classification",
    n_queries=40,
    X_pool=X_pool,
    y_pool=y_pool,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    latexify_dict=latexify_dict,
)
Out[8]: