GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/book2/34/active_learning_compare_reg.ipynb
Kernel: Python 3 (ipykernel)
# sklearn Models
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import linear_model
from sklearn.gaussian_process.kernels import (
    WhiteKernel,
    Matern,
    ConstantKernel,
)

# sklearn Utils
from sklearn.base import BaseEstimator
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import numpy as np
from numpy.random import default_rng
import random

np.random.seed(42)

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

try:
    import probml_utils as pml
except ModuleNotFoundError:
    %pip install -qq git+https://github.com/probml/probml-utils.git
    import probml_utils as pml

from probml_utils import active_learn_utils as alu

try:
    import modAL
except ModuleNotFoundError:
    %pip install -qq modAL
    import modAL

from modAL.disagreement import max_std_sampling
from modAL.models import ActiveLearner
from modAL.utils.data import modALinput
from modAL.models import CommitteeRegressor

from typing import Callable
from typing import Tuple
from typing import Optional
import warnings
from typing import List
from functools import reduce
import operator
# Default RC params for grid and line plots
p = plt.rcParams

# Grid settings
p["grid.color"] = "#999999"
p["grid.linestyle"] = "--"

# Marker and line size settings
if pml.is_latexify_enabled():
    p["lines.markersize"] = 3
    p["lines.markeredgewidth"] = 1
    p["lines.linewidth"] = 1.5
    p["grid.linewidth"] = 0.5
else:
    p["lines.markersize"] = 5
    p["lines.markeredgewidth"] = 1.5
    p["lines.linewidth"] = 2
def plot_data(
    sampling_dict: dict,
    n_queries: int,
    problem_type: str,
    latexify_dict: dict = {"fig_name": None, "legend_pos": "best"},
    grid_flag: bool = True,
) -> None:
    """
    Make accuracy/RMSE vs. number-of-queries plots.

    Args:
    ----------
    sampling_dict : Dictionary with a log of all the points queried along with other info
    n_queries : The number of query iterations that were run for each sampling technique
    problem_type : Classification or Regression problem
    latexify_dict : Options for latexified figures
    grid_flag : Toggle grids on the plots

    Returns:
    ----------
    None
    """
    # Initialize figure
    fig = plt.figure()
    ax = plt.gca()

    # Toggle grid
    ax.grid(grid_flag)

    # Number of iterations for the x-axis labels
    x_axis = np.array([i for i in range(n_queries + 1)])

    for key, value in sampling_dict.items():
        # If not random sampling, just plot the score
        if value["type"] != "random":
            plt.plot(x_axis, value["score"], label=key + " sampling", marker=value["marker"])
        # For random sampling, plot individual random runs along with the mean accuracy or RMSE
        elif value["type"] == "random":
            # Choose n random score curves to display
            all_random_iter = len(value["score"]) - 1
            random_choice = np.random.choice(all_random_iter, size=value["disp_randoms"], replace=False)

            # Plot the individual random curves
            for i in random_choice:
                scores = value["score"][i]
                plt.plot(
                    x_axis,
                    scores,
                    label=key + " samples",
                    marker=value["marker"],
                    color="blueviolet",
                    alpha=0.25,
                )

            # Plot the mean score curve
            plt.plot(
                x_axis,
                value["mean_score"],
                label=f"Mean of {key} sampling",
                marker=value["marker"],
                color="blueviolet",
            )

    sns.despine()
    plt.xticks([i for i in range(x_axis[0], x_axis[-1] + 1, 5)])
    plt.xlabel("Number of Points queried")

    # Remove duplicate legend entries created by the multiple random curves
    handles, labels = ax.get_legend_handles_labels()
    newLabels, newHandles = [], []
    for handle, label in zip(handles, labels):
        if label not in newLabels:
            newLabels.append(label)
            newHandles.append(handle)

    # Different y-axis labels based on problem type
    if problem_type == "Classification":
        plt.ylabel("Accuracy")
    else:
        plt.ylabel("RMSE")

    # Custom plotting options
    if pml.is_latexify_enabled() and latexify_dict["fig_name"]:
        pml.latexify(**latexify_dict["latexify_parms"])
        plt.legend(handles=newHandles, labels=newLabels, **latexify_dict["legend_params"])
        pml.savefig(latexify_dict["fig_name"])
    else:
        plt.legend(
            handles=newHandles,
            labels=newLabels,
            loc="upper right",
            bbox_to_anchor=(1.55, 1),
        )
    plt.show()
def compare_sampling(
    sampling_dict_data: dict,
    problem_type: str,
    n_queries: int,
    X_pool: np.ndarray,
    y_pool: np.ndarray,
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    latexify_dict={"fig_name": None, "legend_pos": "best"},
) -> None:
    """
    Driver function to run the uncertainty, QBC, or random sampling approaches.

    Args:
    ----------
    sampling_dict_data : Dictionary with a log of all the points queried along with other info
    problem_type : Classification or Regression problem
    n_queries : The number of query iterations to run for each sampling technique
    latexify_dict : Options for latexified figures
    X_pool : Features of the pool data
    y_pool : Labels of the pool data
    X_train : Features of the train data
    y_train : Labels of the train data
    X_test : Features of the test data
    y_test : Labels of the test data

    Returns:
    ----------
    None
    """
    # Loop over each sampling type
    for key, value in sampling_dict_data.items():
        # Uncertainty sampling
        if value["type"] == "uncertainty":
            uncertainty_info_dict = alu.uncertainty_sampling(
                value["query_strat"],
                value["model"],
                key,
                n_queries,
                problem_type=problem_type,
                X_pool=X_pool,
                y_pool=y_pool,
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
            )
            value["score"] = uncertainty_info_dict["score"]
        # Query by committee (QBC)
        elif value["type"] == "qbc":
            qbc_info_dict = alu.qbc(
                value["query_strat"],
                value["model"],
                problem_type,
                n_queries,
                X_pool=X_pool,
                y_pool=y_pool,
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
            )
            value["score"] = qbc_info_dict["score"]
        # Random sampling
        else:
            for i in range(value["n_iter"]):
                uncertainty_info_dict = alu.uncertainty_sampling(
                    value["query_strat"],
                    value["model"],
                    key,
                    n_queries,
                    problem_type=problem_type,
                    X_pool=X_pool,
                    y_pool=y_pool,
                    X_train=X_train,
                    y_train=y_train,
                    X_test=X_test,
                    y_test=y_test,
                )
                value["score"].append(uncertainty_info_dict["score"])

            # Mean and std dev of the errors over the random-sampling runs
            err_arr = np.array(sampling_dict_data[key]["score"])
            mean_err_arr = np.mean(err_arr, axis=0)
            stddev_err_arr = np.std(err_arr, axis=0)
            sampling_dict_data[key]["mean_score"] = mean_err_arr
            sampling_dict_data[key]["std_dev"] = stddev_err_arr

    # Plot the data
    plot_data(sampling_dict_data, n_queries, problem_type, latexify_dict)

Regression Error Plots with different Uncertainty Sampling techniques

# Create custom dataset for Regression
X_test, X_train, X_pool, y_test, y_train, y_pool = alu.make_data_reg()
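The internals of alu.make_data_reg are not shown in this notebook. As a rough, hypothetical sketch of the kind of split it returns (a small labelled train set, a large unlabelled pool to query from, and a held-out test set), a comparable 1-D dataset could be built as below; the actual target function, noise level, and split sizes used by the utility may differ. The sketch reuses default_rng and train_test_split from the imports above, and make_data_reg_sketch is an illustrative name, not part of probml_utils.

# Hypothetical stand-in for alu.make_data_reg (the real utility may differ):
# a noisy 1-D nonlinear function split into a small labelled train set,
# a large unlabelled query pool, and a held-out test set.
def make_data_reg_sketch(n_points=200, n_train=5, n_test=50, seed=0):
    rng = default_rng(seed)
    X = rng.uniform(-3, 3, size=(n_points, 1))
    y = np.sin(2 * X).ravel() + 0.1 * rng.standard_normal(n_points)

    # Hold out a test set, then carve a small labelled train set out of the rest;
    # everything remaining becomes the unlabelled pool for active learning.
    X_rest, X_te, y_rest, y_te = train_test_split(X, y, test_size=n_test, random_state=seed)
    X_tr, X_po = X_rest[:n_train], X_rest[n_train:]
    y_tr, y_po = y_rest[:n_train], y_rest[n_train:]
    return X_te, X_tr, X_po, y_te, y_tr, y_po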
pml.latexify(width_scale_factor=2)

fig = plt.figure()
ax = plt.gca()

ax.scatter(X_pool, y_pool, c="blue", label="Pool Data", alpha=0.2)
ax.scatter(X_train, y_train, c="tab:red", label="Train Data")
ax.set_xlabel("$X$")
ax.set_ylabel("$y$")

if pml.is_latexify_enabled():
    legend_fontsize = 5
    ax.legend(loc="upper left", fontsize=6)
    pml.savefig("1d_regression_dataset")
else:
    ax.legend(loc="upper left")
/home/nitish1295/.local/lib/python3.8/site-packages/probml_utils/plotting.py:25: UserWarning: LATEXIFY environment variable not set, not latexifying warnings.warn("LATEXIFY environment variable not set, not latexifying")
[Figure: 1-D regression dataset, showing the unlabelled pool data and the small initial train set]
# Choose a model
kernel_0 = ConstantKernel(0.1) * (Matern(length_scale=0.1, nu=2.5)) + WhiteKernel(noise_level_bounds=(1e-10, 1e1))
kernel_1 = ConstantKernel(0.1) * (Matern(length_scale=0.1, nu=1.5)) + WhiteKernel(noise_level_bounds=(1e-10, 1e1))

model_0 = make_pipeline(
    StandardScaler(),
    GaussianProcessRegressor(kernel=kernel_0, n_restarts_optimizer=10, random_state=0),
)
model_1 = make_pipeline(
    StandardScaler(),
    GaussianProcessRegressor(kernel=kernel_1, n_restarts_optimizer=10, random_state=0),
)

committee_reg_list = [model_0, model_1]
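The query loops themselves live in probml_utils.active_learn_utils (alu.uncertainty_sampling and alu.qbc). As a minimal sketch of what a single query step looks like in modAL, assuming the models and data defined in the cells above and the (query_idx, query_instance) return convention of recent modAL releases, the cell below shows uncertainty sampling with one GP and query-by-committee with the two-member committee. It is an illustration, not necessarily the exact implementation inside the utilities.

# Illustrative sketch only (not necessarily what alu.uncertainty_sampling / alu.qbc
# do internally): one query step with modAL, reusing the models and data from above.

# Uncertainty sampling: a single GP queries the pool point with the largest
# predictive standard deviation.
learner = ActiveLearner(
    estimator=clone(model_0),
    query_strategy=max_std_sampling,
    X_training=X_train,
    y_training=y_train,
)
query_idx, query_instance = learner.query(X_pool)
learner.teach(X_pool[query_idx], y_pool[query_idx])  # label the queried point and refit

# Query by committee: two GPs with different Matern kernels; the committee queries
# the point where its members' predictions disagree most (largest std across members).
committee = CommitteeRegressor(
    learner_list=[
        ActiveLearner(estimator=clone(m), X_training=X_train, y_training=y_train)
        for m in committee_reg_list
    ],
    query_strategy=max_std_sampling,
)
query_idx, query_instance = committee.query(X_pool)
committee.teach(X_pool[query_idx], y_pool[query_idx])

Repeating such a step n_queries times, moving each queried point from the pool to the train set and recording the test RMSE after every refit, yields the curves compared below; the Random baseline swaps the max-std query for alu.random_sampling, which (as the name suggests) picks pool points at random.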
warnings.filterwarnings("ignore")

# Change/add sampling strategies in the dictionary below
sampling_dict_reg = {
    "Uncertainty": {
        "model": clone(model_0),
        "query_strat": max_std_sampling,
        "score": [],
        "marker": None,
        "type": "uncertainty",
    },
    "Random": {
        "model": clone(model_0),
        "query_strat": alu.random_sampling,
        "score": [],
        "marker": None,
        "type": "random",
        "n_iter": 25,
        "disp_randoms": 4,
        "mode": "all_random",
    },
    "QBC": {
        "model": committee_reg_list,
        "query_strat": max_std_sampling,
        "score": [],
        "marker": None,
        "type": "qbc",
    },
}

# Specify latexify options
latexify_dict = {
    "fig_name": "uncertanity_sampling_regression",
    "legend_params": {"loc": "upper right", "fontsize": 5, "framealpha": 0.5},
    "latexify_parms": {"width_scale_factor": 2},
}

compare_sampling(
    sampling_dict_reg,
    "Regression",
    n_queries=15,
    X_pool=X_pool,
    y_pool=y_pool,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    latexify_dict=latexify_dict,
)
[Figure: test RMSE vs. number of points queried, comparing Uncertainty sampling, Random sampling (individual runs and their mean), and QBC sampling]