GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/book2/34/active_learning_compare_reg.ipynb
Kernel: Python 3 (ipykernel)
# sklearn Models
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import linear_model
from sklearn.gaussian_process.kernels import (
    WhiteKernel,
    Matern,
    ConstantKernel,
)

# sklearn Utils
from sklearn.base import BaseEstimator
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import numpy as np
from numpy.random import default_rng
import random

np.random.seed(42)

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

try:
    import probml_utils as pml
except ModuleNotFoundError:
    %pip install -qq git+https://github.com/probml/probml-utils.git
    import probml_utils as pml

from probml_utils import active_learn_utils as alu

try:
    import modAL
except ModuleNotFoundError:
    %pip install -qq modAL
    import modAL

from modAL.disagreement import max_std_sampling
from modAL.models import ActiveLearner
from modAL.utils.data import modALinput
from modAL.models import CommitteeRegressor

from typing import Callable
from typing import Tuple
from typing import Optional
import warnings
from typing import List
from functools import reduce
import operator
# Default RC params for grid and line plots
p = plt.rcParams

# Grid settings
p["grid.color"] = "#999999"
p["grid.linestyle"] = "--"

# Marker and line size settings
if pml.is_latexify_enabled():
    p["lines.markersize"] = 3
    p["lines.markeredgewidth"] = 1
    p["lines.linewidth"] = 1.5
    p["grid.linewidth"] = 0.5
else:
    p["lines.markersize"] = 5
    p["lines.markeredgewidth"] = 1.5
    p["lines.linewidth"] = 2
def plot_data(
    sampling_dict: dict,
    n_queries: int,
    problem_type: str,
    latexify_dict: dict = {"fig_name": None, "legend_pos": "best"},
    grid_flag: bool = True,
) -> None:
    """
    Make accuracy/RMSE vs. number-of-queries plots.

    Args:
    ----------
    sampling_dict : Dictionary with a log of all the points queried along with other info
    n_queries : The number of query iterations that were run for each sampling technique
    problem_type : Classification or Regression problem
    latexify_dict : Options for latexified figures
    grid_flag : Toggle grids on the plots

    Returns:
    ----------
    None
    """
    # Initialize figure
    fig = plt.figure()
    ax = plt.gca()

    # Toggle grid
    ax.grid(grid_flag)

    # Number of iterations for the x-axis labels
    x_axis = np.array([i for i in range(n_queries + 1)])

    for key, value in sampling_dict.items():
        # If not random sampling, just plot the score
        if value["type"] != "random":
            plt.plot(x_axis, value["score"], label=key + " sampling", marker=value["marker"])
        # For random sampling, plot individual random runs along with the mean accuracy or RMSE
        elif value["type"] == "random":
            # Choose n random score curves to display
            all_random_iter = len(value["score"]) - 1
            random_choice = np.random.choice(all_random_iter, size=value["disp_randoms"], replace=False)

            # Plot the individual random curves
            for i in random_choice:
                scores = value["score"][i]
                plt.plot(
                    x_axis,
                    scores,
                    label=key + " samples",
                    marker=value["marker"],
                    color="blueviolet",
                    alpha=0.25,
                )

            # Plot the mean score curve
            plt.plot(
                x_axis,
                value["mean_score"],
                label=f"Mean of {key} sampling",
                marker=value["marker"],
                color="blueviolet",
            )

    sns.despine()
    plt.xticks([i for i in range(x_axis[0], x_axis[-1] + 1, 5)])
    plt.xlabel("Number of Points queried")

    # Remove duplicate legend entries created by the multiple random curves
    handles, labels = ax.get_legend_handles_labels()
    newLabels, newHandles = [], []
    for handle, label in zip(handles, labels):
        if label not in newLabels:
            newLabels.append(label)
            newHandles.append(handle)

    # Different y-axis labels based on problem type
    if problem_type == "Classification":
        plt.ylabel("Accuracy")
    else:
        plt.ylabel("RMSE")

    # Custom plotting options
    if pml.is_latexify_enabled() and latexify_dict["fig_name"]:
        pml.latexify(**latexify_dict["latexify_parms"])
        plt.legend(handles=newHandles, labels=newLabels, **latexify_dict["legend_params"])
        pml.savefig(latexify_dict["fig_name"])
    else:
        plt.legend(
            handles=newHandles,
            labels=newLabels,
            loc="upper right",
            bbox_to_anchor=(1.55, 1),
        )
    plt.show()
def compare_sampling(
    sampling_dict_data: dict,
    problem_type: str,
    n_queries: int,
    X_pool: np.ndarray,
    y_pool: np.ndarray,
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    latexify_dict={"fig_name": None, "legend_pos": "best"},
) -> None:
    """
    Driver function to run the uncertainty, QBC, or random sampling approaches.

    Args:
    ----------
    sampling_dict_data : Dictionary with a log of all the points queried along with other info
    problem_type : Classification or Regression problem
    n_queries : The number of query iterations to run for each sampling technique
    latexify_dict : Options for latexified figures
    X_pool : Features of the pool data
    y_pool : Labels of the pool data
    X_train : Features of the train data
    y_train : Labels of the train data
    X_test : Features of the test data
    y_test : Labels of the test data

    Returns:
    ----------
    None
    """
    # Loop over each sampling type
    for key, value in sampling_dict_data.items():
        # Uncertainty sampling
        if value["type"] == "uncertainty":
            uncertainty_info_dict = alu.uncertainty_sampling(
                value["query_strat"],
                value["model"],
                key,
                n_queries,
                problem_type=problem_type,
                X_pool=X_pool,
                y_pool=y_pool,
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
            )
            value["score"] = uncertainty_info_dict["score"]
        # Query by committee (QBC)
        elif value["type"] == "qbc":
            qbc_info_dict = alu.qbc(
                value["query_strat"],
                value["model"],
                problem_type,
                n_queries,
                X_pool=X_pool,
                y_pool=y_pool,
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
            )
            value["score"] = qbc_info_dict["score"]
        # Random sampling
        else:
            for i in range(value["n_iter"]):
                uncertainty_info_dict = alu.uncertainty_sampling(
                    value["query_strat"],
                    value["model"],
                    key,
                    n_queries,
                    problem_type=problem_type,
                    X_pool=X_pool,
                    y_pool=y_pool,
                    X_train=X_train,
                    y_train=y_train,
                    X_test=X_test,
                    y_test=y_test,
                )
                value["score"].append(uncertainty_info_dict["score"])

            # Mean and std dev of the errors over the random-sampling runs
            err_arr = np.array(sampling_dict_data[key]["score"])
            mean_err_arr = np.mean(err_arr, axis=0)
            stddev_err_arr = np.std(err_arr, axis=0)
            sampling_dict_data[key]["mean_score"] = mean_err_arr
            sampling_dict_data[key]["std_dev"] = stddev_err_arr

    # Plot the data
    plot_data(sampling_dict_data, n_queries, problem_type, latexify_dict)

Regression Error Plots with different Uncertainty Sampling techniques

# Create custom dataset for Regression
X_test, X_train, X_pool, y_test, y_train, y_pool = alu.make_data_reg()
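The internals of alu.make_data_reg are not shown in this notebook. As a rough, hypothetical sketch of the kind of split it returns (a small labelled train set, a large unlabelled pool to query from, and a held-out test set), a comparable 1-D dataset could be built as below; the actual target function, noise level, and split sizes used by the utility may differ. The sketch reuses default_rng and train_test_split from the imports above, and make_data_reg_sketch is an illustrative name, not part of probml_utils.

# Hypothetical stand-in for alu.make_data_reg (the real utility may differ):
# a noisy 1-D nonlinear function split into a small labelled train set,
# a large unlabelled query pool, and a held-out test set.
def make_data_reg_sketch(n_points=200, n_train=5, n_test=50, seed=0):
    rng = default_rng(seed)
    X = rng.uniform(-3, 3, size=(n_points, 1))
    y = np.sin(2 * X).ravel() + 0.1 * rng.standard_normal(n_points)

    # Hold out a test set, then carve a small labelled train set out of the rest;
    # everything remaining becomes the unlabelled pool for active learning.
    X_rest, X_te, y_rest, y_te = train_test_split(X, y, test_size=n_test, random_state=seed)
    X_tr, X_po = X_rest[:n_train], X_rest[n_train:]
    y_tr, y_po = y_rest[:n_train], y_rest[n_train:]
    return X_te, X_tr, X_po, y_te, y_tr, y_po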
pml.latexify(width_scale_factor=2)

fig = plt.figure()
ax = plt.gca()

ax.scatter(X_pool, y_pool, c="blue", label="Pool Data", alpha=0.2)
ax.scatter(X_train, y_train, c="tab:red", label="Train Data")
ax.set_xlabel("$X$")
ax.set_ylabel("$y$")

if pml.is_latexify_enabled():
    legend_fontsize = 5
    ax.legend(loc="upper left", fontsize=6)
    pml.savefig("1d_regression_dataset")
else:
    ax.legend(loc="upper left")
/home/nitish1295/.local/lib/python3.8/site-packages/probml_utils/plotting.py:25: UserWarning: LATEXIFY environment variable not set, not latexifying warnings.warn("LATEXIFY environment variable not set, not latexifying")
[Figure: 1-D regression dataset, showing the unlabelled pool data and the small initial train set]
# Choose a model
kernel_0 = ConstantKernel(0.1) * (Matern(length_scale=0.1, nu=2.5)) + WhiteKernel(noise_level_bounds=(1e-10, 1e1))
kernel_1 = ConstantKernel(0.1) * (Matern(length_scale=0.1, nu=1.5)) + WhiteKernel(noise_level_bounds=(1e-10, 1e1))

model_0 = make_pipeline(
    StandardScaler(),
    GaussianProcessRegressor(kernel=kernel_0, n_restarts_optimizer=10, random_state=0),
)
model_1 = make_pipeline(
    StandardScaler(),
    GaussianProcessRegressor(kernel=kernel_1, n_restarts_optimizer=10, random_state=0),
)

committee_reg_list = [model_0, model_1]
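The query loops themselves live in probml_utils.active_learn_utils (alu.uncertainty_sampling and alu.qbc). As a minimal sketch of what a single query step looks like in modAL, assuming the models and data defined in the cells above and the (query_idx, query_instance) return convention of recent modAL releases, the cell below shows uncertainty sampling with one GP and query-by-committee with the two-member committee. It is an illustration, not necessarily the exact implementation inside the utilities.

# Illustrative sketch only (not necessarily what alu.uncertainty_sampling / alu.qbc
# do internally): one query step with modAL, reusing the models and data from above.

# Uncertainty sampling: a single GP queries the pool point with the largest
# predictive standard deviation.
learner = ActiveLearner(
    estimator=clone(model_0),
    query_strategy=max_std_sampling,
    X_training=X_train,
    y_training=y_train,
)
query_idx, query_instance = learner.query(X_pool)
learner.teach(X_pool[query_idx], y_pool[query_idx])  # label the queried point and refit

# Query by committee: two GPs with different Matern kernels; the committee queries
# the point where its members' predictions disagree most (largest std across members).
committee = CommitteeRegressor(
    learner_list=[
        ActiveLearner(estimator=clone(m), X_training=X_train, y_training=y_train)
        for m in committee_reg_list
    ],
    query_strategy=max_std_sampling,
)
query_idx, query_instance = committee.query(X_pool)
committee.teach(X_pool[query_idx], y_pool[query_idx])

Repeating such a step n_queries times, moving each queried point from the pool to the train set and recording the test RMSE after every refit, yields the curves compared below; the Random baseline swaps the max-std query for alu.random_sampling, which (as the name suggests) picks pool points at random.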
warnings.filterwarnings("ignore")

# Change/add sampling strategies in the dictionary below
sampling_dict_reg = {
    "Uncertainty": {
        "model": clone(model_0),
        "query_strat": max_std_sampling,
        "score": [],
        "marker": None,
        "type": "uncertainty",
    },
    "Random": {
        "model": clone(model_0),
        "query_strat": alu.random_sampling,
        "score": [],
        "marker": None,
        "type": "random",
        "n_iter": 25,
        "disp_randoms": 4,
        "mode": "all_random",
    },
    "QBC": {
        "model": committee_reg_list,
        "query_strat": max_std_sampling,
        "score": [],
        "marker": None,
        "type": "qbc",
    },
}

# Specify latexify options
latexify_dict = {
    "fig_name": "uncertanity_sampling_regression",
    "legend_params": {"loc": "upper right", "fontsize": 5, "framealpha": 0.5},
    "latexify_parms": {"width_scale_factor": 2},
}

compare_sampling(
    sampling_dict_reg,
    "Regression",
    n_queries=15,
    X_pool=X_pool,
    y_pool=y_pool,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    latexify_dict=latexify_dict,
)
[Figure: test RMSE vs. number of points queried, comparing Uncertainty sampling, Random sampling (individual runs and their mean), and QBC sampling]