Path: blob/master/deprecated/scripts/ard_vb_linreg_logreg.py
# Bayesian Linear and logistic Regression with ARD prior (fitted with Variational Bayes)
# https://github.com/AmazaspShumik/sklearn-bayes/blob/master/skbayes/rvm_ard_models/vrvm.py

import superimport

import numpy as np
from scipy.special import expit
from scipy.linalg import solve_triangular
try:
    from sklearn.linear_model._base import LinearModel, LinearClassifierMixin
except ImportError:  # older scikit-learn releases expose these in the public module
    from sklearn.linear_model.base import LinearModel, LinearClassifierMixin
from sklearn.base import RegressorMixin, BaseEstimator
from sklearn.utils import check_X_y, check_array
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils import as_float_array
import warnings


class VBRegressionARD(LinearModel, RegressorMixin):
    '''
    Bayesian Linear Regression with ARD prior (fitted with Variational Bayes)

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 100)
        Maximum number of iterations

    fit_intercept: boolean, optional (DEFAULT = True)
        If True, intercept will be used in computation

    tol: float, optional (DEFAULT = 1e-3)
        If the absolute change in the precision parameters for the weights is
        below this threshold, the algorithm terminates.

    copy_X: boolean, optional (DEFAULT = True)
        If True, X will be copied, otherwise it may be overwritten.

    verbose: boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model

    a: float, optional (DEFAULT = 1e-5)
        Shape parameter for Gamma distributed precision of weights

    b: float, optional (DEFAULT = 1e-5)
        Rate parameter for Gamma distributed precision of weights

    c: float, optional (DEFAULT = 1e-5)
        Shape parameter for Gamma distributed precision of noise

    d: float, optional (DEFAULT = 1e-5)
        Rate parameter for Gamma distributed precision of noise

    prune_thresh: float, optional (DEFAULT = 1e-3)
        Threshold for pruning out variables (applied after model is fitted)

    Attributes
    ----------
    coef_ : array, shape = (n_features,)
        Coefficients of the regression model (mean of posterior distribution)

    active_ : array, dtype = bool, shape = (n_features,)
        True for non-zero coefficients, False otherwise

    sigma_ : array, shape = (n_active_features, n_active_features)
        Estimated covariance matrix of the weights, computed only
        for non-zero coefficients

    References
    ----------
    [1] Bishop & Tipping (2000), Variational Relevance Vector Machine
    [2] Jan Drugowitsch (2014), Variational Bayesian Inference for Bayesian Linear
        and Logistic Regression
    [3] Bishop (2006), Pattern Recognition and Machine Learning (ch. 7)
    '''

    def __init__(self, n_iter=100, tol=1e-3, fit_intercept=True,
                 a=1e-5, b=1e-5, c=1e-5, d=1e-5, copy_X=True,
                 prune_thresh=1e-3, verbose=False):
        self.n_iter = n_iter
        self.tol = tol
        self.fit_intercept = fit_intercept
        self.a, self.b = a, b
        self.c, self.d = c, d
        self.copy_X = copy_X
        self.verbose = verbose
        self.prune_thresh = prune_thresh

    def _center_data(self, X, y):
        ''' Centers data '''
        X = as_float_array(X, copy=self.copy_X)
        # normalisation should be done in preprocessing!
        X_std = np.ones(X.shape[1], dtype=X.dtype)
        if self.fit_intercept:
            X_mean = np.average(X, axis=0)
            y_mean = np.average(y, axis=0)
            X -= X_mean
            y = y - y_mean
        else:
            X_mean = np.zeros(X.shape[1], dtype=X.dtype)
            y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
        return X, y, X_mean, y_mean, X_std
    def fit(self, X, y):
        '''
        Fits variational Bayesian linear regression with ARD prior

        Parameters
        ----------
        X: array-like of size [n_samples, n_features]
           Training data, matrix of explanatory variables

        y: array-like of size (n_samples,)
           Target values

        Returns
        -------
        self : object
            Returns self.
        '''
        # precompute some values for faster iterations
        X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
        n_samples, n_features = X.shape
        X, y, X_mean, y_mean, X_std = self._center_data(X, y)
        XX = np.dot(X.T, X)
        XY = np.dot(X.T, y)
        Y2 = np.sum(y ** 2)

        # shape parameters of q(alpha) and q(tau); these do not change during iterations
        a = (self.a + 0.5) * np.ones(n_features)
        c = self.c + 0.5 * n_samples
        # initial values of rate parameters b, d before mean field updates
        d = self.d
        b = self.b * np.ones(n_features)
        active = np.ones(n_features, dtype=bool)
        w0 = np.zeros(n_features)
        w = np.copy(w0)

        for i in range(self.n_iter):
            # ---------------------- update q(w) -----------------------
            # expectations for precision of noise & precision of weights
            e_tau = c / d
            e_A = a / b
            XXa = XX[active, :][:, active]
            XYa = XY[active]
            Xa = X[:, active]
            # parameters of updated posterior distribution
            w[active], Ri = self._posterior_weights(XXa, XYa, e_tau, e_A[active])

            # --------------------- update q(tau) ------------------------
            # update rate parameter for Gamma distributed precision of noise
            XSX = np.sum(np.dot(Xa, Ri.T) ** 2)
            XMw = np.sum(np.dot(Xa, w[active]) ** 2)
            XYw = np.sum(w[active] * XYa)
            d = self.d + 0.5 * (Y2 + XMw + XSX) - XYw

            # -------------------- update q(alpha(j)) for each j ----------
            # update rate parameter for Gamma distributed precision of weights;
            # np.sum(Ri ** 2, axis=0) is the diagonal of the posterior covariance Ri.T @ Ri
            b[active] = self.b + 0.5 * (w[active] ** 2 + np.sum(Ri ** 2, axis=0))

            # -------------------- check convergence ----------------------
            # determine relevant features as described in Bishop & Tipping
            # (i.e. using mean of posterior distribution)
            active = np.abs(w) > self.prune_thresh

            # make sure there is at least one relevant feature
            if np.sum(active) == 0:
                active[np.argmax(np.abs(w))] = True
            # all irrelevant features are forced to zero
            w[~active] = 0
            # check convergence
            if np.sum(abs(w - w0) > self.tol) == 0 or i == self.n_iter - 1:
                break
            w0 = np.copy(w)
            # if only one relevant feature => terminate
            if np.sum(active) == 1:
                if X.shape[1] > 3 and self.prune_thresh > 1e-1:
                    warnings.warn("Only one relevant feature found! It can be useful "
                                  "to decrease the value of parameter prune_thresh")
                break

        # update parameters after the last iteration
        e_tau = c / d
        e_A = a / b
        XXa = XX[active, :][:, active]
        XYa = XY[active]
        w[active], self.sigma_ = self._posterior_weights(XXa, XYa, e_tau, e_A[active], True)
        self._e_tau_ = e_tau
        self.coef_ = w
        self._set_intercept(X_mean, y_mean, X_std)
        self.active_ = active
        return self

    def predict_dist(self, X):
        '''
        Computes the predictive distribution for a test set.
        The predictive distribution for each data point is a one dimensional
        Gaussian and is therefore characterised by its mean and variance.

        Parameters
        ----------
        X: {array-like, sparse} [n_samples_test, n_features]
           Test data, matrix of explanatory variables

        Returns
        -------
        y_hat: numpy array of size (n_samples_test,)
           Estimated values of targets on test set (mean of predictive distribution)

        var_hat: numpy array of size (n_samples_test,)
           Variance of predictive distribution (noise variance + model uncertainty)
        '''
        y_hat = self._decision_function(X)
        data_noise = 1. / self._e_tau_
        model_noise = np.sum(np.dot(X[:, self.active_], self.sigma_) * X[:, self.active_], axis=1)
        var_hat = data_noise + model_noise
        return y_hat, var_hat

    def _posterior_weights(self, XX, XY, exp_tau, exp_A, full_covar=False):
        '''
        Calculates parameters of the posterior distribution of weights

        Parameters
        ----------
        XX: numpy array of size [n_active_features, n_active_features]
            X.T @ X restricted to active features (changes at each iteration)

        XY: numpy array of size (n_active_features,)
            X.T @ y restricted to active features (for faster computations)

        exp_tau: float
            Mean of precision parameter of noise

        exp_A: numpy array of size (n_active_features,)
            Vector of precisions for weights

        Returns
        -------
        [Mw, Sigma]: list of two numpy arrays
            Mw: mean of posterior distribution
            Sigma: covariance matrix if full_covar is True, otherwise the inverse
            Cholesky factor Ri of the posterior precision (Sigma = Ri.T @ Ri)
        '''
        # compute posterior precision matrix
        S = exp_tau * XX
        np.fill_diagonal(S, np.diag(S) + exp_A)

        # cholesky decomposition
        R = np.linalg.cholesky(S)

        # find mean of posterior distribution
        RtMw = solve_triangular(R, exp_tau * XY, lower=True, check_finite=False)
        Mw = solve_triangular(R.T, RtMw, lower=False, check_finite=False)

        # use cholesky decomposition of S to find inverse (or diagonal of inverse)
        Ri = solve_triangular(R, np.eye(R.shape[1]), lower=True, check_finite=False)
        if full_covar:
            Sigma = np.dot(Ri.T, Ri)
            return [Mw, Sigma]
        else:
            return [Mw, Ri]
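
# -----------------------------------------------------------------------------
# Minimal usage sketch for VBRegressionARD (illustrative only, not part of the
# original module; the helper name and the synthetic data are made up).  It fits
# the model on data where only the first three features are relevant and shows
# how the ARD prior prunes the rest.
def _demo_vb_regression_ard(n_samples=200, n_features=20, seed=0):
    rng = np.random.RandomState(seed)
    X = rng.randn(n_samples, n_features)
    w_true = np.zeros(n_features)
    w_true[:3] = [2.0, -3.0, 1.5]               # only three relevant features
    y = X @ w_true + 0.1 * rng.randn(n_samples)
    model = VBRegressionARD(n_iter=100, prune_thresh=1e-2).fit(X, y)
    y_hat, var_hat = model.predict_dist(X)      # predictive mean and variance
    print("active features:", np.where(model.active_)[0])
    print("coefficients   :", np.round(model.coef_[model.active_], 2))
    print("mean pred. var :", float(np.mean(var_hat)))
    return model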
# ---------------------------- Classification ----------------------------------


def lam(eps):
    '''
    Calculates lambda(eps), part of the local variational approximation
    to the sigmoid function
    '''
    return 0.5 / eps * (expit(eps) - 0.5)
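
# Note: lam(eps) is the lambda(xi) function of the Jaakkola & Jordan local
# variational (quadratic) bound on the logistic sigmoid (Bishop 2006, sec. 10.6):
#
#     sigmoid(x) >= sigmoid(eps) * exp((x - eps) / 2 - lam(eps) * (x**2 - eps**2))
#
# with lam(eps) = (sigmoid(eps) - 0.5) / (2 * eps) = tanh(eps / 2) / (4 * eps).
# The bound is exact at x = +/- eps, which is why eps is re-estimated in every
# iteration of VBClassificationARD._fit below.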
class VBClassificationARD(LinearClassifierMixin, BaseEstimator):
    '''
    Variational Bayesian Logistic Regression with local variational approximation
    and ARD prior.

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 100)
        Maximum number of iterations

    tol: float, optional (DEFAULT = 1e-3)
        Convergence threshold; if the change in coefficients is less than the
        threshold the algorithm is terminated

    fit_intercept: bool, optional (DEFAULT = True)
        If True, uses a bias term in model fitting

    a: float, optional (DEFAULT = 1e-4)
        Shape parameter for Gamma prior on precision parameter of coefficients

    b: float, optional (DEFAULT = 1e-4)
        Rate parameter for Gamma prior on precision parameter of coefficients

    prune_thresh: float, optional (DEFAULT = 1e-4)
        Threshold for pruning out variables (applied after model is fitted)

    verbose: bool, optional (DEFAULT = True)
        Verbose mode

    Attributes
    ----------
    coef_ : array, shape = (n_classes, n_features)
        Coefficients of the regression model (mean of posterior distribution)

    sigma_ : list of arrays, one per class
        Estimated covariance matrix of the weights, computed only
        for non-zero coefficients

    intercept_ : list of length n_classes (one entry for binary problems)
        Intercepts

    active_ : list of boolean arrays, one per class
        True for non-zero coefficients, False otherwise

    References
    ----------
    [1] Bishop (2006), Pattern Recognition and Machine Learning (chapters 7, 10)
    [2] Murphy (2012), Machine Learning: A Probabilistic Perspective (chapters 14, 21)
    [3] Bishop & Tipping (2000), Variational Relevance Vector Machine
    '''

    def __init__(self, n_iter=100, tol=1e-3, fit_intercept=True,
                 a=1e-4, b=1e-4, prune_thresh=1e-4, verbose=True):
        self.n_iter = n_iter
        self.tol = tol
        self.verbose = verbose
        self.prune_thresh = prune_thresh
        self.fit_intercept = fit_intercept
        self.a = a
        self.b = b

    def fit(self, X, y):
        '''
        Fits variational Bayesian Logistic Regression with local variational bound

        Parameters
        ----------
        X: array-like of size (n_samples, n_features)
           Matrix of explanatory variables

        y: array-like of size (n_samples,)
           Vector of dependent variables

        Returns
        -------
        self: object
           self
        '''
        # preprocess data
        X, y = check_X_y(X, y, dtype=np.float64)
        check_classification_targets(y)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        # take into account bias term if required
        n_samples, n_features = X.shape
        n_features = n_features + int(self.fit_intercept)
        if self.fit_intercept:
            X = np.hstack((np.ones([n_samples, 1]), X))

        # handle multiclass problems using One-vs-Rest
        if n_classes < 2:
            raise ValueError("Need samples of at least 2 classes")
        if n_classes > 2:
            self.coef_, self.sigma_ = [0] * n_classes, [0] * n_classes
            self.intercept_ = [0] * n_classes
            self.active_ = [0] * n_classes
        else:
            self.coef_, self.sigma_, self.intercept_ = [0], [0], [0]
            self.active_ = [0]

        # hyperparameters of Gamma distributed precision of weights
        a = self.a + 0.5 * np.ones(n_features)
        b = self.b * np.ones(n_features)

        for i in range(len(self.coef_)):
            if n_classes == 2:
                pos_class = self.classes_[1]
            else:
                pos_class = self.classes_[i]
            mask = (y == pos_class)
            y_bin = np.ones(y.shape)
            y_bin[~mask] = 0
            # pass a copy of b so each one-vs-rest subproblem starts from the prior
            coef_, sigma_, intercept_, active_ = self._fit(X, y_bin, a, np.copy(b))
            self.coef_[i] = coef_
            self.intercept_[i] = intercept_
            self.sigma_[i] = sigma_
            self.active_[i] = active_

        self.coef_ = np.asarray(self.coef_)
        return self

    def predict_proba(self, X):
        '''
        Predicts probabilities of targets for test set

        Parameters
        ----------
        X: array-like of size [n_samples_test, n_features]
           Matrix of explanatory variables (test set)

        Returns
        -------
        probs: numpy array of size [n_samples_test, n_classes]
           Estimated probabilities of target classes
        '''
        scores = self.decision_function(X)
        if self.fit_intercept:
            X = np.hstack((np.ones([X.shape[0], 1]), X))
        # moderated output: scale the scores by the posterior variance of the weights
        var = [np.sum(np.dot(X[:, act], s) * X[:, act], axis=1)
               for act, s in zip(self.active_, self.sigma_)]
        sigma = np.asarray(var)
        ks = 1. / (1. + np.pi * sigma / 8) ** 0.5
        probs = expit(scores.T * ks).T
        if probs.shape[1] == 1:
            probs = np.hstack([1 - probs, probs])
        else:
            probs /= np.reshape(np.sum(probs, axis=1), (probs.shape[0], 1))
        return probs
    def _fit(self, X, y, a, b):
        '''
        Fits a single binary classifier (used in the one-vs-rest loop of `fit`)
        '''
        eps = 1  # default starting parameter for the Jaakkola-Jordan bound
        w0 = np.zeros(X.shape[1])
        w = np.copy(w0)
        active = np.ones(X.shape[1], dtype=bool)
        XY = np.dot(X.T, y - 0.5)

        for i in range(self.n_iter):
            # In the E-step we update the approximation of the
            # posterior distribution q(w, alpha) = q(w) * q(alpha)

            # --------- update q(w) ------------------
            l = lam(eps)
            Xa = X[:, active]
            XYa = XY[active]  # equals np.dot(Xa.T, (y - 0.5))
            w[active], Ri = self._posterior_dist(Xa, l, a[active], b[active], XYa)

            # -------- update q(alpha) ---------------
            # np.sum(Ri ** 2, axis=0) is the diagonal of the posterior covariance Ri.T @ Ri
            b[active] = self.b + 0.5 * (w[active] ** 2 + np.sum(Ri ** 2, axis=0))

            # -------- update eps --------------------
            # In the M-step we update parameter eps which controls the
            # accuracy of the local variational approximation to the lower bound
            XMX = np.dot(Xa, w[active]) ** 2
            XSX = np.sum(np.dot(Xa, Ri.T) ** 2, axis=1)
            eps = np.sqrt(XMX + XSX)

            # determine relevant features as described in Bishop & Tipping
            # (i.e. using mean of posterior distribution)
            active = np.abs(w) > self.prune_thresh

            # do not prune the intercept & make sure there is at least one relevant
            # feature; if everything was pruned, keep the feature with the largest
            # posterior mean
            if self.fit_intercept:
                active[0] = True
                if np.sum(active[1:]) == 0:
                    active[np.argmax(np.abs(w[1:])) + 1] = True  # +1 skips the intercept column
            else:
                if np.sum(active) == 0:
                    active[np.argmax(np.abs(w))] = True
            # all irrelevant features are forced to zero
            w[~active] = 0
            # check convergence
            if np.sum(abs(w - w0) > self.tol) == 0 or i == self.n_iter - 1:
                break
            w0 = np.copy(w)
            # if only one relevant feature => terminate
            if np.sum(active) - 1 * self.fit_intercept == 1:
                if X.shape[1] > 3 and self.prune_thresh > 1e-1:
                    warnings.warn("Only one relevant feature found! It can be useful "
                                  "to decrease the value of parameter prune_thresh")
                break

        # final update with the full covariance of the active weights
        l = lam(eps)
        Xa = X[:, active]
        XYa = np.dot(Xa.T, (y - 0.5))
        w[active], sigma_ = self._posterior_dist(Xa, l, a[active], b[active], XYa, True)

        # separate intercept & coefficients
        intercept_ = 0
        if self.fit_intercept:
            intercept_ = w[0]
            coef_ = np.copy(w[1:])
        else:
            coef_ = w
        return coef_, sigma_, intercept_, active

    def _posterior_dist(self, X, l, a, b, XY, full_covar=False):
        '''
        Finds Gaussian approximation to the posterior of the coefficients
        '''
        # posterior precision under the Jaakkola-Jordan bound: 2 * X.T diag(l) X + diag(alpha)
        sigma_inv = 2 * np.dot(X.T * l, X)
        alpha_vec = a / b
        if self.fit_intercept:
            alpha_vec[0] = np.finfo(np.float64).eps  # essentially flat prior on the intercept
        np.fill_diagonal(sigma_inv, np.diag(sigma_inv) + alpha_vec)
        R = np.linalg.cholesky(sigma_inv)
        Z = solve_triangular(R, XY, lower=True)
        mean_ = solve_triangular(R.T, Z, lower=False)

        # use cholesky decomposition of the precision to find the covariance (or its factor)
        Ri = solve_triangular(R, np.eye(X.shape[1]), lower=True)
        if full_covar:
            sigma_ = np.dot(Ri.T, Ri)
            return mean_, sigma_
        else:
            return mean_, Ri
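
# -----------------------------------------------------------------------------
# Minimal usage sketch for VBClassificationARD (illustrative only, not part of
# the original module; the helper name and the synthetic data are made up).  It
# fits the model on a binary problem where only two features carry signal.
def _demo_vb_classification_ard(n_samples=300, n_features=10, seed=0):
    rng = np.random.RandomState(seed)
    X = rng.randn(n_samples, n_features)
    logits = 3.0 * X[:, 0] - 2.0 * X[:, 1]          # only two relevant features
    y = (logits + rng.logistic(size=n_samples) > 0).astype(int)
    clf = VBClassificationARD(n_iter=100, prune_thresh=1e-3).fit(X, y)
    probs = clf.predict_proba(X)                    # moderated class probabilities
    print("active columns (0 = intercept):", np.where(clf.active_[0])[0])
    print("train accuracy:", float(np.mean(clf.predict(X) == y)))
    print("first probabilities:\n", np.round(probs[:3], 3))
    return clf


if __name__ == "__main__":
    # run both demo sketches when the script is executed directly
    _demo_vb_regression_ard()
    _demo_vb_classification_ard()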