Path: blob/master/deprecated/scripts/ard_linreg_logreg.py
# Linear and logistic regression with Automatic Relevance Determination
# (Fast Version uses Sparse Bayesian Learning)

# https://github.com/AmazaspShumik/sklearn-bayes/blob/master/skbayes/rvm_ard_models/fast_rvm.py

import superimport

import numpy as np
from sklearn.base import RegressorMixin, BaseEstimator
#from sklearn.externals import six
from sklearn.linear_model.base import LinearModel, LinearClassifierMixin
from sklearn.utils import check_X_y, check_array, as_float_array
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.extmath import pinvh, log_logistic, safe_sparse_dot
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.utils.validation import check_is_fitted
from scipy.special import expit
from scipy.optimize import fmin_l_bfgs_b
from scipy.linalg import solve_triangular
from scipy.stats import logistic
from numpy.linalg import LinAlgError
import scipy.sparse
import warnings

# TODO: predict_proba for RVC with Laplace Approximation


def update_precisions(Q, S, q, s, A, active, tol, n_samples, clf_bias):
    '''
    Selects one feature to be added to, recomputed in, or deleted from the
    model, based on the effect it has on the log marginal likelihood.
    '''
    # initialise vector holding changes in log marginal likelihood
    deltaL = np.zeros(Q.shape[0])

    # identify features that can be added, recomputed and deleted in model
    theta     = q**2 - s
    add       = (theta > 0) * (active == False)
    recompute = (theta > 0) * (active == True)
    delete    = ~(add + recompute)

    # compute sparsity & quality parameters corresponding to features in
    # three groups identified above
    Qadd, Sadd       = Q[add], S[add]
    Qrec, Srec, Arec = Q[recompute], S[recompute], A[recompute]
    Qdel, Sdel, Adel = Q[delete], S[delete], A[delete]

    # compute new alpha's (precision parameters) for features that are
    # currently in model and will be recomputed
    Anew        = s[recompute]**2 / (theta[recompute] + np.finfo(np.float32).eps)
    delta_alpha = (1. / Anew - 1. / Arec)

    # compute change in log marginal likelihood
    deltaL[add]       = (Qadd**2 - Sadd) / Sadd + np.log(Sadd / Qadd**2)
    denom             = np.maximum(1e-5, 1 + Srec * delta_alpha)  # Kevin Murphy hack
    deltaL[recompute] = Qrec**2 / (Srec + 1. / delta_alpha) - np.log(denom)
    deltaL[delete]    = Qdel**2 / (Sdel - Adel) - np.log(1 - Sdel / Adel)
    deltaL            = deltaL / n_samples

    # find feature which caused largest change in likelihood
    feature_index = np.argmax(deltaL)

    # no deletions or additions
    same_features = np.sum(theta[~recompute] > 0) == 0

    # changes in precision for features already in model are below threshold
    no_delta = np.sum(abs(Anew - Arec) > tol) == 0

    # check convergence: if there are no features to add or delete and the
    # change in precision for current features is small, then terminate
    converged = False
    if same_features and no_delta:
        converged = True
        return [A, converged]

    # if not converged update precision parameter of weights and return
    if theta[feature_index] > 0:
        A[feature_index] = s[feature_index]**2 / theta[feature_index]
        if active[feature_index] == False:
            active[feature_index] = True
    else:
        # at least two active features
        if active[feature_index] == True and np.sum(active) >= 2:
            # do not remove bias term in classification
            # (in regression it is factored in through centering)
            if not (feature_index == 0 and clf_bias):
                active[feature_index] = False
                A[feature_index] = np.PINF

    return [A, converged]
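
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): a toy call showing
# how update_precisions adds a feature when theta = q**2 - s > 0.  With no
# active features, q = Q and s = S, so feature 0 (theta = 8 > 0) is added and
# its precision becomes s**2 / theta, while feature 1 stays pruned at +inf.
# The helper name `_demo_update_precisions` and all numbers are made up.
# ---------------------------------------------------------------------------
def _demo_update_precisions():
    Q = np.array([3.0, 0.1])                # quality of each candidate feature
    S = np.array([1.0, 1.0])                # sparsity of each candidate feature
    A = np.PINF * np.ones(2)                # precisions: +inf => feature not in model
    active = np.zeros(2, dtype=np.bool)     # no features active yet
    A, converged = update_precisions(Q, S, np.copy(Q), np.copy(S), A, active,
                                     tol=1e-3, n_samples=10, clf_bias=False)
    return A, active, converged             # A[0] == 1/8, active[0] == True
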

###############################################################################
# ARD REGRESSION AND CLASSIFICATION
###############################################################################


#-------------------------- Regression ARD ------------------------------------


class RegressionARD(LinearModel, RegressorMixin):
    '''
    Regression with Automatic Relevance Determination (Fast Version uses
    Sparse Bayesian Learning)

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 300)
        Maximum number of iterations

    tol: float, optional (DEFAULT = 1e-3)
        If absolute change in precision parameter for weights is below threshold
        algorithm terminates.

    fit_intercept : boolean, optional (DEFAULT = True)
        whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (e.g. data is expected to be already centered).

    copy_X : boolean, optional (DEFAULT = True)
        If True, X will be copied; else, it may be overwritten.

    verbose : boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model


    Attributes
    ----------
    coef_ : array, shape = (n_features)
        Coefficients of the regression model (mean of posterior distribution)

    alpha_ : float
        estimated precision of the noise

    active_ : array, dtype = np.bool, shape = (n_features)
        True for non-zero coefficients, False otherwise

    lambda_ : array, shape = (n_features)
        estimated precisions of the coefficients

    sigma_ : array, shape = (n_features, n_features)
        estimated covariance matrix of the weights, computed only
        for non-zero coefficients


    References
    ----------
    [1] Fast marginal likelihood maximisation for sparse Bayesian models
        (Tipping & Faul 2003) (http://www.miketipping.com/papers/met-fastsbl.pdf)
    [2] Analysis of sparse Bayesian learning (Tipping & Faul 2001)
        (http://www.miketipping.com/abstracts.htm#Faul:NIPS01)

    '''

    def __init__(self, n_iter=300, tol=1e-3, fit_intercept=True,
                 copy_X=True, verbose=False):
        self.n_iter        = n_iter
        self.tol           = tol
        self.scores_       = list()
        self.fit_intercept = fit_intercept
        self.copy_X        = copy_X
        self.verbose       = verbose


    def _center_data(self, X, y):
        ''' Centers data '''
        X = as_float_array(X, self.copy_X)
        # normalisation should be done in preprocessing!
        X_std = np.ones(X.shape[1], dtype=X.dtype)
        if self.fit_intercept:
            X_mean = np.average(X, axis=0)
            y_mean = np.average(y, axis=0)
            X     -= X_mean
            y      = y - y_mean
        else:
            X_mean = np.zeros(X.shape[1], dtype=X.dtype)
            y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
        return X, y, X_mean, y_mean, X_std

    def fit(self, X, y):
        '''
        Fits ARD Regression with Sequential Sparse Bayes Algorithm.

        Parameters
        -----------
        X: {array-like, sparse matrix} of size (n_samples, n_features)
           Training data, matrix of explanatory variables

        y: array-like of size [n_samples]
           Target values

        Returns
        -------
        self : object
            Returns self.
        '''
        X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
        X, y, X_mean, y_mean, X_std = self._center_data(X, y)
        n_samples, n_features = X.shape

        # precompute X'*Y , X'*X for faster iterations & allocate memory for
        # sparsity & quality vectors
        XY  = np.dot(X.T, y)
        XX  = np.dot(X.T, X)
        XXd = np.diag(XX)

        # initialise precision of noise and coefficients
        var_y = np.var(y)

        # check that variance is non zero !!!
        if var_y == 0:
            beta = 1e-2
        else:
            beta = 1. / np.var(y)

        A      = np.PINF * np.ones(n_features)
        active = np.zeros(n_features, dtype=np.bool)

        # in case of almost perfect multicollinearity between some features
        # start from feature 0
        if np.sum(XXd - X_mean**2 < np.finfo(np.float32).eps) > 0:
            A[0]      = np.finfo(np.float16).eps
            active[0] = True
        else:
            # start from a single basis vector with largest projection on targets
            proj          = XY**2 / XXd
            start         = np.argmax(proj)
            active[start] = True
            A[start]      = XXd[start] / (proj[start] - var_y)

        warning_flag = 0
        for i in range(self.n_iter):
            XXa = XX[active, :][:, active]
            XYa = XY[active]
            Aa  = A[active]

            # mean & covariance of posterior distribution
            Mn, Ri, cholesky = self._posterior_dist(Aa, beta, XXa, XYa)
            if cholesky:
                Sdiag = np.sum(Ri**2, 0)
            else:
                Sdiag = np.copy(np.diag(Ri))
                warning_flag += 1

            # raise warning in case cholesky fails
            if warning_flag == 1:
                warnings.warn(("Cholesky decomposition failed! Algorithm uses "
                               "pinvh, which is significantly slower. If you "
                               "use RVR it is advised to change the parameters "
                               "of the kernel."))

            # compute quality & sparsity parameters
            s, q, S, Q = self._sparsity_quality(XX, XXd, XY, XYa, Aa, Ri,
                                                active, beta, cholesky)

            # update precision parameter for noise distribution
            rss   = np.sum((y - np.dot(X[:, active], Mn))**2)
            beta  = n_samples - np.sum(active) + np.sum(Aa * Sdiag)
            beta /= (rss + np.finfo(np.float32).eps)

            # update precision parameters of coefficients
            A, converged = update_precisions(Q, S, q, s, A, active, self.tol,
                                             n_samples, False)
            if self.verbose:
                print(('Iteration: {0}, number of features '
                       'in the model: {1}').format(i, np.sum(active)))
            if converged or i == self.n_iter - 1:
                if converged and self.verbose:
                    print('Algorithm converged!')
                break

        # after last update of alpha & beta update parameters
        # of posterior distribution
        XXa, XYa, Aa = XX[active, :][:, active], XY[active], A[active]
        Mn, Sn, cholesky = self._posterior_dist(Aa, beta, XXa, XYa, True)
        self.coef_         = np.zeros(n_features)
        self.coef_[active] = Mn
        self.sigma_        = Sn
        self.active_       = active
        self.lambda_       = A
        self.alpha_        = beta
        self._set_intercept(X_mean, y_mean, X_std)
        return self

    def predict_dist(self, X):
        '''
        Computes predictive distribution for test set.
        Predictive distribution for each data point is one dimensional
        Gaussian and therefore is characterised by mean and variance.

        Parameters
        -----------
        X: {array-like, sparse} (n_samples_test, n_features)
           Test data, matrix of explanatory variables

        Returns
        -------
        : list of length two [y_hat, var_hat]

             y_hat: numpy array of size (n_samples_test,)
                    Estimated values of targets on test set (i.e. mean of
                    predictive distribution)

             var_hat: numpy array of size (n_samples_test,)
                    Variance of predictive distribution
        '''
        y_hat    = self._decision_function(X)
        var_hat  = 1. / self.alpha_
        var_hat += np.sum(np.dot(X[:, self.active_], self.sigma_) * X[:, self.active_], axis=1)
        return y_hat, var_hat


    def _posterior_dist(self, A, beta, XX, XY, full_covar=False):
        '''
        Calculates mean and covariance matrix of posterior distribution
        of coefficients.
        '''
        # compute precision matrix for active features
        Sinv = beta * XX
        np.fill_diagonal(Sinv, np.diag(Sinv) + A)
        cholesky = True
        # try cholesky, if it fails go back to pinvh
        try:
            # find posterior mean : R*R.T*mean = beta*X.T*Y
            # solve(R*z = beta*X.T*Y) => find z => solve(R.T*mean = z) => find mean
            R  = np.linalg.cholesky(Sinv)
            Z  = solve_triangular(R, beta * XY, check_finite=False, lower=True)
            Mn = solve_triangular(R.T, Z, check_finite=False, lower=False)

            # invert lower triangular matrix from cholesky decomposition
            Ri = solve_triangular(R, np.eye(A.shape[0]), check_finite=False, lower=True)
            if full_covar:
                Sn = np.dot(Ri.T, Ri)
                return Mn, Sn, cholesky
            else:
                return Mn, Ri, cholesky
        except LinAlgError:
            cholesky = False
            Sn = pinvh(Sinv)
            # posterior mean uses the covariance Sn (inverse of Sinv)
            Mn = beta * np.dot(Sn, XY)
            return Mn, Sn, cholesky


    def _sparsity_quality(self, XX, XXd, XY, XYa, Aa, Ri, active, beta, cholesky):
        '''
        Calculates sparsity and quality parameters for each feature

        Theoretical Note:
        -----------------
        Here we use the Woodbury identity for inverting the covariance matrix
        of the target distribution:
        C    = 1/beta * I + X * diag(1/A) * X'
        C^-1 = beta * I - beta^2 * X * Sn * X'
        '''
        bxy = beta * XY
        bxx = beta * XXd
        if cholesky:
            # here Ri is inverse of lower triangular matrix obtained from cholesky decomp
            xxr = np.dot(XX[:, active], Ri.T)
            rxy = np.dot(Ri, XYa)
            S   = bxx - beta**2 * np.sum(xxr**2, axis=1)
            Q   = bxy - beta**2 * np.dot(xxr, rxy)
        else:
            # here Ri is covariance matrix
            XXa = XX[:, active]
            XS  = np.dot(XXa, Ri)
            S   = bxx - beta**2 * np.sum(XS * XXa, 1)
            Q   = bxy - beta**2 * np.dot(XS, XYa)
        # Use the following identities:
        # (EQ 1) q = A*Q/(A - S) ; s = A*S/(A - S), so if A = np.PINF then q = Q, s = S
        qi = np.copy(Q)
        si = np.copy(S)
        # If A is not np.PINF, then the feature is 'active' => use (EQ 1)
        Qa, Sa     = Q[active], S[active]
        qi[active] = Aa * Qa / (Aa - Sa)
        si[active] = Aa * Sa / (Aa - Sa)
        return [si, qi, S, Q]
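
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original script): fit
# RegressionARD on synthetic data in which only the first few coefficients
# are non-zero, then inspect the sparse solution and the predictive
# distribution.  The helper name `_demo_regression_ard` and all constants
# are made up for illustration.
# ---------------------------------------------------------------------------
def _demo_regression_ard(n_samples=200, n_features=20, n_relevant=3, seed=0):
    rng = np.random.RandomState(seed)
    X = rng.randn(n_samples, n_features)
    w_true = np.zeros(n_features)
    w_true[:n_relevant] = 5. * rng.randn(n_relevant)
    y = np.dot(X, w_true) + 0.1 * rng.randn(n_samples)
    reg = RegressionARD(n_iter=50)
    reg.fit(X, y)
    y_hat, var_hat = reg.predict_dist(X)      # predictive mean and variance
    # active_ marks the features kept in the model; most should be pruned
    return reg.coef_, reg.active_, y_hat, var_hat
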

#----------------------- Classification ARD -----------------------------------


def _logistic_cost_grad(X, Y, w, diagA):
    '''
    Calculates cost and gradient for regularised logistic regression
    (negative log posterior with a Gaussian prior on the weights).
    '''
    n   = X.shape[0]
    Xw  = np.dot(X, w)
    s   = expit(Xw)
    wdA = w * diagA
    wdA[0] = 1e-3  # broad prior for bias term => almost no regularisation
    cost = np.sum(Xw * (1 - Y) - log_logistic(Xw)) + np.sum(w * wdA) / 2
    grad = np.dot(X.T, s - Y) + wdA
    return [cost / n, grad / n]
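
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): finite-difference
# check of the gradient returned by _logistic_cost_grad on random data.
# Only the non-bias entries are compared, because entry 0 of the penalty is
# overwritten with the constant 1e-3 (broad prior on the bias term).  The
# helper name `_check_logistic_cost_grad` and all constants are made up.
# ---------------------------------------------------------------------------
def _check_logistic_cost_grad(eps=1e-6, seed=0):
    rng = np.random.RandomState(seed)
    X = rng.randn(30, 4)
    Y = (rng.rand(30) > 0.5).astype(np.float64)
    w = rng.randn(4)
    diagA = np.ones(4)
    _, grad = _logistic_cost_grad(X, Y, w, diagA)
    num_grad = np.zeros_like(w)
    for j in range(w.shape[0]):
        w_hi, w_lo = np.copy(w), np.copy(w)
        w_hi[j] += eps
        w_lo[j] -= eps
        num_grad[j] = (_logistic_cost_grad(X, Y, w_hi, diagA)[0] -
                       _logistic_cost_grad(X, Y, w_lo, diagA)[0]) / (2 * eps)
    # discrepancy should be close to zero (finite-difference error only)
    return np.max(np.abs(grad[1:] - num_grad[1:]))
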
class ClassificationARD(BaseEstimator, LinearClassifierMixin):
    '''
    Logistic Regression with Automatic Relevance Determination (Fast Version
    uses Sparse Bayesian Learning)

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 100)
        Maximum number of iterations before termination

    tol: float, optional (DEFAULT = 1e-4)
        If absolute change in precision parameter for weights is below threshold
        algorithm terminates.

    normalize: bool, optional (DEFAULT = True)
        If True normalizes features

    n_iter_solver: int, optional (DEFAULT = 15)
        Maximum number of iterations before termination of solver

    tol_solver: float, optional (DEFAULT = 1e-4)
        Convergence threshold for solver (it is used in estimating posterior
        distribution)

    fit_intercept : bool, optional (DEFAULT = True)
        If True will use intercept in the model. If set
        to false, no intercept will be used in calculations

    verbose : boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model


    Attributes
    ----------
    coef_ : array, shape = (n_features)
        Coefficients of the regression model (mean of posterior distribution)

    lambda_ : array, shape = (n_features)
        estimated precisions of weights

    active_ : array, dtype = np.bool, shape = (n_features)
        True for non-zero coefficients, False otherwise

    sigma_ : array, shape = (n_features, n_features)
        estimated covariance matrix of the weights, computed only
        for non-zero coefficients


    References
    ----------
    [1] Fast marginal likelihood maximisation for sparse Bayesian models
        (Tipping & Faul 2003) (http://www.miketipping.com/papers/met-fastsbl.pdf)
    [2] Analysis of sparse Bayesian learning (Tipping & Faul 2001)
        (http://www.miketipping.com/abstracts.htm#Faul:NIPS01)
    '''
    def __init__(self, n_iter=100, tol=1e-4, n_iter_solver=15, normalize=True,
                 tol_solver=1e-4, fit_intercept=True, verbose=False):
        self.n_iter        = n_iter
        self.tol           = tol
        self.n_iter_solver = n_iter_solver
        self.normalize     = normalize
        self.tol_solver    = tol_solver
        self.fit_intercept = fit_intercept
        self.verbose       = verbose


    def fit(self, X, y):
        '''
        Fits Logistic Regression with ARD

        Parameters
        ----------
        X: array-like of size [n_samples, n_features]
           Training data, matrix of explanatory variables

        y: array-like of size [n_samples]
           Target values

        Returns
        -------
        self : object
            Returns self.
        '''
        X, y = check_X_y(X, y, accept_sparse=None, dtype=np.float64)

        # normalize, if required
        if self.normalize:
            self._x_mean = np.mean(X, 0)
            self._x_std  = np.std(X, 0)
            X            = (X - self._x_mean) / self._x_std

        # add bias term if required
        if self.fit_intercept:
            X = np.concatenate((np.ones([X.shape[0], 1]), X), 1)

        # preprocess targets
        check_classification_targets(y)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        if n_classes < 2:
            raise ValueError("Need samples of at least 2 classes"
                             " in the data, but the data contains only one"
                             " class: %r" % self.classes_[0])

        # if multiclass use OVR (i.e. fit one classifier for each class)
        if n_classes > 2:
            self.coef_, self.sigma_       = [0]*n_classes, [0]*n_classes
            self.intercept_, self.active_ = [0]*n_classes, [0]*n_classes
            self.lambda_                  = [0]*n_classes
        else:
            self.coef_, self.sigma_, self.intercept_, self.active_ = [0], [0], [0], [0]
            self.lambda_ = [0]

        for i in range(len(self.classes_)):
            if n_classes == 2:
                pos_class = self.classes_[1]
            else:
                pos_class = self.classes_[i]
            mask = (y == pos_class)
            y_bin = np.zeros(y.shape, dtype=np.float64)
            y_bin[mask] = 1
            coef, bias, active, sigma, lambda_ = self._fit(X, y_bin)
            self.coef_[i], self.intercept_[i], self.sigma_[i] = coef, bias, sigma
            self.active_[i], self.lambda_[i] = active, lambda_
            # in case of binary classification fit only one classifier
            if n_classes == 2:
                break
        self.coef_      = np.asarray(self.coef_)
        self.intercept_ = np.asarray(self.intercept_)
        return self

    def _fit(self, X, y):
        '''
        Fits binary classification
        '''
        n_samples, n_features = X.shape
        A      = np.PINF * np.ones(n_features)
        active = np.zeros(n_features, dtype=np.bool)

        # if we fit intercept, make it active from the beginning
        if self.fit_intercept:
            active[0] = True
            A[0]      = np.finfo(np.float16).eps

        warning_flag = 0
        for i in range(self.n_iter):
            Xa = X[:, active]
            Aa = A[active]

            # mean & precision of posterior distribution
            Mn, Sn, B, t_hat, cholesky = self._posterior_dist(Xa, y, Aa)
            if not cholesky:
                warning_flag += 1

            # raise warning in case cholesky fails (but only once)
            if warning_flag == 1:
                warnings.warn(("Cholesky decomposition failed! Algorithm uses "
                               "pinvh, which is significantly slower. If you "
                               "use RVC it is advised to change the parameters "
                               "of the kernel."))

            # compute quality & sparsity parameters
            s, q, S, Q = self._sparsity_quality(X, Xa, t_hat, B, A, Aa, active,
                                                Sn, cholesky)

            # update precision parameters of coefficients
            A, converged = update_precisions(Q, S, q, s, A, active, self.tol,
                                             n_samples, self.fit_intercept)

            # terminate if converged
            if converged or i == self.n_iter - 1:
                break

        Xa, Aa = X[:, active], A[active]
        Mn, Sn, B, t_hat, cholesky = self._posterior_dist(Xa, y, Aa)
        # in case Sn is the inverse of the lower triangular Cholesky factor,
        # compute the covariance as Sn = np.dot(Rinverse.T, Rinverse)
        if cholesky:
            Sn = np.dot(Sn.T, Sn)
        intercept_ = 0
        if self.fit_intercept:
            n_features -= 1
            if active[0] == True:
                intercept_ = Mn[0]
                Mn         = Mn[1:]
            active = active[1:]
        coef_            = np.zeros([1, n_features])
        coef_[0, active] = Mn
        return coef_.squeeze(), intercept_, active, Sn, A


    def predict(self, X):
        '''
        Estimates target values on test set

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        y_pred: numpy array of size (n_samples_test,)
           Predicted values of targets
        '''
        probs   = self.predict_proba(X)
        indices = np.argmax(probs, axis=1)
        y_pred  = self.classes_[indices]
        return y_pred

    def _decision_function_active(self, X, coef_, active_, intercept_):
        ''' Constructs decision function using only relevant features '''
        if self.normalize:
            X = (X - self._x_mean[active_]) / self._x_std[active_]
        decision = safe_sparse_dot(X, coef_[active_]) + intercept_
        return decision


    def decision_function(self, X):
        '''
        Computes distance to separating hyperplane between classes. The larger
        the absolute value of the decision function, the further the data point
        is from the decision boundary.

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        decision: numpy array of size (n_samples_test,)
           Distance to decision boundary
        '''
        check_is_fitted(self, 'coef_')
        X = check_array(X, accept_sparse=None, dtype=np.float64)
        n_features = self.coef_.shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))
        decision = [self._decision_function_active(X[:, active], coef, active, bias)
                    for coef, active, bias in zip(self.coef_, self.active_, self.intercept_)]
        decision = np.asarray(decision).squeeze().T
        return decision


    def predict_proba(self, X):
        '''
        Predicts probabilities of targets for test set using a probit
        function to approximate the convolution of sigmoid and Gaussian.

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        probs: numpy array of size (n_samples_test, n_classes)
           Estimated probabilities of target classes
        '''
        y_hat = self.decision_function(X)
        X = check_array(X, accept_sparse=None, dtype=np.float64)
        if self.normalize:
            X = (X - self._x_mean) / self._x_std
        if self.fit_intercept:
            X = np.concatenate((np.ones([X.shape[0], 1]), X), 1)
        if y_hat.ndim == 1:
            pr   = self._convo_approx(X[:, self.lambda_[0] != np.PINF],
                                      y_hat, self.sigma_[0])
            prob = np.vstack([1 - pr, pr]).T
        else:
            pr   = [self._convo_approx(X[:, idx != np.PINF], y_hat[:, i],
                                       self.sigma_[i]) for i, idx in enumerate(self.lambda_)]
            pr   = np.asarray(pr).T
            prob = pr / np.reshape(np.sum(pr, axis=1), (pr.shape[0], 1))
        return prob


    def _convo_approx(self, X, y_hat, sigma):
        ''' Computes approximation to convolution of sigmoid and gaussian '''
        var = np.sum(np.dot(X, sigma) * X, 1)
        ks  = 1. / (1. + np.pi * var / 8)**0.5
        pr  = expit(y_hat * ks)
        return pr

    def _sparsity_quality(self, X, Xa, y, B, A, Aa, active, Sn, cholesky):
        '''
        Calculates sparsity & quality parameters for each feature
        '''
        XB  = X.T * B
        bxx = np.dot(B, X**2)
        Q   = np.dot(X.T, y)
        if cholesky:
            # Here Sn is the inverse of the lower triangular matrix obtained
            # from the cholesky decomposition
            XBX = np.dot(XB, Xa)
            XBX = np.dot(XBX, Sn, out=XBX)
            S   = bxx - np.sum(XBX**2, 1)
        else:
            XSX = np.dot(np.dot(Xa, Sn), Xa.T)
            S   = bxx - np.sum(np.dot(XB, XSX) * XB, 1)
        qi = np.copy(Q)
        si = np.copy(S)
        Qa, Sa     = Q[active], S[active]
        qi[active] = Aa * Qa / (Aa - Sa)
        si[active] = Aa * Sa / (Aa - Sa)
        return [si, qi, S, Q]


    def _posterior_dist(self, X, y, A):
        '''
        Uses Laplace approximation for calculating posterior distribution
        '''
        f = lambda w: _logistic_cost_grad(X, y, w, A)
        w_init = np.random.random(X.shape[1])
        Mn = fmin_l_bfgs_b(f, x0=w_init, pgtol=self.tol_solver,
                           maxiter=self.n_iter_solver)[0]
        Xm = np.dot(X, Mn)
        s  = expit(Xm)
        B  = logistic._pdf(Xm)  # avoids underflow
        S  = np.dot(X.T * B, X)
        np.fill_diagonal(S, np.diag(S) + A)
        t_hat = y - s
        cholesky = True
        # try using Cholesky, if it fails then fall back on pinvh
        try:
            R  = np.linalg.cholesky(S)
            Sn = solve_triangular(R, np.eye(A.shape[0]),
                                  check_finite=False, lower=True)
        except LinAlgError:
            Sn = pinvh(S)
            cholesky = False
        return [Mn, Sn, B, t_hat, cholesky]
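
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original script): fit
# ClassificationARD on a simple synthetic binary problem and inspect the
# predicted probabilities.  The helper name `_demo_classification_ard` and
# all constants are made up for illustration.
# ---------------------------------------------------------------------------
def _demo_classification_ard(n_samples=200, n_features=10, seed=0):
    rng = np.random.RandomState(seed)
    X = rng.randn(n_samples, n_features)
    # only the first two features influence the label
    logits = 2.0 * X[:, 0] - 3.0 * X[:, 1]
    y = (logits + 0.1 * rng.randn(n_samples) > 0).astype(np.int64)
    clf = ClassificationARD(n_iter=50)
    clf.fit(X, y)
    probs  = clf.predict_proba(X)       # shape (n_samples, 2)
    y_pred = clf.predict(X)
    return clf.coef_, clf.active_, probs, y_pred
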

###############################################################################
# Relevance Vector Machine: RVR and RVC
###############################################################################


def get_kernel(X, Y, gamma, degree, coef0, kernel, kernel_params):
    '''
    Calculates kernelised features for RVR and RVC
    '''
    if callable(kernel):
        params = kernel_params or {}
    else:
        params = {"gamma": gamma,
                  "degree": degree,
                  "coef0": coef0}
    return pairwise_kernels(X, Y, metric=kernel,
                            filter_params=True, **params)
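
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): get_kernel is a thin
# wrapper around sklearn.metrics.pairwise.pairwise_kernels.  For an RBF
# kernel only `gamma` is relevant; `degree` and `coef0` are filtered out.
# The helper name `_demo_get_kernel` and the numbers are made up.
# ---------------------------------------------------------------------------
def _demo_get_kernel():
    X = np.array([[0., 0.], [1., 0.], [0., 2.]])
    K = get_kernel(X, X, gamma=0.5, degree=3, coef0=1, kernel='rbf',
                   kernel_params=None)
    # K[i, j] = exp(-0.5 * ||X[i] - X[j]||^2); diagonal entries equal 1
    return K
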
class RVR(RegressionARD):
    '''
    Relevance Vector Regression (Fast Version uses Sparse Bayesian Learning)

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 300)
        Maximum number of iterations

    fit_intercept : boolean, optional (DEFAULT = True)
        whether to calculate the intercept for this model

    tol: float, optional (DEFAULT = 1e-3)
        If absolute change in precision parameter for weights is below tol
        algorithm terminates.

    copy_X : boolean, optional (DEFAULT = True)
        If True, X will be copied; else, it may be overwritten.

    verbose : boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model

    kernel: str, optional (DEFAULT = 'poly')
        Type of kernel to be used (all kernels: ['rbf' | 'poly' | 'sigmoid' | 'linear'])

    degree : int, (DEFAULT = 3)
        Degree for poly kernels. Ignored by other kernels.

    gamma : float, optional (DEFAULT = 1/n_features)
        Kernel coefficient for rbf and poly kernels, ignored by other kernels

    coef0 : float, optional (DEFAULT = 1)
        Independent term in poly and sigmoid kernels, ignored by other kernels

    kernel_params : mapping of string to any, optional
        Parameters (keyword arguments) and values for kernel passed as
        callable object, ignored by other kernels


    Attributes
    ----------
    coef_ : array, shape = (n_features)
        Coefficients of the regression model (mean of posterior distribution)

    alpha_ : float
        estimated precision of the noise

    active_ : array, dtype = np.bool, shape = (n_features)
        True for non-zero coefficients, False otherwise

    lambda_ : array, shape = (n_features)
        estimated precisions of the coefficients

    sigma_ : array, shape = (n_features, n_features)
        estimated covariance matrix of the weights, computed only
        for non-zero coefficients

    relevant_vectors_ : array
        Relevant Vectors

    References
    ----------
    [1] Fast marginal likelihood maximisation for sparse Bayesian models
        (Tipping & Faul 2003) (http://www.miketipping.com/papers/met-fastsbl.pdf)
    [2] Analysis of sparse Bayesian learning (Tipping & Faul 2001)
        (http://www.miketipping.com/abstracts.htm#Faul:NIPS01)
    '''
    def __init__(self, n_iter=300, tol=1e-3, fit_intercept=True, copy_X=True,
                 verbose=False, kernel='poly', degree=3, gamma=None,
                 coef0=1, kernel_params=None):
        super(RVR, self).__init__(n_iter, tol, fit_intercept, copy_X, verbose)
        self.kernel        = kernel
        self.degree        = degree
        self.gamma         = gamma
        self.coef0         = coef0
        self.kernel_params = kernel_params


    def fit(self, X, y):
        '''
        Fit Relevance Vector Regression Model

        Parameters
        -----------
        X: {array-like, sparse matrix} of size (n_samples, n_features)
           Training data, matrix of explanatory variables

        y: array-like of size (n_samples, )
           Target values

        Returns
        -------
        self: object
           self
        '''
        X, y = check_X_y(X, y, accept_sparse=['csr', 'coo', 'bsr'], dtype=np.float64)
        # kernelise features
        K = get_kernel(X, X, self.gamma, self.degree, self.coef0,
                       self.kernel, self.kernel_params)

        # use fit method of RegressionARD
        _ = super(RVR, self).fit(K, y)

        # convert to csr (need to use __getitem__)
        convert_tocsr = [scipy.sparse.coo.coo_matrix,
                         scipy.sparse.dia.dia_matrix,
                         scipy.sparse.bsr.bsr_matrix]
        if type(X) in convert_tocsr:
            X = X.tocsr()
        self.relevant_ = np.where(self.active_ == True)[0]
        if X.ndim == 1:
            self.relevant_vectors_ = X[self.relevant_]
        else:
            self.relevant_vectors_ = X[self.relevant_, :]
        return self


    def _decision_function(self, X):
        ''' Decision function '''
        _, predict_vals = self._kernel_decision_function(X)
        return predict_vals


    def _kernel_decision_function(self, X):
        ''' Computes kernel and decision function based on kernel '''
        check_is_fitted(self, 'coef_')
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        K = get_kernel(X, self.relevant_vectors_, self.gamma, self.degree,
                       self.coef0, self.kernel, self.kernel_params)
        return K, np.dot(K, self.coef_[self.active_]) + self.intercept_


    def predict_dist(self, X):
        '''
        Computes predictive distribution for test set. Predictive distribution
        for each data point is one dimensional Gaussian and therefore is
        characterised by mean and variance.

        Parameters
        ----------
        X: {array-like, sparse matrix} of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        : list of length two [y_hat, var_hat]

             y_hat: numpy array of size (n_samples_test,)
                    Estimated values of targets on test set (i.e. mean of
                    predictive distribution)

             var_hat: numpy array of size (n_samples_test,)
                    Variance of predictive distribution
        '''
        # kernel matrix and mean of predictive distribution
        K, y_hat = self._kernel_decision_function(X)
        var_hat  = 1. / self.alpha_
        var_hat += np.sum(np.dot(K, self.sigma_) * K, axis=1)
        return y_hat, var_hat
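
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original script): fit RVR with
# an RBF kernel to a noisy 1-d sinc function and query the predictive
# distribution.  The helper name `_demo_rvr` and all constants are made up.
# ---------------------------------------------------------------------------
def _demo_rvr(n_samples=200, seed=0):
    rng = np.random.RandomState(seed)
    X = np.linspace(-5, 5, n_samples)[:, None]
    y = np.sinc(X[:, 0]) + 0.1 * rng.randn(n_samples)
    rvr = RVR(kernel='rbf', gamma=1.0, n_iter=100)
    rvr.fit(X, y)
    y_hat, var_hat = rvr.predict_dist(X)
    # relevant_vectors_ holds the training points kept in the sparse model
    return rvr.relevant_vectors_, y_hat, var_hat
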
class RVC(ClassificationARD):
    '''
    Relevance Vector Classifier (Fast Version, uses Sparse Bayesian Learning)


    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 100)
        Maximum number of iterations before termination

    tol: float, optional (DEFAULT = 1e-4)
        If absolute change in precision parameter for weights is below tol, then
        the algorithm terminates.

    n_iter_solver: int, optional (DEFAULT = 15)
        Maximum number of iterations before termination of solver

    tol_solver: float, optional (DEFAULT = 1e-4)
        Convergence threshold for solver (it is used in estimating posterior
        distribution)

    fit_intercept : bool, optional (DEFAULT = True)
        If True will use intercept in the model

    verbose : boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model

    kernel: str, optional (DEFAULT = 'rbf')
        Type of kernel to be used (all kernels: ['rbf' | 'poly' | 'sigmoid'])

    degree : int, (DEFAULT = 2)
        Degree for poly kernels. Ignored by other kernels.

    gamma : float, optional (DEFAULT = 1/n_features)
        Kernel coefficient for rbf and poly kernels, ignored by other kernels

    coef0 : float, optional (DEFAULT = 1)
        Independent term in poly and sigmoid kernels, ignored by other kernels

    kernel_params : mapping of string to any, optional
        Parameters (keyword arguments) and values for kernel passed as
        callable object, ignored by other kernels


    Attributes
    ----------
    coef_ : array, shape = (n_features)
        Coefficients of the model (mean of posterior distribution)

    lambda_ : array, shape = (n_features)
        Estimated precisions of weights

    active_ : array, dtype = np.bool, shape = (n_features)
        True for non-zero coefficients, False otherwise

    sigma_ : array, shape = (n_features, n_features)
        Estimated covariance matrix of the weights, computed only for non-zero
        coefficients


    References
    ----------
    [1] Fast marginal likelihood maximisation for sparse Bayesian models
        (Tipping & Faul 2003) (http://www.miketipping.com/papers/met-fastsbl.pdf)
    [2] Analysis of sparse Bayesian learning (Tipping & Faul 2001)
        (http://www.miketipping.com/abstracts.htm#Faul:NIPS01)
    '''

    def __init__(self, n_iter=100, tol=1e-4, n_iter_solver=15, tol_solver=1e-4,
                 fit_intercept=True, verbose=False, kernel='rbf', degree=2,
                 gamma=None, coef0=1, kernel_params=None):
        super(RVC, self).__init__(n_iter, tol, n_iter_solver, True, tol_solver,
                                  fit_intercept, verbose)
        self.kernel        = kernel
        self.degree        = degree
        self.gamma         = gamma
        self.coef0         = coef0
        self.kernel_params = kernel_params


    def fit(self, X, y):
        '''
        Fit Relevance Vector Classifier

        Parameters
        -----------
        X: array-like of size [n_samples, n_features]
           Training data, matrix of explanatory variables

        y: array-like of size [n_samples]
           Target values

        Returns
        -------
        self: object
           self
        '''
        X, y = check_X_y(X, y, accept_sparse=None, dtype=np.float64)
        # kernelise features
        K = get_kernel(X, X, self.gamma, self.degree, self.coef0,
                       self.kernel, self.kernel_params)
        # use fit method of ClassificationARD
        _ = super(RVC, self).fit(K, y)
        self.relevant_ = [np.where(active == True)[0] for active in self.active_]
        if X.ndim == 1:
            self.relevant_vectors_ = [X[relevant_] for relevant_ in self.relevant_]
        else:
            self.relevant_vectors_ = [X[relevant_, :] for relevant_ in self.relevant_]
        return self


    def decision_function(self, X):
        '''
        Computes distance to separating hyperplane between classes. The larger
        the absolute value of the decision function, the further the data point
        is from the decision boundary.

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        decision: numpy array of size (n_samples_test,)
           Distance to decision boundary
        '''
        check_is_fitted(self, 'coef_')
        X = check_array(X, accept_sparse=None, dtype=np.float64)
        n_features = self.relevant_vectors_[0].shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))
        kernel = lambda rvs: get_kernel(X, rvs, self.gamma, self.degree,
                                        self.coef0, self.kernel, self.kernel_params)
        decision = []
        for rv, cf, act, b in zip(self.relevant_vectors_, self.coef_, self.active_,
                                  self.intercept_):
            # if there are no relevant vectors => use intercept only
            if rv.shape[0] == 0:
                decision.append(np.ones(X.shape[0]) * b)
            else:
                decision.append(self._decision_function_active(kernel(rv), cf, act, b))
        decision = np.asarray(decision).squeeze().T
        return decision


    def predict_proba(self, X):
        '''
        Predicts probabilities of targets.

        Theoretical Note
        ================
        The current version of this method does not use MacKay's approximation
        to the convolution of Gaussian and sigmoid. This results in less accurate
        estimation of class probabilities and therefore a possible increase in
        misclassification error for multiclass problems (prediction accuracy
        for binary classification problems is not changed).

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        probs: numpy array of size (n_samples_test, n_classes)
           Estimated probabilities of target classes
        '''
        prob = expit(self.decision_function(X))
        if prob.ndim == 1:
            prob = np.vstack([1 - prob, prob]).T
        prob = prob / np.reshape(np.sum(prob, axis=1), (prob.shape[0], 1))
        return prob
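
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original script): fit RVC with
# an RBF kernel on two noisy Gaussian blobs and inspect the relevant vectors
# and predicted probabilities.  The helper name `_demo_rvc` and all constants
# are made up for illustration.
# ---------------------------------------------------------------------------
def _demo_rvc(n_per_class=100, seed=0):
    rng = np.random.RandomState(seed)
    X0 = rng.randn(n_per_class, 2) + np.array([-2., 0.])
    X1 = rng.randn(n_per_class, 2) + np.array([+2., 0.])
    X = np.vstack([X0, X1])
    y = np.hstack([np.zeros(n_per_class), np.ones(n_per_class)]).astype(np.int64)
    rvc = RVC(kernel='rbf', gamma=0.5, n_iter=50)
    rvc.fit(X, y)
    probs  = rvc.predict_proba(X)       # shape (n_samples, 2)
    y_pred = rvc.predict(X)
    # relevant_vectors_[0] holds the training points used by the classifier
    return rvc.relevant_vectors_, probs, y_pred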