Path: blob/master/model_selection/prob_calibration/calibration_module/calibrator.py
2604 views
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
from calibration_module.utils import create_binned_data, get_bin_boundaries


__all__ = [
    'HistogramCalibrator',
    'PlattCalibrator',
    'PlattHistogramCalibrator'
]


def _lookup_bin_scores(bins: np.ndarray, bins_score: np.ndarray, y_prob: np.ndarray) -> np.ndarray:
    """
    Maps each probability to the score of the bin it falls into.

    Parameters
    ----------
    bins : 1d ndarray
        Sorted bin boundaries.

    bins_score : 1d ndarray
        Score associated with each bin.

    y_prob : 1d ndarray
        Probability/score to look up.

    Returns
    -------
    y_calibrated_prob : 1d ndarray
        ``bins_score`` entry of the bin each element of ``y_prob`` belongs to.
    """
    indices = np.searchsorted(bins, y_prob)
    # searchsorted returns len(bins) for values above the last boundary, which
    # can be one past the end of bins_score; send those values to the last bin
    # instead of raising an IndexError.
    indices = np.minimum(indices, len(bins_score) - 1)
    return bins_score[indices]


class HistogramCalibrator(BaseEstimator):
    """
    Bins the data based on equal size interval (each bin contains approximately
    equal size of samples).

    Parameters
    ----------
    n_bins : int, default 15
        A bigger bin number requires more data. In general,
        the larger the bin size, the closer the calibration error
        will be to the true calibration error.

    Attributes
    ----------
    bins_ : 1d ndarray
        Boundaries for each bin.

    bins_score_ : 1d ndarray
        Calibration score for each bin, i.e. the mean of the true
        targets that fell into the bin during fit.
    """

    def __init__(self, n_bins: int = 15):
        self.n_bins = n_bins

    def fit(self, y_prob: np.ndarray, y_true: np.ndarray):
        """
        Learns the bin boundaries and calibration score for each bin.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        y_true : 1d ndarray
            Binary true targets.

        Returns
        -------
        self
        """
        # NOTE(review): create_binned_data / get_bin_boundaries live in
        # calibration_module.utils and are not visible from this file.
        binned_y_true, binned_y_prob = create_binned_data(y_true, y_prob, self.n_bins)
        self.bins_ = get_bin_boundaries(binned_y_prob)
        self.bins_score_ = np.array([np.mean(value) for value in binned_y_true])
        return self

    def predict(self, y_prob: np.ndarray) -> np.ndarray:
        """
        Predicts the calibrated probability.

        Values larger than the last learned boundary are assigned
        the score of the last bin.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        Returns
        -------
        y_calibrated_prob : 1d ndarray
            Calibrated probability.
        """
        return _lookup_bin_scores(self.bins_, self.bins_score_, y_prob)


class PlattCalibrator(BaseEstimator):
    """
    Boils down to applying a Logistic Regression.

    Parameters
    ----------
    log_odds : bool, default True
        Logistic Regression assumes a linear relationship between its input
        and the log-odds of the class probabilities. Converting the probability
        to log-odds scale typically improves performance.

    Attributes
    ----------
    coef_ : ndarray of shape (1,)
        Binary logistic regression's coefficient.

    intercept_ : ndarray of shape (1,)
        Binary logistic regression's intercept.
    """

    def __init__(self, log_odds: bool = True):
        self.log_odds = log_odds

    def fit(self, y_prob: np.ndarray, y_true: np.ndarray):
        """
        Learns the logistic regression weights.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        y_true : 1d ndarray
            Binary true targets.

        Returns
        -------
        self
        """
        # fit_predict stores coef_ / intercept_; its return value is not needed here.
        self.fit_predict(y_prob, y_true)
        return self

    @staticmethod
    def _convert_to_log_odds(y_prob: np.ndarray) -> np.ndarray:
        """Converts probabilities to the log-odds (logit) scale."""
        # clip away exact 0 and 1 so the logit stays finite.
        eps = 1e-12
        y_prob = np.clip(y_prob, eps, 1 - eps)
        y_prob = np.log(y_prob / (1 - y_prob))
        return y_prob

    def predict(self, y_prob: np.ndarray) -> np.ndarray:
        """
        Predicts the calibrated probability.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        Returns
        -------
        y_calibrated_prob : 1d ndarray
            Calibrated probability.
        """
        if self.log_odds:
            y_prob = self._convert_to_log_odds(y_prob)

        output = self._transform(y_prob)
        return output

    def _transform(self, y_prob: np.ndarray) -> np.ndarray:
        """Applies the learned linear map followed by the sigmoid function."""
        output = y_prob * self.coef_[0] + self.intercept_
        output = 1 / (1 + np.exp(-output))
        return output

    def fit_predict(self, y_prob: np.ndarray, y_true: np.ndarray) -> np.ndarray:
        """
        Chain the .fit and .predict step together.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        y_true : 1d ndarray
            Binary true targets.

        Returns
        -------
        y_calibrated_prob : 1d ndarray
            Calibrated probability.
        """
        # accept any array-like: the reshape below needs a real ndarray.
        y_prob = np.asarray(y_prob)
        if self.log_odds:
            y_prob = self._convert_to_log_odds(y_prob)

        # the class expects 2d ndarray as input features;
        # C is the inverse regularization strength, so a very large
        # value makes the penalty term negligible.
        logistic = LogisticRegression(C=1e10, solver='lbfgs')
        logistic.fit(y_prob.reshape(-1, 1), y_true)
        # logistic.coef_ is 2d for a single feature; keep only its first row.
        self.coef_ = logistic.coef_[0]
        self.intercept_ = logistic.intercept_

        y_calibrated_prob = self._transform(y_prob)
        return y_calibrated_prob


class PlattHistogramCalibrator(PlattCalibrator):
    """
    Boils down to first applying a Logistic Regression then perform
    histogram binning.

    Note that ``fit_predict`` is inherited unchanged from PlattCalibrator,
    so it returns the Platt-scaled probability without the binning step.

    Parameters
    ----------
    log_odds : bool, default True
        Logistic Regression assumes a linear relationship between its input
        and the log-odds of the class probabilities. Converting the probability
        to log-odds scale typically improves performance.

    n_bins : int, default 15
        A bigger bin number requires more data. In general,
        the larger the bin size, the closer the calibration error
        will be to the true calibration error.

    Attributes
    ----------
    coef_ : ndarray of shape (1,)
        Binary logistic regression's coefficient.

    intercept_ : ndarray of shape (1,)
        Binary logistic regression's intercept.

    bins_ : 1d ndarray
        Boundaries for each bin.

    bins_score_ : 1d ndarray
        Calibration score for each bin, i.e. the mean of the Platt-scaled
        probabilities that fell into the bin during fit.
    """

    def __init__(self, log_odds: bool = True, n_bins: int = 15):
        super().__init__(log_odds)
        self.n_bins = n_bins

    def fit(self, y_prob: np.ndarray, y_true: np.ndarray):
        """
        Learns the logistic regression weights and the
        bin boundaries and calibration score for each bin.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        y_true : 1d ndarray
            Binary true targets.

        Returns
        -------
        self
        """
        y_prob_platt = super().fit_predict(y_prob, y_true)
        binned_y_true, binned_y_prob = create_binned_data(y_true, y_prob_platt, self.n_bins)
        self.bins_ = get_bin_boundaries(binned_y_prob)
        # NOTE(review): unlike HistogramCalibrator, the bin score is the mean of the
        # Platt-scaled probabilities, not of the labels; presumably intentional
        # (scaling-then-binning style calibration) - confirm before changing.
        self.bins_score_ = np.array([np.mean(value) for value in binned_y_prob])
        return self

    def predict(self, y_prob: np.ndarray) -> np.ndarray:
        """
        Predicts the calibrated probability.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        Returns
        -------
        y_calibrated_prob : 1d ndarray
            Calibrated probability.
        """
        y_prob_platt = super().predict(y_prob)
        return _lookup_bin_scores(self.bins_, self.bins_score_, y_prob_platt)