CoCalc -- calibrator.py

GitHub Repository: ethen8181/machine-learning
Path: blob/master/model_selection/prob_calibration/calibration_module/calibrator.py
2604 views
1
import numpy as np
2
from sklearn.linear_model import LogisticRegression
3
from sklearn.base import BaseEstimator
4
from calibration_module.utils import create_binned_data, get_bin_boundaries
5

6

7
# Public API of this module: the three calibrator classes defined below.
__all__ = [
    'HistogramCalibrator',
    'PlattCalibrator',
    'PlattHistogramCalibrator',
]
12

13

14
class HistogramCalibrator(BaseEstimator):
    """
    Calibrates scores with histogram binning.

    Bins the data based on equal size interval (each bin contains approximately
    equal size of samples) and maps every score to the mean of the true
    targets that fell into its bin during ``fit``.

    Parameters
    ----------
    n_bins : int, default 15
        Number of bins. A bigger bin number requires more data. In general,
        the larger the number of bins, the closer the calibration error
        will be to the true calibration error.

    Attributes
    ----------
    bins_ : 1d ndarray
        Boundaries for each bin.

    bins_score_ : 1d ndarray
        Calibration score for each bin (mean of ``y_true`` within the bin).
    """

    def __init__(self, n_bins: int = 15):
        self.n_bins = n_bins

    def fit(self, y_prob: np.ndarray, y_true: np.ndarray):
        """
        Learns the bin boundaries and calibration score for each bin.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        y_true : 1d ndarray
            Binary true targets.

        Returns
        -------
        self
        """
        # Normalise array-likes (lists, tuples, ...) to ndarrays before
        # handing them to the binning helpers.
        y_prob = np.asarray(y_prob)
        y_true = np.asarray(y_true)

        binned_y_true, binned_y_prob = create_binned_data(y_true, y_prob, self.n_bins)
        self.bins_ = get_bin_boundaries(binned_y_prob)
        # The calibrated value of a bin is the observed positive rate in it.
        self.bins_score_ = np.array([np.mean(value) for value in binned_y_true])
        return self

    def predict(self, y_prob: np.ndarray) -> np.ndarray:
        """
        Predicts the calibrated probability.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        Returns
        -------
        y_calibrated_prob : 1d ndarray
            Calibrated probability. Scores above the last learned boundary
            are assigned the score of the last bin.
        """
        indices = np.searchsorted(self.bins_, y_prob)
        # searchsorted returns len(bins_) for values greater than every
        # boundary, which would index past the end of bins_score_; clamp
        # such values into the last bin instead of raising IndexError.
        indices = np.minimum(indices, len(self.bins_score_) - 1)
        return self.bins_score_[indices]
75

76

77
class PlattCalibrator(BaseEstimator):
    """
    Boils down to applying a Logistic Regression.

    A one-feature logistic regression is fitted on the raw score (optionally
    converted to log-odds first) and its sigmoid output is used as the
    calibrated probability.

    Parameters
    ----------
    log_odds : bool, default True
        Logistic Regression assumes a linear relationship between its input
        and the log-odds of the class probabilities. Converting the probability
        to log-odds scale typically improves performance.

    Attributes
    ----------
    coef_ : ndarray of shape (1,)
        Binary logistic regression's coefficient.

    intercept_ : ndarray of shape (1,)
        Binary logistic regression's intercept.
    """

    def __init__(self, log_odds: bool = True):
        self.log_odds = log_odds

    def fit(self, y_prob: np.ndarray, y_true: np.ndarray):
        """
        Learns the logistic regression weights.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        y_true : 1d ndarray
            Binary true targets.

        Returns
        -------
        self
        """
        self._fit_logistic(self._prepare(y_prob), y_true)
        return self

    @staticmethod
    def _convert_to_log_odds(y_prob: np.ndarray) -> np.ndarray:
        """Maps probabilities to log-odds, i.e. log(p / (1 - p))."""
        # Clip away exact 0 and 1 so neither the division nor the log blows up.
        eps = 1e-12
        y_prob = np.clip(y_prob, eps, 1 - eps)
        y_prob = np.log(y_prob / (1 - y_prob))
        return y_prob

    def _prepare(self, y_prob: np.ndarray) -> np.ndarray:
        """Turns raw scores into the feature the logistic regression consumes."""
        # Accept any array-like; later steps rely on ndarray methods/arithmetic.
        features = np.asarray(y_prob, dtype=float)
        if self.log_odds:
            features = self._convert_to_log_odds(features)
        return features

    def _fit_logistic(self, features: np.ndarray, y_true: np.ndarray) -> None:
        """Fits the logistic regression on prepared features and stores its weights."""
        # C is the inverse regularization strength, so a very large value
        # leaves the fit essentially unpenalised.
        logistic = LogisticRegression(C=1e10, solver='lbfgs')
        # the class expects 2d ndarray as input features
        logistic.fit(features.reshape(-1, 1), y_true)
        self.coef_ = logistic.coef_[0]
        self.intercept_ = logistic.intercept_

    def predict(self, y_prob: np.ndarray) -> np.ndarray:
        """
        Predicts the calibrated probability.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        Returns
        -------
        y_calibrated_prob : 1d ndarray
            Calibrated probability.
        """
        return self._transform(self._prepare(y_prob))

    def _transform(self, y_prob: np.ndarray) -> np.ndarray:
        """Applies the learned affine map followed by the sigmoid."""
        output = y_prob * self.coef_[0] + self.intercept_
        output = 1 / (1 + np.exp(-output))
        return output

    def fit_predict(self, y_prob: np.ndarray, y_true: np.ndarray) -> np.ndarray:
        """
        Chain the .fit and .predict step together.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        y_true : 1d ndarray
            Binary true targets.

        Returns
        -------
        y_calibrated_prob : 1d ndarray
            Calibrated probability.
        """
        # Prepare the features once and reuse them for fitting and scoring.
        # This deliberately does not go through self.fit, so subclasses that
        # override fit and call this method from it do not recurse.
        features = self._prepare(y_prob)
        self._fit_logistic(features, y_true)
        return self._transform(features)
179

180

181
class PlattHistogramCalibrator(PlattCalibrator):
    """
    Boils down to first applying a Logistic Regression then perform
    histogram binning.

    Parameters
    ----------
    log_odds : bool, default True
        Logistic Regression assumes a linear relationship between its input
        and the log-odds of the class probabilities. Converting the probability
        to log-odds scale typically improves performance.

    n_bins : int, default 15
        Number of bins. A bigger bin number requires more data. In general,
        the larger the number of bins, the closer the calibration error
        will be to the true calibration error.

    Attributes
    ----------
    coef_ : ndarray of shape (1,)
        Binary logistic regression's coefficient.

    intercept_ : ndarray of shape (1,)
        Binary logistic regression's intercept.

    bins_ : 1d ndarray
        Boundaries for each bin.

    bins_score_ : 1d ndarray
        Calibration score for each bin (mean Platt-scaled probability of the
        training samples in the bin).
    """

    def __init__(self, log_odds: bool = True, n_bins: int = 15):
        super().__init__(log_odds)
        self.n_bins = n_bins

    def fit(self, y_prob: np.ndarray, y_true: np.ndarray):
        """
        Learns the logistic regression weights and the
        bin boundaries and calibration score for each bin.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        y_true : 1d ndarray
            Binary true targets.

        Returns
        -------
        self
        """
        # Normalise array-likes up front; the parent and the binning helpers
        # are written against ndarrays.
        y_prob = np.asarray(y_prob, dtype=float)
        y_true = np.asarray(y_true)

        # Step 1: Platt scaling (explicitly the parent's fit_predict, so the
        # override below is not re-entered).
        y_prob_platt = super().fit_predict(y_prob, y_true)

        # Step 2: bin the Platt-scaled probabilities.
        _, binned_y_prob = create_binned_data(y_true, y_prob_platt, self.n_bins)
        self.bins_ = get_bin_boundaries(binned_y_prob)
        # Unlike HistogramCalibrator, each bin's score is the mean of the
        # Platt-scaled probabilities in it, not the mean of the labels.
        # NOTE(review): this looks like the "scaling-binning" scheme, so it
        # is presumably intentional -- confirm before changing.
        self.bins_score_ = np.array([np.mean(value) for value in binned_y_prob])
        return self

    def predict(self, y_prob: np.ndarray) -> np.ndarray:
        """
        Predicts the calibrated probability.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        Returns
        -------
        y_calibrated_prob : 1d ndarray
            Calibrated probability. Platt-scaled values above the last learned
            boundary are assigned the score of the last bin.
        """
        y_prob_platt = super().predict(y_prob)
        indices = np.searchsorted(self.bins_, y_prob_platt)
        # searchsorted returns len(bins_) for values greater than every
        # boundary; clamp so those fall into the last bin instead of raising
        # IndexError.
        indices = np.minimum(indices, len(self.bins_score_) - 1)
        return self.bins_score_[indices]

    def fit_predict(self, y_prob: np.ndarray, y_true: np.ndarray) -> np.ndarray:
        """
        Chain the .fit and .predict step together.

        The inherited implementation only fits the logistic regression and
        returns its un-binned output, leaving ``bins_``/``bins_score_`` unset;
        this override runs the full two-stage calibrator.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        y_true : 1d ndarray
            Binary true targets.

        Returns
        -------
        y_calibrated_prob : 1d ndarray
            Calibrated probability.
        """
        return self.fit(y_prob, y_true).predict(y_prob)
257

258
Product

Resources

Company