Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
ethen8181
GitHub Repository: ethen8181/machine-learning
Path: blob/master/model_selection/prob_calibration/calibration_module/calibrator.py
2604 views
1
import numpy as np
2
from sklearn.linear_model import LogisticRegression
3
from sklearn.base import BaseEstimator
4
from calibration_module.utils import create_binned_data, get_bin_boundaries
5
6
7
# Public API of this module: the three calibrator classes defined below.
__all__ = ['HistogramCalibrator', 'PlattCalibrator', 'PlattHistogramCalibrator']
12
13
14
class HistogramCalibrator(BaseEstimator):
    """
    Calibrates scores with histogram binning, where each bin contains
    approximately the same number of samples.

    Parameters
    ----------
    n_bins : int, default 15
        Number of bins. A bigger number of bins requires more data.
        In general, the larger the number of bins, the closer the
        calibration error will be to the true calibration error.

    Attributes
    ----------
    bins_ : 1d ndarray
        Boundaries for each bin.

    bins_score_ : 1d ndarray
        Calibration score for each bin, i.e. the mean of the true
        targets that fell into the bin during ``fit``.
    """

    def __init__(self, n_bins: int = 15):
        self.n_bins = n_bins

    def fit(self, y_prob: np.ndarray, y_true: np.ndarray):
        """
        Learns the bin boundaries and calibration score for each bin.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        y_true : 1d ndarray
            Binary true targets.

        Returns
        -------
        self
        """
        binned_y_true, binned_y_prob = create_binned_data(y_true, y_prob, self.n_bins)
        self.bins_ = get_bin_boundaries(binned_y_prob)
        # the calibrated value of a bin is the observed positive rate inside it
        self.bins_score_ = np.array([np.mean(value) for value in binned_y_true])
        return self

    def predict(self, y_prob: np.ndarray) -> np.ndarray:
        """
        Predicts the calibrated probability.

        Scores that lie above the last learned boundary are assigned to
        the last bin instead of raising an ``IndexError``.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        Returns
        -------
        y_calibrated_prob : 1d ndarray
            Calibrated probability.
        """
        indices = np.searchsorted(self.bins_, y_prob)
        # searchsorted returns len(self.bins_) for values greater than every
        # boundary, which may be one past the last valid bin; clamp it so
        # out-of-range scores map to the last bin.
        last_bin = len(self.bins_score_) - 1
        return self.bins_score_[np.minimum(indices, last_bin)]
75
76
77
class PlattCalibrator(BaseEstimator):
    """
    Platt scaling: boils down to fitting a one-feature Logistic Regression
    on the raw score and using its sigmoid output as the calibrated
    probability.

    Parameters
    ----------
    log_odds : bool, default True
        Logistic Regression assumes a linear relationship between its input
        and the log-odds of the class probabilities. Converting the probability
        to log-odds scale typically improves performance.

    Attributes
    ----------
    coef_ : ndarray of shape (1,)
        Binary logistic regression's coefficient.

    intercept_ : ndarray of shape (1,)
        Binary logistic regression's intercept.
    """

    def __init__(self, log_odds: bool = True):
        self.log_odds = log_odds

    def fit(self, y_prob: np.ndarray, y_true: np.ndarray):
        """
        Learns the logistic regression weights.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        y_true : 1d ndarray
            Binary true targets.

        Returns
        -------
        self
        """
        self.fit_predict(y_prob, y_true)
        return self

    @staticmethod
    def _convert_to_log_odds(y_prob: np.ndarray) -> np.ndarray:
        """Maps probabilities to log-odds, i.e. log(p / (1 - p))."""
        # clip away exact 0 and 1 so that the log and the division stay finite
        eps = 1e-12
        y_prob = np.clip(y_prob, eps, 1 - eps)
        y_prob = np.log(y_prob / (1 - y_prob))
        return y_prob

    def _prepare_input(self, y_prob: np.ndarray) -> np.ndarray:
        """Coerces the raw scores to an ndarray and applies the optional log-odds transform."""
        # np.asarray lets lists and other array-likes behave the same whether
        # or not log_odds is enabled (the later .reshape needs an ndarray).
        y_prob = np.asarray(y_prob)
        if self.log_odds:
            y_prob = self._convert_to_log_odds(y_prob)
        return y_prob

    def predict(self, y_prob: np.ndarray) -> np.ndarray:
        """
        Predicts the calibrated probability.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        Returns
        -------
        y_calibrated_prob : 1d ndarray
            Calibrated probability.
        """
        return self._transform(self._prepare_input(y_prob))

    def _transform(self, y_prob: np.ndarray) -> np.ndarray:
        """Applies the learned linear map followed by a sigmoid to already prepared inputs."""
        logits = y_prob * self.coef_[0] + self.intercept_
        # sigmoid(z) = 1 / (1 + exp(-z)) = exp(-log(1 + exp(-z))).
        # logaddexp evaluates log(exp(0) + exp(-z)) without overflowing, so
        # very negative logits no longer trigger overflow RuntimeWarnings.
        return np.exp(-np.logaddexp(0, -logits))

    def fit_predict(self, y_prob: np.ndarray, y_true: np.ndarray) -> np.ndarray:
        """
        Chain the .fit and .predict step together.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        y_true : 1d ndarray
            Binary true targets.

        Returns
        -------
        y_calibrated_prob : 1d ndarray
            Calibrated probability.
        """
        y_prob = self._prepare_input(y_prob)

        # the class expects 2d ndarray as input features;
        # the very large C makes the regularization penalty negligible
        logistic = LogisticRegression(C=1e10, solver='lbfgs')
        logistic.fit(y_prob.reshape(-1, 1), y_true)
        # logistic.coef_ is 2d, keep only the row for our single feature
        self.coef_ = logistic.coef_[0]
        self.intercept_ = logistic.intercept_

        y_calibrated_prob = self._transform(y_prob)
        return y_calibrated_prob
179
180
181
class PlattHistogramCalibrator(PlattCalibrator):
    """
    Two-stage calibrator: a Logistic Regression (Platt scaling) is applied
    first, then its output is discretized with histogram binning.

    Parameters
    ----------
    log_odds : bool, default True
        Logistic Regression assumes a linear relationship between its input
        and the log-odds of the class probabilities. Converting the probability
        to log-odds scale typically improves performance.

    n_bins : int, default 15
        A bigger bin number requires more data. In general,
        the larger the bin size, the closer the calibration error
        will be to the true calibration error.

    Attributes
    ----------
    coef_ : ndarray of shape (1,)
        Binary logistic regression's coefficient.

    intercept_ : ndarray of shape (1,)
        Binary logistic regression's intercept.

    bins_ : 1d ndarray
        Boundaries for each bin.

    bins_score_ : 1d ndarray
        Calibration score for each bin. This is the mean of the Platt-scaled
        probabilities that fell into the bin during ``fit``.
    """

    def __init__(self, log_odds: bool = True, n_bins: int = 15):
        super().__init__(log_odds=log_odds)
        self.n_bins = n_bins

    def fit(self, y_prob: np.ndarray, y_true: np.ndarray):
        """
        Learns the logistic regression weights, then the bin boundaries
        and the calibration score of each bin.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        y_true : 1d ndarray
            Binary true targets.

        Returns
        -------
        self
        """
        # stage 1: Platt scaling, fitted and applied on the same data
        platt_scores = super().fit_predict(y_prob, y_true)

        # stage 2: bin the Platt-scaled scores; the binned targets are not used
        _, prob_bins = create_binned_data(y_true, platt_scores, self.n_bins)
        self.bins_ = get_bin_boundaries(prob_bins)
        # each bin's score is the average Platt-scaled probability inside it
        self.bins_score_ = np.array(list(map(np.mean, prob_bins)))
        return self

    def predict(self, y_prob: np.ndarray) -> np.ndarray:
        """
        Predicts the calibrated probability.

        Parameters
        ----------
        y_prob : 1d ndarray
            Raw probability/score of the positive class.

        Returns
        -------
        y_calibrated_prob : 1d ndarray
            Calibrated probability.
        """
        platt_scores = super().predict(y_prob)
        bin_index = np.searchsorted(self.bins_, platt_scores)
        return self.bins_score_[bin_index]
257
258