Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
ethen8181
GitHub Repository: ethen8181/machine-learning
Path: blob/master/model_selection/partial_dependence/partial_dependence.py
2585 views
1
import numpy as np
2
import pandas as pd
3
import matplotlib.pyplot as plt
4
from math import ceil
5
from joblib import Parallel, delayed
6
from matplotlib.gridspec import GridSpec
7
8
9
__all__ = ['PartialDependenceExplainer']
10
11
12
class PartialDependenceExplainer:
    """
    Partial Dependence explanation [1]_.

    - Supports scikit-learn like classification and regression classifiers.
    - Works for both numerical and categorical columns.

    Parameters
    ----------
    estimator : sklearn-like classifier
        Model that was fitted on the data.

    n_grid_points : int, default 50
        Number of grid points used in replacement
        for the original numeric data. Only used
        if the targeted column is numeric. For categorical
        column, the number of grid points will always be
        the distinct number of categories in that column.
        Smaller number of grid points serves as an
        approximation for the total number of unique
        points and will result in faster computation

    batch_size : int, default = 'auto'
        Compute partial dependence prediction batch by batch to save
        memory usage, the default batch size will be
        ceil(number of rows in the data / the number of grid points used)

    n_jobs : int, default 1
        Number of jobs to run in parallel, if the model already fits
        extremely fast on the data, then specify 1 so that there's no
        overhead of spawning different processes to do the computation

    verbose : int, default 1
        The verbosity level: if non zero, progress messages are printed.
        Above 50, the output is sent to stdout. The frequency of the messages increases
        with the verbosity level. If it is more than 10, all iterations are reported.

    pre_dispatch : int or str, default '2*n_jobs'
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. Possible inputs:

        - None, in which case all the jobs are immediately
          created and spawned. Use this for lightweight and
          fast-running jobs, to avoid delays due to on-demand
          spawning of the jobs
        - An int, giving the exact number of total jobs that are
          spawned
        - A string, giving an expression as a function of n_jobs,
          as in '2*n_jobs'

    Attributes
    ----------
    feature_name_ : str
        The input feature_name to the .fit unmodified, will
        be used in subsequent method.

    feature_type_ : str
        The input feature_type to the .fit unmodified, will
        be used in subsequent method.

    feature_grid_ : 1d ndarray
        Unique grid points that were used to generate the
        partial dependence result.

    results_ : list of DataFrame
        Partial dependence result. If it's a classification
        estimator then each index of the list is the result
        for each class. On the other hand, if it's a regression
        estimator, it will be a list with 1 element.

    References
    ----------
    .. [1] `Python partial dependence plot toolbox
            <https://github.com/SauceCat/PDPbox>`_
    """

    def __init__(self, estimator, n_grid_points = 50, batch_size = 'auto',
                 n_jobs = 1, verbose = 1, pre_dispatch = '2*n_jobs'):
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.estimator = estimator
        # bug fix: batch_size was previously accepted (and documented)
        # but never stored, so user-supplied values were silently ignored
        self.batch_size = batch_size
        self.pre_dispatch = pre_dispatch
        self.n_grid_points = n_grid_points

    def fit(self, data, feature_name, feature_type):
        """
        Obtain the partial dependence result.

        Parameters
        ----------
        data : DataFrame, shape [n_samples, n_features]
            Input data to the estimator/model.

        feature_name : str
            Feature's name in the data that we wish to explain.

        feature_type : str, {'num', 'cat'}
            Specify whether feature_name is a numerical or
            categorical column.

        Returns
        -------
        self
        """

        # check whether it's a classification or regression model;
        # classifiers expose classes_ and predict_proba, regressors don't
        estimator = self.estimator
        try:
            n_classes = estimator.classes_.size
            is_classifier = True
            predict = estimator.predict_proba
        except AttributeError:
            # for regression problem, still set the
            # number of classes to 1 to initialize
            # the loop later downstream
            n_classes = 1
            is_classifier = False
            predict = estimator.predict

        target = data[feature_name]
        unique_target = np.unique(target)
        n_unique = unique_target.size
        if feature_type == 'num':
            if self.n_grid_points >= n_unique:
                feature_grid = unique_target
            else:
                # when the number of required grid points is smaller than the number of
                # unique values, we choose the percentile points to make sure the grid points
                # span widely across the whole value range
                percentile = np.percentile(target, np.linspace(0, 100, self.n_grid_points))
                feature_grid = np.unique(percentile)

            feature_cols = feature_grid
        else:
            feature_grid = unique_target
            feature_cols = np.asarray(['{}_{}'.format(feature_name, category)
                                       for category in unique_target])

        # compute prediction batch by batch to save memory usage;
        # honor a user-supplied batch_size, falling back to the
        # documented 'auto' heuristic
        n_rows = data.shape[0]
        if self.batch_size == 'auto':
            batch_size = ceil(n_rows / feature_grid.size)
        else:
            batch_size = self.batch_size

        parallel = Parallel(
            n_jobs = self.n_jobs, verbose = self.verbose, pre_dispatch = self.pre_dispatch)
        outputs = parallel(delayed(_predict_batch)(data_batch,
                                                   feature_grid,
                                                   feature_name,
                                                   is_classifier,
                                                   n_classes,
                                                   predict)
                           for data_batch in _data_iter(data, batch_size))

        # each output is a list with one DataFrame per class; zip regroups
        # them so each concatenated result holds all batches for one class
        results = []
        for output in zip(*outputs):
            result = pd.concat(output, ignore_index = True)
            result.columns = feature_cols
            results.append(result)

        self.results_ = results
        self.feature_name_ = feature_name
        self.feature_grid_ = feature_grid
        self.feature_type_ = feature_type
        return self

    def plot(self, centered = True, target_class = 0):
        """
        Use the partial dependence result to generate
        a partial dependence plot (using matplotlib).

        Parameters
        ----------
        centered : bool, default True
            Center the partial dependence plot by subtracting every partial
            dependence result table's column value with the value of the first
            column, i.e. first column's value will serve as the baseline
            (centered at 0) for all other values.

        target_class : int, default 0
            The target class to show for the partial dependence result,
            for regression task, we can leave the default number unmodified,
            but for classification task, we should specify the target class
            parameter to meet our needs

        Returns
        -------
        figure
        """
        figure = GridSpec(5, 1)
        ax1 = plt.subplot(figure[0, :])
        self._plot_title(ax1)
        ax2 = plt.subplot(figure[1:, :])
        self._plot_content(ax2, centered, target_class)
        return figure

    def _plot_title(self, ax):
        """Render the title/subtitle header onto a dedicated (axis-less) axes."""
        font_family = 'Arial'
        title = 'Partial Dependence Plot for {}'.format(self.feature_name_)
        subtitle = 'Number of unique grid points: {}'.format(self.feature_grid_.size)
        title_fontsize = 15
        subtitle_fontsize = 12

        ax.set_facecolor('white')
        ax.text(
            0, 0.7, title,
            fontsize = title_fontsize, fontname = font_family)
        ax.text(
            0, 0.4, subtitle, color = 'grey',
            fontsize = subtitle_fontsize, fontname = font_family)
        ax.axis('off')

    def _plot_content(self, ax, centered, target_class):
        """Draw the mean partial dependence line with a +/- 1 std band."""
        # pd (partial dependence) styling constants
        pd_linewidth = 2
        pd_markersize = 5
        pd_color = '#1A4E5D'
        fill_alpha = 0.2
        fill_color = '#66C2D7'
        zero_linewidth = 1.5
        zero_color = '#E75438'
        xlabel_fontsize = 10

        results = self.results_[target_class]
        feature_cols = results.columns
        if self.feature_type_ == 'cat':
            # ticks = all the unique categories
            x = range(len(feature_cols))
            ax.set_xticks(x)
            ax.set_xticklabels(feature_cols)
        else:
            x = feature_cols

        # center the partial dependence plot by subtracting every value
        # with the value of the first column, i.e. first column's value
        # will serve as the baseline (centered at 0) for all other values
        # (renamed from `pd` to avoid shadowing the pandas module alias)
        pd_mean = results.values.mean(axis = 0)
        if centered:
            pd_mean -= pd_mean[0]

        pd_std = results.values.std(axis = 0)
        upper = pd_mean + pd_std
        lower = pd_mean - pd_std

        ax.plot(
            x, pd_mean, color = pd_color, linewidth = pd_linewidth,
            marker = 'o', markersize = pd_markersize)
        ax.plot(
            x, [0] * pd_mean.size, color = zero_color,
            linestyle = '--', linewidth = zero_linewidth)
        ax.fill_between(x, upper, lower, alpha = fill_alpha, color = fill_color)
        ax.set_xlabel(self.feature_name_, fontsize = xlabel_fontsize)
        self._modify_axis(ax)

    def _modify_axis(self, ax):
        """Apply the shared minimalist axis styling (no spines, light grid)."""
        tick_labelsize = 8
        tick_colors = '#9E9E9E'
        tick_labelcolor = '#424242'

        ax.tick_params(
            axis = 'both', which = 'major', colors = tick_colors,
            labelsize = tick_labelsize, labelcolor = tick_labelcolor)

        ax.set_facecolor('white')
        ax.get_yaxis().tick_left()
        ax.get_xaxis().tick_bottom()
        for direction in ('top', 'left', 'right', 'bottom'):
            ax.spines[direction].set_visible(False)

        for axis in ('x', 'y'):
            ax.grid(True, 'major', axis, ls = '--', lw = .5, c = 'k', alpha = .3)
280
281
282
def _data_iter(data, batch_size):
283
"""Used by PartialDependenceExplainer to loop through the data by batch"""
284
n_rows = data.shape[0]
285
for i in range(0, n_rows, batch_size):
286
yield data[i:i + batch_size].reset_index(drop = True)
287
288
289
def _predict_batch(data_batch, feature_grid, feature_name,
290
is_classifier, n_classes, predict):
291
"""Used by PartialDependenceExplainer to generate prediction by batch"""
292
293
# repeat the index and use it to slice the data to create the repeated data
294
# instead of creating the repetition using the values, i.e.
295
# np.repeat(data_batch.values, repeats = feature_grid.size, axis = 0)
296
# this prevents everything from getting converted to a different data type, e.g.
297
# if there is 1 object type column then everything would get converted to object
298
index_batch = np.repeat(data_batch.index.values, repeats = feature_grid.size)
299
ice_data = data_batch.iloc[index_batch].copy()
300
ice_data[feature_name] = np.tile(feature_grid, data_batch.shape[0])
301
302
results = []
303
prediction = predict(ice_data)
304
for n_class in range(n_classes):
305
if is_classifier:
306
result = prediction[:, n_class]
307
else:
308
result = prediction
309
310
# reshape tiled data back to original batch's shape
311
reshaped = result.reshape((data_batch.shape[0], feature_grid.size))
312
result = pd.DataFrame(reshaped)
313
results.append(result)
314
315
return results
316
317