CoCalc -- partial_dependence.py

GitHub Repository: ethen8181/machine-learning
Path: blob/master/model_selection/partial_dependence/partial_dependence.py
2585 views
1
import numpy as np
2
import pandas as pd
3
import matplotlib.pyplot as plt
4
from math import ceil
5
from joblib import Parallel, delayed
6
from matplotlib.gridspec import GridSpec
7

8

9
__all__ = ['PartialDependenceExplainer']
10

11

12
class PartialDependenceExplainer:
    """
    Partial Dependence explanation [1]_.

    - Supports scikit-learn like classification and regression estimators.
    - Works for both numerical and categorical columns.

    Parameters
    ----------
    estimator : sklearn-like estimator
        Model that was fitted on the data.

    n_grid_points : int, default 50
        Number of grid points used in replacement
        for the original numeric data. Only used
        if the targeted column is numeric. For categorical
        column, the number of grid points will always be
        the distinct number of categories in that column.
        Smaller number of grid points serves as an
        approximation for the total number of unique
        points and will result in faster computation.

    batch_size : int or 'auto', default 'auto'
        Number of rows of the data that are processed per batch. The
        partial dependence prediction is computed batch by batch to save
        memory usage. With 'auto' the batch size will be
        ceil(number of rows in the data / the number of grid points used).

    n_jobs : int, default 1
        Number of jobs to run in parallel, if the model already predicts
        extremely fast on the data, then specify 1 so that there's no
        overhead of spawning different processes to do the computation.

    verbose : int, default 1
        The verbosity level: if non zero, progress messages are printed.
        Above 50, the output is sent to stdout. The frequency of the messages
        increases with the verbosity level. If it is more than 10, all
        iterations are reported.

    pre_dispatch : int or str, default '2*n_jobs'
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. Possible inputs:
            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs
            - An int, giving the exact number of total jobs that are
              spawned
            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    Attributes
    ----------
    feature_name_ : str
        The input feature_name to the .fit unmodified, will
        be used in subsequent method.

    feature_type_ : str
        The input feature_type to the .fit unmodified, will
        be used in subsequent method.

    feature_grid_ : 1d ndarray
        Unique grid points that were used to generate the
        partial dependence result.

    results_ : list of DataFrame
        Partial dependence result. If it's a classification
        estimator then each index of the list is the result
        for each class. On the other hand, if it's a regression
        estimator, it will be a list with 1 element.

    References
    ----------
    .. [1] `Python partial dependence plot toolbox
            <https://github.com/SauceCat/PDPbox>`_
    """

    def __init__(self, estimator, n_grid_points = 50, batch_size = 'auto',
                 n_jobs = 1, verbose = 1, pre_dispatch = '2*n_jobs'):
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.estimator = estimator
        self.batch_size = batch_size
        self.pre_dispatch = pre_dispatch
        self.n_grid_points = n_grid_points

    def fit(self, data, feature_name, feature_type):
        """
        Obtain the partial dependence result.

        Parameters
        ----------
        data : DataFrame, shape [n_samples, n_features]
            Input data to the estimator/model.

        feature_name : str
            Feature's name in the data that we wish to explain.

        feature_type : str, {'num', 'cat'}
            Specify whether feature_name is a numerical or
            categorical column. Any value other than 'num' is
            handled as a categorical column.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If data has no rows or if batch_size is neither 'auto'
            nor a positive integer.
        """
        n_rows = data.shape[0]
        if n_rows == 0:
            raise ValueError('data must contain at least one row')

        # check whether it's a classification or regression model
        estimator = self.estimator
        try:
            n_classes = estimator.classes_.size
            is_classifier = True
            predict = estimator.predict_proba
        except AttributeError:
            # for regression problem, still set the
            # number of classes to 1 to initialize
            # the loop later downstream
            n_classes = 1
            is_classifier = False
            predict = estimator.predict

        feature_grid, feature_cols = self._make_feature_grid(
            data[feature_name], feature_name, feature_type)

        # compute prediction batch by batch to save memory usage
        batch_size = self._resolve_batch_size(n_rows, feature_grid.size)
        parallel = Parallel(
            n_jobs = self.n_jobs, verbose = self.verbose, pre_dispatch = self.pre_dispatch)
        outputs = parallel(delayed(_predict_batch)(data_batch,
                                                   feature_grid,
                                                   feature_name,
                                                   is_classifier,
                                                   n_classes,
                                                   predict)
                           for data_batch in _data_iter(data, batch_size))

        # every batch yields one DataFrame per class, zip regroups them
        # so that all the batches of the same class are stacked together
        results = []
        for output in zip(*outputs):
            result = pd.concat(output, ignore_index = True)
            result.columns = feature_cols
            results.append(result)

        self.results_ = results
        self.feature_name_ = feature_name
        self.feature_grid_ = feature_grid
        self.feature_type_ = feature_type
        return self

    def _make_feature_grid(self, target, feature_name, feature_type):
        """
        Build the grid points that replace the feature's values and
        the column labels of the partial dependence result table.
        """
        unique_target = np.unique(target)
        if feature_type == 'num':
            if self.n_grid_points >= unique_target.size:
                feature_grid = unique_target
            else:
                # when the number of required grid points is smaller than the number of
                # unique values, we choose the percentile points to make sure the grid points
                # span widely across the whole value range
                percentile = np.percentile(target, np.linspace(0, 100, self.n_grid_points))
                feature_grid = np.unique(percentile)

            feature_cols = feature_grid
        else:
            feature_grid = unique_target
            feature_cols = np.asarray(['{}_{}'.format(feature_name, category)
                                       for category in unique_target])

        return feature_grid, feature_cols

    def _resolve_batch_size(self, n_rows, n_grid_points):
        """Turn the batch_size parameter into the number of rows per batch."""
        batch_size = self.batch_size
        if isinstance(batch_size, str):
            if batch_size != 'auto':
                raise ValueError(
                    "batch_size must be 'auto' or a positive integer, "
                    'got {!r}'.format(batch_size))

            return ceil(n_rows / n_grid_points)

        # bool is a subclass of int, reject it explicitly
        is_integer = (isinstance(batch_size, (int, np.integer)) and
                      not isinstance(batch_size, bool))
        if not is_integer or batch_size < 1:
            raise ValueError(
                "batch_size must be 'auto' or a positive integer, "
                'got {!r}'.format(batch_size))

        return int(batch_size)

    def plot(self, centered = True, target_class = 0):
        """
        Use the partial dependence result to generate
        a partial dependence plot (using matplotlib).

        Parameters
        ----------
        centered : bool, default True
            Center the partial dependence plot by subtracting every partial
            dependence result table's column value with the value of the first
            column, i.e. first column's value will serve as the baseline
            (centered at 0) for all other values.

        target_class : int, default 0
            The target class to show for the partial dependence result,
            for regression task, we can leave the default number unmodified,
            but for classification task, we should specify the target class
            parameter to meet our needs.

        Returns
        -------
        figure : matplotlib GridSpec
            The grid layout whose first row holds the title and whose
            remaining rows hold the partial dependence plot.
        """
        figure = GridSpec(5, 1)
        ax1 = plt.subplot(figure[0, :])
        self._plot_title(ax1)
        ax2 = plt.subplot(figure[1:, :])
        self._plot_content(ax2, centered, target_class)
        return figure

    def _plot_title(self, ax):
        """Write the title and the subtitle on an axis without frame."""
        font_family = 'Arial'
        title = 'Partial Dependence Plot for {}'.format(self.feature_name_)
        subtitle = 'Number of unique grid points: {}'.format(self.feature_grid_.size)
        title_fontsize = 15
        subtitle_fontsize = 12

        ax.set_facecolor('white')
        ax.text(
            0, 0.7, title,
            fontsize = title_fontsize, fontname = font_family)
        ax.text(
            0, 0.4, subtitle, color = 'grey',
            fontsize = subtitle_fontsize, fontname = font_family)
        ax.axis('off')

    def _plot_content(self, ax, centered, target_class):
        """Draw the mean partial dependence line with a +/- 1 std band."""
        # pd (partial dependence)
        pd_linewidth = 2
        pd_markersize = 5
        pd_color = '#1A4E5D'
        fill_alpha = 0.2
        fill_color = '#66C2D7'
        zero_linewidth = 1.5
        zero_color = '#E75438'
        xlabel_fontsize = 10

        results = self.results_[target_class]
        feature_cols = results.columns
        if self.feature_type_ == 'cat':
            # ticks = all the unique categories
            x = range(len(feature_cols))
            ax.set_xticks(x)
            ax.set_xticklabels(feature_cols)
        else:
            x = feature_cols

        # center the partial dependence plot by subtracting every value
        # with the value of the first column, i.e. first column's value
        # will serve as the baseline (centered at 0) for all other values;
        # the local is not named pd to avoid shadowing the pandas module
        pd_mean = results.values.mean(axis = 0)
        if centered:
            pd_mean -= pd_mean[0]

        pd_std = results.values.std(axis = 0)
        upper = pd_mean + pd_std
        lower = pd_mean - pd_std

        ax.plot(
            x, pd_mean, color = pd_color, linewidth = pd_linewidth,
            marker = 'o', markersize = pd_markersize)
        ax.plot(
            x, [0] * pd_mean.size, color = zero_color,
            linestyle = '--', linewidth = zero_linewidth)
        ax.fill_between(x, upper, lower, alpha = fill_alpha, color = fill_color)
        ax.set_xlabel(self.feature_name_, fontsize = xlabel_fontsize)
        self._modify_axis(ax)

    def _modify_axis(self, ax):
        """Apply the tick, spine and grid styling to the content axis."""
        tick_labelsize = 8
        tick_colors = '#9E9E9E'
        tick_labelcolor = '#424242'

        ax.tick_params(
            axis = 'both', which = 'major', colors = tick_colors,
            labelsize = tick_labelsize, labelcolor = tick_labelcolor)

        ax.set_facecolor('white')
        ax.get_yaxis().tick_left()
        ax.get_xaxis().tick_bottom()
        for direction in ('top', 'left', 'right', 'bottom'):
            ax.spines[direction].set_visible(False)

        for axis in ('x', 'y'):
            ax.grid(True, which = 'major', axis = axis, ls = '--', lw = .5, c = 'k', alpha = .3)
280

281

282
def _data_iter(data, batch_size):
283
    """Used by PartialDependenceExplainer to loop through the data by batch"""
284
    n_rows = data.shape[0]
285
    for i in range(0, n_rows, batch_size):
286
        yield data[i:i + batch_size].reset_index(drop = True)
287

288

289
def _predict_batch(data_batch, feature_grid, feature_name,
290
                   is_classifier, n_classes, predict):
291
    """Used by PartialDependenceExplainer to generate prediction by batch"""
292

293
    # repeat the index and use it to slice the data to create the repeated data
294
    # instead of creating the repetition using the values, i.e.
295
    # np.repeat(data_batch.values, repeats = feature_grid.size, axis = 0)
296
    # this prevents everything from getting converted to a different data type, e.g.
297
    # if there is 1 object type column then everything would get converted to object
298
    index_batch = np.repeat(data_batch.index.values, repeats = feature_grid.size)
299
    ice_data = data_batch.iloc[index_batch].copy()
300
    ice_data[feature_name] = np.tile(feature_grid, data_batch.shape[0])
301

302
    results = []
303
    prediction = predict(ice_data)
304
    for n_class in range(n_classes):
305
        if is_classifier:
306
            result = prediction[:, n_class]
307
        else:
308
            result = prediction
309

310
        # reshape tiled data back to original batch's shape
311
        reshaped = result.reshape((data_batch.shape[0], feature_grid.size))
312
        result = pd.DataFrame(reshaped)
313
        results.append(result)
314

315
    return results
316

317
Product

Resources

Company