GitHub Repository: ethen8181/machine-learning
Path: blob/master/projects/kaggle_rossman_store_sales/gbt_module/model.py
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit

__all__ = ['GBTPipeline']

class GBTPipeline(BaseEstimator):
    """
    Gradient boosted tree pipeline set up to do a train/validation split
    and hyperparameter search.

    Parameters
    ----------
    input_cols : list of str
        Feature columns used as inputs to the model.

    cat_cols : list of str
        Subset of ``input_cols`` treated as categorical features.

    label_col : str
        Label/target column.

    weights_col : str or None
        Sample weight column; None means unweighted samples.

    model_task : str
        Either 'classification' or 'regression'.

    model_id : str
        Identifier for the model, also used as the default checkpoint name.

    model_type : str
        Only 'lgb' (LightGBM) is currently supported.

    model_parameters : dict
        Parameters passed to the model's constructor.

    model_hyper_parameters : dict
        Hyperparameter search space, passed to ``RandomizedSearchCV``
        as ``param_distributions``.

    search_parameters : dict
        Additional parameters passed to ``RandomizedSearchCV``, e.g. ``n_iter``.
    """

    def __init__(self, input_cols, cat_cols, label_col, weights_col,
                 model_task, model_id, model_type,
                 model_parameters, model_hyper_parameters, search_parameters):
        self.input_cols = input_cols
        self.cat_cols = cat_cols
        self.label_col = label_col
        self.weights_col = weights_col
        self.model_id = model_id
        self.model_type = model_type
        self.model_task = model_task
        self.model_parameters = model_parameters
        self.model_hyper_parameters = model_hyper_parameters
        self.search_parameters = search_parameters

    def fit(self, data, val_fold, fit_params=None):
        """
        Fit the pipeline to the input data.

        Parameters
        ----------
        data : pd.DataFrame
            Input training data. It is split into a train and a validation
            set according to ``val_fold``.

        val_fold : 1d ndarray
            The validation fold used for the `PredefinedSplit
            <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.PredefinedSplit.html>`_.
            Entries equal to -1 are always kept in the training set.

        fit_params : dict, default None
            Any additional parameters passed to the model's fit method, e.g.
            `LGBMClassifier.fit
            <https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMClassifier.fit>`_

        Returns
        -------
        self
        """
        data_features = data[self.input_cols]
        label = data[self.label_col]
        sample_weights = data[self.weights_col] if self.weights_col is not None else None

        self.fit_params_ = self._create_default_fit_params(data_features, label,
                                                           val_fold, sample_weights)
        if fit_params is not None:
            self.fit_params_.update(fit_params)

        model = self._get_model()
        cv = PredefinedSplit(val_fold)
        model_tuned = RandomizedSearchCV(
            estimator=model,
            param_distributions=self.model_hyper_parameters,
            cv=cv,
            **self.search_parameters
        ).fit(data_features, label, **self.fit_params_)
        self.model_tuned_ = model_tuned
        return self

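    # Illustrative sketch (an assumption, not part of this class's API): for
    # a single, time-ordered 80/20 split, ``val_fold`` can be built as
    #
    #     val_fold = np.full(len(data), -1, dtype=int)  # -1 -> always train
    #     val_fold[int(len(data) * 0.8):] = 0           # 0  -> validation fold
    #
    # PredefinedSplit(val_fold) then yields exactly one train/validation split.
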
    def _get_model(self):
        if self.model_task == 'classification' and self.model_type == 'lgb':
            from lightgbm import LGBMClassifier
            model = LGBMClassifier(**self.model_parameters)
        elif self.model_task == 'regression' and self.model_type == 'lgb':
            from lightgbm import LGBMRegressor
            model = LGBMRegressor(**self.model_parameters)
        else:
            raise ValueError("model_task should be 'regression' or 'classification' "
                             "and model_type should be 'lgb'")

        return model

    def _create_default_fit_params(self, data, label, val_fold, sample_weights):
        # rows whose val_fold entry is -1 belong to the training set,
        # everything else forms the validation set (PredefinedSplit convention)
        mask = val_fold != -1
        data_train = data[~mask]
        data_val = data[mask]
        label_train = label[~mask]
        label_val = label[mask]
        fit_params = {
            'eval_set': [(data_train, label_train), (data_val, label_val)],
            'feature_name': self.input_cols,
            'categorical_feature': self.cat_cols
        }
        if sample_weights is not None:
            # LightGBM's fit method expects the keyword ``sample_weight`` (singular)
            fit_params['sample_weight'] = sample_weights

        return fit_params

    def predict(self, data):
        """
        Prediction estimates from the best model.

        Parameters
        ----------
        data : pd.DataFrame
            Data that contains the same input_cols and cat_cols as the data
            that was used to fit the model.

        Returns
        -------
        prediction : ndarray
        """
        best = self.model_tuned_.best_estimator_
        return best.predict(data, num_iteration=best.best_iteration_)

    def get_feature_importance(self, threshold=1e-3):
        """
        Sort the feature importance based on decreasing order of the
        normalized gain.

        Parameters
        ----------
        threshold : float, default 1e-3
            Features that have a normalized gain smaller
            than the specified ``threshold`` will not be returned.

        Returns
        -------
        list of (feature_name, normalized_gain) tuples, sorted in
        decreasing order of the normalized gain.
        """
        booster = self.model_tuned_.best_estimator_.booster_
        importance = booster.feature_importance(importance_type='gain')
        importance /= importance.sum()
        feature_name = np.array(booster.feature_name())

        mask = importance > threshold
        importance = importance[mask]
        feature_name = feature_name[mask]
        idx = np.argsort(importance)[::-1]
        return list(zip(feature_name[idx], np.round(importance[idx], 4)))

    def save(self, path=None):
        """
        Serialize the pipeline with joblib and return the checkpoint path,
        which defaults to ``<model_id>.pkl`` when ``path`` is None.
        """
        import os
        from joblib import dump

        model_checkpoint = self.model_id + '.pkl' if path is None else path

        # create the directory if it's not the current directory and it doesn't exist already
        model_dir = os.path.split(model_checkpoint)[0]
        if model_dir.strip() and not os.path.isdir(model_dir):
            os.makedirs(model_dir, exist_ok=True)

        dump(self, model_checkpoint)
        return model_checkpoint

    @classmethod
    def load(cls, path):
        """Load a pipeline previously saved by :meth:`save` from ``path``."""
        from joblib import load
        loaded_model = load(path)
        return loaded_model
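
# ---------------------------------------------------------------------------
# Usage sketch: a minimal, hypothetical example of wiring up the pipeline on
# synthetic data. The column names, parameter values, and search ranges below
# are illustrative assumptions, not part of the module; it assumes pandas and
# lightgbm are installed.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import pandas as pd

    rng = np.random.RandomState(42)
    n_rows = 1000
    df = pd.DataFrame({
        'x1': rng.randn(n_rows),
        'x2': rng.randint(0, 5, n_rows),
        'y': rng.randint(0, 2, n_rows)
    })

    # the last 20% of the rows form the single validation fold; the -1
    # entries always stay in the training set (PredefinedSplit convention)
    val_fold = np.full(n_rows, -1, dtype=int)
    val_fold[int(n_rows * 0.8):] = 0

    pipeline = GBTPipeline(
        input_cols=['x1', 'x2'],
        cat_cols=['x2'],
        label_col='y',
        weights_col=None,
        model_task='classification',
        model_id='demo_model',
        model_type='lgb',
        model_parameters={'n_estimators': 50},
        model_hyper_parameters={'num_leaves': [15, 31, 63]},
        search_parameters={'n_iter': 3, 'random_state': 42}
    )
    pipeline.fit(df, val_fold)
    print(pipeline.get_feature_importance())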