GitHub Repository: ethen8181/machine-learning
Path: blob/master/projects/kaggle_rossman_store_sales/gbt_module/model.py
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit

__all__ = ['GBTPipeline']

class GBTPipeline(BaseEstimator):
    """
    Gradient boosted tree pipeline set up to do a train/validation split
    and hyperparameter search.

    Parameters
    ----------
    input_cols : list of str
        Feature columns used as inputs to the model.

    cat_cols : list of str
        Subset of ``input_cols`` treated as categorical features.

    label_col : str
        Label/target column.

    weights_col : str or None
        Sample weight column; None means unweighted samples.

    model_task : str
        Either 'classification' or 'regression'.

    model_id : str
        Identifier for the model, also used as the default checkpoint name.

    model_type : str
        Only 'lgb' (LightGBM) is currently supported.

    model_parameters : dict
        Parameters passed to the model's constructor.

    model_hyper_parameters : dict
        Hyperparameter search space, passed to ``RandomizedSearchCV``
        as ``param_distributions``.

    search_parameters : dict
        Additional parameters passed to ``RandomizedSearchCV``, e.g. ``n_iter``.
    """

    def __init__(self, input_cols, cat_cols, label_col, weights_col,
                 model_task, model_id, model_type,
                 model_parameters, model_hyper_parameters, search_parameters):
        self.input_cols = input_cols
        self.cat_cols = cat_cols
        self.label_col = label_col
        self.weights_col = weights_col
        self.model_id = model_id
        self.model_type = model_type
        self.model_task = model_task
        self.model_parameters = model_parameters
        self.model_hyper_parameters = model_hyper_parameters
        self.search_parameters = search_parameters

    def fit(self, data, val_fold, fit_params=None):
        """
        Fit the pipeline to the input data.

        Parameters
        ----------
        data : pd.DataFrame
            Input training data. It is split into a train and a validation
            set according to ``val_fold``.

        val_fold : 1d ndarray
            The validation fold used for the `PredefinedSplit
            <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.PredefinedSplit.html>`_.
            Entries equal to -1 are always kept in the training set.

        fit_params : dict, default None
            Any additional parameters passed to the model's fit method, e.g.
            `LGBMClassifier.fit
            <https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMClassifier.fit>`_

        Returns
        -------
        self
        """
        data_features = data[self.input_cols]
        label = data[self.label_col]
        sample_weights = data[self.weights_col] if self.weights_col is not None else None

        self.fit_params_ = self._create_default_fit_params(data_features, label,
                                                           val_fold, sample_weights)
        if fit_params is not None:
            self.fit_params_.update(fit_params)

        model = self._get_model()
        cv = PredefinedSplit(val_fold)
        model_tuned = RandomizedSearchCV(
            estimator=model,
            param_distributions=self.model_hyper_parameters,
            cv=cv,
            **self.search_parameters
        ).fit(data_features, label, **self.fit_params_)
        self.model_tuned_ = model_tuned
        return self

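    # Illustrative sketch (an assumption, not part of this class's API): for
    # a single, time-ordered 80/20 split, ``val_fold`` can be built as
    #
    #     val_fold = np.full(len(data), -1, dtype=int)  # -1 -> always train
    #     val_fold[int(len(data) * 0.8):] = 0           # 0  -> validation fold
    #
    # PredefinedSplit(val_fold) then yields exactly one train/validation split.
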
    def _get_model(self):
        if self.model_task == 'classification' and self.model_type == 'lgb':
            from lightgbm import LGBMClassifier
            model = LGBMClassifier(**self.model_parameters)
        elif self.model_task == 'regression' and self.model_type == 'lgb':
            from lightgbm import LGBMRegressor
            model = LGBMRegressor(**self.model_parameters)
        else:
            raise ValueError("model_task should be 'regression' or 'classification' "
                             "and model_type should be 'lgb'")

        return model

    def _create_default_fit_params(self, data, label, val_fold, sample_weights):
        # rows whose val_fold entry is -1 belong to the training set,
        # everything else forms the validation set (PredefinedSplit convention)
        mask = val_fold != -1
        data_train = data[~mask]
        data_val = data[mask]
        label_train = label[~mask]
        label_val = label[mask]
        fit_params = {
            'eval_set': [(data_train, label_train), (data_val, label_val)],
            'feature_name': self.input_cols,
            'categorical_feature': self.cat_cols
        }
        if sample_weights is not None:
            # LightGBM's fit method expects the keyword ``sample_weight`` (singular)
            fit_params['sample_weight'] = sample_weights

        return fit_params

    def predict(self, data):
        """
        Prediction estimates from the best model.

        Parameters
        ----------
        data : pd.DataFrame
            Data that contains the same input_cols and cat_cols as the data
            that was used to fit the model.

        Returns
        -------
        prediction : ndarray
        """
        best = self.model_tuned_.best_estimator_
        return best.predict(data, num_iteration=best.best_iteration_)

    def get_feature_importance(self, threshold=1e-3):
        """
        Sort the feature importance based on decreasing order of the
        normalized gain.

        Parameters
        ----------
        threshold : float, default 1e-3
            Features that have a normalized gain smaller
            than the specified ``threshold`` will not be returned.

        Returns
        -------
        list of (feature_name, normalized_gain) tuples, sorted in
        decreasing order of the normalized gain.
        """
        booster = self.model_tuned_.best_estimator_.booster_
        importance = booster.feature_importance(importance_type='gain')
        importance /= importance.sum()
        feature_name = np.array(booster.feature_name())

        mask = importance > threshold
        importance = importance[mask]
        feature_name = feature_name[mask]
        idx = np.argsort(importance)[::-1]
        return list(zip(feature_name[idx], np.round(importance[idx], 4)))

    def save(self, path=None):
        """
        Serialize the pipeline with joblib and return the checkpoint path,
        which defaults to ``<model_id>.pkl`` when ``path`` is None.
        """
        import os
        from joblib import dump

        model_checkpoint = self.model_id + '.pkl' if path is None else path

        # create the directory if it's not the current directory and it doesn't exist already
        model_dir = os.path.split(model_checkpoint)[0]
        if model_dir.strip() and not os.path.isdir(model_dir):
            os.makedirs(model_dir, exist_ok=True)

        dump(self, model_checkpoint)
        return model_checkpoint

    @classmethod
    def load(cls, path):
        """Load a pipeline previously saved by :meth:`save` from ``path``."""
        from joblib import load
        loaded_model = load(path)
        return loaded_model
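
# ---------------------------------------------------------------------------
# Usage sketch: a minimal, hypothetical example of wiring up the pipeline on
# synthetic data. The column names, parameter values, and search ranges below
# are illustrative assumptions, not part of the module; it assumes pandas and
# lightgbm are installed.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import pandas as pd

    rng = np.random.RandomState(42)
    n_rows = 1000
    df = pd.DataFrame({
        'x1': rng.randn(n_rows),
        'x2': rng.randint(0, 5, n_rows),
        'y': rng.randint(0, 2, n_rows)
    })

    # the last 20% of the rows form the single validation fold; the -1
    # entries always stay in the training set (PredefinedSplit convention)
    val_fold = np.full(n_rows, -1, dtype=int)
    val_fold[int(n_rows * 0.8):] = 0

    pipeline = GBTPipeline(
        input_cols=['x1', 'x2'],
        cat_cols=['x2'],
        label_col='y',
        weights_col=None,
        model_task='classification',
        model_id='demo_model',
        model_type='lgb',
        model_parameters={'n_estimators': 50},
        model_hyper_parameters={'num_leaves': [15, 31, 63]},
        search_parameters={'n_iter': 3, 'random_state': 42}
    )
    pipeline.fit(df, val_fold)
    print(pipeline.get_feature_importance())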