Path: blob/master/projects/kaggle_rossman_store_sales/gbt_module/model.py
2617 views
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit

__all__ = ['GBTPipeline']


class GBTPipeline(BaseEstimator):
    """
    Gradient Boosted Tree Pipeline set up to do train/validation split
    and hyperparameter search.

    Parameters
    ----------
    input_cols : list of str
        Feature columns fed to the model.

    cat_cols : list of str
        Subset of ``input_cols`` that LightGBM should treat as categorical.

    label_col : str
        Target column.

    weights_col : str or None
        Column holding per-row sample weights. ``None`` disables weighting.

    model_task : str
        Either ``'classification'`` or ``'regression'``.

    model_id : str
        Identifier used as the default checkpoint filename in :meth:`save`.

    model_type : str
        Backend library; only ``'lgb'`` (LightGBM) is supported.

    model_parameters : dict
        Fixed keyword arguments passed to the LightGBM model constructor.

    model_hyper_parameters : dict
        Parameter distributions searched by ``RandomizedSearchCV``.

    search_parameters : dict
        Extra keyword arguments for ``RandomizedSearchCV``
        (e.g. ``n_iter``, ``scoring``, ``n_jobs``).

    Attributes
    ----------
    fit_params_ : dict
        Parameters that were forwarded to the underlying model's ``fit``.

    model_tuned_ : RandomizedSearchCV
        Fitted hyperparameter search object.
    """

    def __init__(self, input_cols, cat_cols, label_col, weights_col,
                 model_task, model_id, model_type,
                 model_parameters, model_hyper_parameters, search_parameters):
        self.input_cols = input_cols
        self.cat_cols = cat_cols
        self.label_col = label_col
        self.weights_col = weights_col
        self.model_id = model_id
        self.model_type = model_type
        self.model_task = model_task
        self.model_parameters = model_parameters
        self.model_hyper_parameters = model_hyper_parameters
        self.search_parameters = search_parameters

    def fit(self, data, val_fold, fit_params=None):
        """
        Fit the pipeline to the input data.

        Parameters
        ----------
        data : pd.DataFrame
            Input training data. The data will be split into train/validation set
            by providing the val_fold (validation fold) parameter.

        val_fold : 1d ndarray
            The validation fold used for the `PredefinedSplit
            <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.PredefinedSplit.html>`_
            Entries equal to -1 mark rows that are always in the training set.

        fit_params : dict, default None
            Any additional parameters that are passed to the fit method of the
            model, e.g. `LGBMClassifier.fit
            <https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMClassifier.fit>`_.
            Keys given here override the defaults built by
            ``_create_default_fit_params``.

        Returns
        -------
        self
        """
        data_features = data[self.input_cols]
        label = data[self.label_col]
        sample_weights = data[self.weights_col] if self.weights_col is not None else None

        self.fit_params_ = self._create_default_fit_params(data_features, label,
                                                           val_fold, sample_weights)
        if fit_params is not None:
            self.fit_params_.update(fit_params)

        model = self._get_model()
        cv = PredefinedSplit(val_fold)
        model_tuned = RandomizedSearchCV(
            estimator=model,
            param_distributions=self.model_hyper_parameters,
            cv=cv,
            **self.search_parameters
        ).fit(data_features, label, **self.fit_params_)
        self.model_tuned_ = model_tuned
        return self

    def _get_model(self):
        """Instantiate the raw LightGBM estimator for the configured task.

        Imports are deferred so lightgbm is only required when fitting.

        Raises
        ------
        ValueError
            If ``model_task`` is not 'classification' or 'regression'
            (with ``model_type == 'lgb'``).
        """
        if self.model_task == 'classification' and self.model_type == 'lgb':
            from lightgbm import LGBMClassifier
            model = LGBMClassifier(**self.model_parameters)
        elif self.model_task == 'regression' and self.model_type == 'lgb':
            from lightgbm import LGBMRegressor
            model = LGBMRegressor(**self.model_parameters)
        else:
            raise ValueError("model_task should be regression/classification")

        return model

    def _create_default_fit_params(self, data, label, val_fold, sample_weights):
        """Build the default fit parameters forwarded to the LightGBM fit call.

        The eval_set contains the (train, validation) pair derived from
        ``val_fold`` so LightGBM can report metrics on both during training.

        Parameters
        ----------
        data : pd.DataFrame
            Feature columns only (``input_cols``).

        label : pd.Series
            Target values aligned with ``data``.

        val_fold : 1d ndarray
            PredefinedSplit fold indicator; -1 marks training rows.

        sample_weights : pd.Series or None
            Per-row weights, or None when ``weights_col`` is not set.

        Returns
        -------
        fit_params : dict
        """
        mask = val_fold != -1
        data_train = data[~mask]
        data_val = data[mask]
        label_train = label[~mask]
        label_val = label[mask]
        fit_params = {
            'eval_set': [(data_train, label_train), (data_val, label_val)],
            'feature_name': self.input_cols,
            'categorical_feature': self.cat_cols
        }
        if sample_weights is not None:
            # BUG FIX: the LightGBM sklearn API's keyword is `sample_weight`
            # (singular); the previous key 'sample_weights' raised
            # TypeError: fit() got an unexpected keyword argument.
            # NOTE(review): this is the full-length weight vector, while
            # RandomizedSearchCV slices X/y per fold — confirm the installed
            # sklearn version routes/slices fit params as expected.
            fit_params['sample_weight'] = sample_weights

        return fit_params

    def predict(self, data):
        """
        Prediction estimates from the best model.

        Parameters
        ----------
        data : pd.DataFrame
            Data that contains the same input_cols and cat_cols as the data that
            was used to fit the model.

        Returns
        -------
        prediction : ndarray
        """
        best = self.model_tuned_.best_estimator_
        # NOTE(review): best_iteration_ is only populated when early stopping
        # was used during fit — confirm early_stopping_rounds is always passed.
        return best.predict(data, num_iteration=best.best_iteration_)

    def get_feature_importance(self, threshold=1e-3):
        """
        Sort the feature importance based on decreasing order of the
        normalized gain.

        Parameters
        ----------
        threshold : float, default 1e-3
            Features that have a normalized gain smaller
            than the specified ``threshold`` will not be returned.

        Returns
        -------
        list of (str, float)
            (feature_name, normalized_gain) pairs, gain rounded to 4 decimals,
            sorted by decreasing gain.
        """
        booster = self.model_tuned_.best_estimator_.booster_
        importance = booster.feature_importance(importance_type='gain')
        importance /= importance.sum()
        feature_name = np.array(booster.feature_name())

        mask = importance > threshold
        importance = importance[mask]
        feature_name = feature_name[mask]
        idx = np.argsort(importance)[::-1]
        return list(zip(feature_name[idx], np.round(importance[idx], 4)))

    def save(self, path=None):
        """Persist the whole pipeline with joblib.

        Parameters
        ----------
        path : str, default None
            Checkpoint path; defaults to ``'<model_id>.pkl'`` in the
            current directory.

        Returns
        -------
        str
            The path the model was written to.
        """
        import os
        from joblib import dump

        model_checkpoint = self.model_id + '.pkl' if path is None else path

        # create the directory if it's not the current directory and it doesn't exist already
        model_dir = os.path.split(model_checkpoint)[0]
        if model_dir.strip() and not os.path.isdir(model_dir):
            os.makedirs(model_dir, exist_ok=True)

        dump(self, model_checkpoint)
        return model_checkpoint

    @classmethod
    def load(cls, path):
        """Load a pipeline previously written by :meth:`save`.

        NOTE: joblib deserialization executes pickle — only load
        checkpoints from trusted sources.
        """
        from joblib import load
        loaded_model = load(path)
        return loaded_model