import numpy as np


class Tree:
"""
Classification tree using information gain with entropy as impurity
Parameters
----------
max_features : int or None, default None
The number of features to consider when looking for the best split,
None uses all features
min_samples_split : int, default 10
The minimum number of samples required to split an internal node
max_depth : int, default 3
Maximum depth of the tree
minimum_gain : float, default 1e-7
Minimum information gain required for splitting
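
    Examples
    --------
    A minimal usage sketch; it assumes ``X`` is a 2d feature array and ``y``
    holds integer class labels ``0 .. n_class - 1``::

        model = Tree(max_depth=3).fit(X, y)
        labels = model.predict(X)          # class index per row
        proba = model.predict_proba(X)     # (n_samples, n_class) probabilities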
"""

    def __init__(self, max_depth=3, max_features=None,
                 minimum_gain=1e-7, min_samples_split=10):
        self.max_depth = max_depth
        self.max_features = max_features
        self.minimum_gain = minimum_gain
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        """Fit the tree on the 2d feature array X and the response column y;
        y is expected to hold integer class labels 0 .. n_class - 1."""
        self.n_class = np.unique(y).shape[0]
        # cap max_features at the number of available columns
        if self.max_features is None or self.max_features > X.shape[1]:
            self.max_features = X.shape[1]
        self.feature_importance = np.zeros(X.shape[1])
        self.tree = _create_decision_tree(X, y, self.max_depth,
                                          self.minimum_gain, self.max_features,
                                          self.min_samples_split, self.n_class,
                                          self.feature_importance, X.shape[0])
        # normalize the importances to sum to 1, guarding against an
        # all-zero vector when the root is already a leaf
        total = np.sum(self.feature_importance)
        if total > 0:
            self.feature_importance /= total
        return self

    def predict(self, X):
        """Predict the most probable class index for each row of X"""
        proba = self.predict_proba(X)
        pred = np.argmax(proba, axis=1)
        return pred

    def predict_proba(self, X):
        """Predict class probabilities, one row of shape (n_class,) per sample"""
        proba = np.empty((X.shape[0], self.n_class))
        for i in range(X.shape[0]):
            proba[i] = self._predict_row(X[i, :], self.tree)
        return proba

    def _predict_row(self, row, tree):
        """Route a single row down the tree to its leaf's class probabilities"""
if tree['is_leaf']:
return tree['prob']
else:
if row[tree['split_col']] <= tree['threshold']:
return self._predict_row(row, tree['left'])
else:
return self._predict_row(row, tree['right'])
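

# The fitted tree is a nest of plain dicts. As an illustrative sketch (the
# values below are made up), a depth-1 stump over two classes would look like:
#
#     {'is_leaf': False, 'split_col': 0, 'threshold': 1.5,
#      'left':  {'is_leaf': True, 'prob': np.array([0.9, 0.1])},
#      'right': {'is_leaf': True, 'prob': np.array([0.2, 0.8])}}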


def _create_decision_tree(X, y, max_depth,
                          minimum_gain, max_features,
                          min_samples_split, n_class,
                          feature_importance, n_row):
    """Recursively grow the decision tree until a stopping criterion is reached"""
    if max_depth > 0 and X.shape[0] >= min_samples_split:
        column, value, gain = _find_best_split(X, y, max_features)
        # gain is None when no candidate split exists, e.g. every sampled
        # feature is constant within this node
        if gain is not None and gain > minimum_gain:
            # accumulate importance: gain weighted by the fraction of
            # training samples that reach this node
            feature_importance[column] += (X.shape[0] / n_row) * gain
            left_X, right_X, left_y, right_y = _split(X, y, column, value)
            left_child = _create_decision_tree(left_X, left_y, max_depth - 1,
                                               minimum_gain, max_features,
                                               min_samples_split, n_class,
                                               feature_importance, n_row)
            right_child = _create_decision_tree(right_X, right_y, max_depth - 1,
                                                minimum_gain, max_features,
                                                min_samples_split, n_class,
                                                feature_importance, n_row)
            return {'is_leaf': False,
                    'left': left_child,
                    'right': right_child,
                    'split_col': column,
                    'threshold': value}

    # stopping criterion reached: emit a leaf holding the class probabilities
    counts = np.bincount(y, minlength=n_class)
    prob = counts / y.shape[0]
    return {'is_leaf': True, 'prob': prob}


def _find_best_split(X, y, max_features):
    """Greedy search for the (feature, threshold) split with the highest
    information gain, gain = H(parent) - weighted entropy of the children;
    returns (None, None, None) when no candidate split exists."""
    # consider a random subset of the columns (all of them when
    # max_features equals the column count)
    subset = np.random.choice(X.shape[1], max_features, replace=False)
    max_col, max_val, max_gain = None, None, None
    parent_entropy = _compute_entropy(y)
    for column in subset:
        split_values = _find_splits(X, column)
        for value in split_values:
            splits = _split(X, y, column, value, return_X=False)
            gain = parent_entropy - _compute_splits_entropy(y, splits)
            if max_gain is None or gain > max_gain:
                max_col, max_val, max_gain = column, value, gain
    return max_col, max_val, max_gain


def _compute_entropy(split):
    """Entropy in bits (fixed log base 2): H(y) = -sum_i p_i * log2(p_i)"""
_, counts = np.unique(split, return_counts = True)
p = counts / split.shape[0]
entropy = -np.sum(p * np.log2(p))
return entropy


def _find_splits(X, column):
    """
    Find all candidate split values (thresholds) for a column:
    take the unique values in sorted order and use the midpoint
    between every pair of adjacent values as a cutoff.
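
    For example, unique values [1.0, 2.0, 4.0] yield the thresholds
    [1.5, 3.0] (illustrative numbers only).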
"""
    X_unique = np.unique(X[:, column])
    # midpoints between consecutive sorted unique values; empty when
    # the column is constant
    split_values = (X_unique[1:] + X_unique[:-1]) / 2
    return split_values


def _compute_splits_entropy(y, splits):
    """Weighted entropy of the child nodes: sum_k (n_k / n) * H(y_k)"""
splits_entropy = 0
for split in splits:
splits_entropy += (split.shape[0] / y.shape[0]) * _compute_entropy(split)
return splits_entropy


def _split(X, y, column, value, return_X=True):
    """Split the rows (and their responses) at the cutoff threshold on the given column"""
    left_mask = X[:, column] <= value
    right_mask = X[:, column] > value
    left_y, right_y = y[left_mask], y[right_mask]
    if not return_X:
        return left_y, right_y
    else:
        left_X, right_X = X[left_mask], X[right_mask]
        return left_X, right_X, left_y, right_y


__all__ = ['Tree']
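

if __name__ == '__main__':
    # A minimal smoke-test sketch, not part of the original module: fit the
    # tree on synthetic, well-separated two-class data and report the training
    # accuracy. The data and hyperparameters here are assumptions chosen
    # purely for illustration.
    rng = np.random.RandomState(0)
    X = np.vstack([rng.normal(0.0, 1.0, size=(50, 2)),
                   rng.normal(3.0, 1.0, size=(50, 2))])
    y = np.array([0] * 50 + [1] * 50)

    model = Tree(max_depth=3, min_samples_split=10).fit(X, y)
    accuracy = np.mean(model.predict(X) == y)
    print('training accuracy:', accuracy)
    print('feature importance:', model.feature_importance)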