GitHub Repository: ethen8181/machine-learning
Path: blob/master/trees/tree.py
import numpy as np


class Tree:
    """
    Classification tree using information gain with entropy as impurity

    Parameters
    ----------
    max_features : int or None, default None
        The number of features to consider when looking for the best split,
        None uses all features

    min_samples_split : int, default 10
        The minimum number of samples required to split an internal node

    max_depth : int, default 3
        Maximum depth of the tree

    minimum_gain : float, default 1e-7
        Minimum information gain required for splitting
    """

    def __init__(self, max_depth = 3, max_features = None,
                 minimum_gain = 1e-7, min_samples_split = 10):
        self.max_depth = max_depth
        self.max_features = max_features
        self.minimum_gain = minimum_gain
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        """pass in the 2d-array dataset and the response column"""
        self.n_class = np.unique(y).shape[0]

        # in case you're wondering why we have this implementation of
        # choosing the number of features to consider when looking
        # for the best split, it will become much clearer when we
        # start discussing the Random Forest algorithm
        if self.max_features is None or self.max_features > X.shape[1]:
            self.max_features = X.shape[1]

        self.feature_importance = np.zeros(X.shape[1])
        self.tree = _create_decision_tree(X, y, self.max_depth,
                                          self.minimum_gain, self.max_features,
                                          self.min_samples_split, self.n_class,
                                          self.feature_importance, X.shape[0])

        # normalize the accumulated gains so the importances sum to 1
        self.feature_importance /= np.sum(self.feature_importance)
        return self

    def predict(self, X):
        """predict the class label for each row of the 2d-array dataset"""
        proba = self.predict_proba(X)
        pred = np.argmax(proba, axis = 1)
        return pred

    def predict_proba(self, X):
        """predict the class probabilities for each row of the 2d-array dataset"""
        proba = np.empty((X.shape[0], self.n_class))
        for i in range(X.shape[0]):
            proba[i] = self._predict_row(X[i, :], self.tree)

        return proba

    def _predict_row(self, row, tree):
        """predict a single row by walking down the tree until a leaf is reached"""
        if tree['is_leaf']:
            return tree['prob']
        else:
            if row[tree['split_col']] <= tree['threshold']:
                return self._predict_row(row, tree['left'])
            else:
                return self._predict_row(row, tree['right'])


def _create_decision_tree(X, y, max_depth,
                          minimum_gain, max_features,
                          min_samples_split, n_class,
                          feature_importance, n_row):
    """recursively grow the decision tree until it reaches the stopping criteria"""
    try:
        # the assertions act as stopping criteria: whenever one of them
        # fails, the recursion stops and the current node becomes a leaf
        assert max_depth > 0
        assert X.shape[0] >= min_samples_split
        column, value, gain = _find_best_split(X, y, max_features)
        assert gain is not None and gain > minimum_gain

        # accumulate the feature importance, weighting the gain by
        # the fraction of samples that reach this node
        feature_importance[column] += (X.shape[0] / n_row) * gain

        # split the dataset and grow the left and right child
        left_X, right_X, left_y, right_y = _split(X, y, column, value)
        left_child = _create_decision_tree(left_X, left_y, max_depth - 1,
                                           minimum_gain, max_features,
                                           min_samples_split, n_class,
                                           feature_importance, n_row)
        right_child = _create_decision_tree(right_X, right_y, max_depth - 1,
                                            minimum_gain, max_features,
                                            min_samples_split, n_class,
                                            feature_importance, n_row)
    except AssertionError:
        # if a stopping criterion is met, compute the classification
        # probabilities and return them as a leaf node

        # note that a leaf node may contain only a subset of the classes,
        # so we specify minlength so that classes that don't appear
        # still get assigned a probability of 0
        counts = np.bincount(y, minlength = n_class)
        prob = counts / y.shape[0]
        leaf = {'is_leaf': True, 'prob': prob}
        return leaf

    node = {'is_leaf': False,
            'left': left_child,
            'right': right_child,
            'split_col': column,
            'threshold': value}
    return node


def _find_best_split(X, y, max_features):
    """Greedy algorithm to find the best feature and value for a split"""
    # consider only a random subset of the features of size max_features;
    # this is also what introduces the extra randomness in a Random Forest
    subset = np.random.choice(X.shape[1], max_features, replace = False)
    max_col, max_val, max_gain = None, None, None
    parent_entropy = _compute_entropy(y)

    for column in subset:
        split_values = _find_splits(X, column)
        for value in split_values:
            splits = _split(X, y, column, value, return_X = False)
            gain = parent_entropy - _compute_splits_entropy(y, splits)

            if max_gain is None or gain > max_gain:
                max_col, max_val, max_gain = column, value, gain

    return max_col, max_val, max_gain
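
# reference note (added for exposition, not in the original source): the gain
# computed above is the usual information gain,
#     gain = H(parent) - sum over children of (n_child / n_parent) * H(child)
# where H is the entropy computed by _compute_entropy below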


def _compute_entropy(split):
    """entropy score using a fixed log base of 2"""
    _, counts = np.unique(split, return_counts = True)
    p = counts / split.shape[0]
    entropy = -np.sum(p * np.log2(p))
    return entropy
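
# worked example (added for exposition): for split = np.array([0, 0, 1, 1]),
# p = [0.5, 0.5] and the entropy is -(0.5 * np.log2(0.5) + 0.5 * np.log2(0.5)),
# i.e. 1.0 bit; a pure node such as np.array([1, 1, 1]) has entropy 0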


def _find_splits(X, column):
    """
    find all possible split values (thresholds) by sorting
    the column's unique values and taking the midpoint (average)
    between every two adjacent values
    """
    X_unique = np.unique(X[:, column])
    split_values = np.empty(X_unique.shape[0] - 1)
    for i in range(1, X_unique.shape[0]):
        average = (X_unique[i - 1] + X_unique[i]) / 2
        split_values[i - 1] = average

    return split_values
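
# worked example (added for exposition): if a column's sorted unique values
# are [1.0, 2.0, 4.0], the candidate thresholds are the midpoints [1.5, 3.0];
# a column with a single unique value yields an empty array, i.e. no candidates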


def _compute_splits_entropy(y, splits):
    """compute the weighted entropy for the splits (the two child nodes)"""
    splits_entropy = 0
    for split in splits:
        splits_entropy += (split.shape[0] / y.shape[0]) * _compute_entropy(split)

    return splits_entropy
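
# worked example (added for exposition): for y = np.array([0, 0, 1, 1]) split
# into children [0, 0] and [1, 1], both children are pure, so the weighted
# entropy is 0 and the information gain equals the parent entropy of 1.0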


def _split(X, y, column, value, return_X = True):
    """split the dataset and the response column using the cutoff threshold"""
    left_mask = X[:, column] <= value
    right_mask = X[:, column] > value
    left_y, right_y = y[left_mask], y[right_mask]

    if not return_X:
        return left_y, right_y
    else:
        left_X, right_X = X[left_mask], X[right_mask]
        return left_X, right_X, left_y, right_y


__all__ = ['Tree']
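

if __name__ == '__main__':
    # minimal usage sketch (added for illustration, not part of the original
    # module): fit the tree on a synthetic two-class dataset where only the
    # first feature carries signal, then inspect the training accuracy,
    # the predicted probabilities and the learned feature importance
    rng = np.random.RandomState(1234)
    X = rng.randn(200, 3)
    y = (X[:, 0] > 0).astype(int)

    tree = Tree(max_depth = 3, min_samples_split = 10)
    tree.fit(X, y)
    pred = tree.predict(X)
    proba = tree.predict_proba(X)
    print('training accuracy: ', np.mean(pred == y))
    print('proba of the first row: ', proba[0])
    print('feature importance: ', tree.feature_importance)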