Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Ok-landscape
GitHub Repository: Ok-landscape/computational-pipeline
Path: blob/main/latex-templates/templates/machine-learning/decision_tree.tex
51 views
unlisted
1
\documentclass[a4paper, 11pt]{article}
2
\usepackage[utf8]{inputenc}
3
\usepackage[T1]{fontenc}
4
\usepackage{amsmath, amssymb}
5
\usepackage{graphicx}
6
\usepackage{booktabs}
7
\usepackage{siunitx}
8
\usepackage{float}
9
\usepackage{geometry}
10
\geometry{margin=1in}
11
\usepackage[makestderr]{pythontex}
12
13
\title{Decision Trees: Theory and Implementation}
14
\author{Machine Learning Foundations}
15
\date{\today}
16
17
\begin{document}
18
\maketitle
19
20
\begin{abstract}
21
This document presents a comprehensive analysis of decision tree algorithms for classification and regression. We explore information gain, Gini impurity, and variance reduction as splitting criteria, implement tree construction from scratch, examine pruning techniques, and analyze feature importance measures.
22
\end{abstract}
23
24
\section{Introduction}
25
Decision trees partition the feature space recursively using axis-aligned splits. For classification, a node's impurity can be measured using:
26
27
\textbf{Entropy:}
28
\begin{equation}
29
H(S) = -\sum_{c=1}^{C} p_c \log_2 p_c
30
\end{equation}
31
32
\textbf{Gini Impurity:}
33
\begin{equation}
34
G(S) = 1 - \sum_{c=1}^{C} p_c^2
35
\end{equation}
36
37
where $p_c$ is the proportion of class $c$ in set $S$.
38
39
\textbf{Information Gain:}
40
\begin{equation}
41
IG(S, A) = H(S) - \sum_{v \in \text{values}(A)} \frac{|S_v|}{|S|} H(S_v)
42
\end{equation}
43
44
\section{Computational Environment}
45
\begin{pycode}
46
import numpy as np
47
import matplotlib.pyplot as plt
48
from collections import Counter
49
import warnings
50
warnings.filterwarnings('ignore')
51
52
plt.rc('text', usetex=True)
53
plt.rc('font', family='serif')
54
np.random.seed(42)
55
56
def save_plot(filename, caption):
57
plt.savefig(filename, bbox_inches='tight', dpi=150)
58
print(r'\begin{figure}[H]')
59
print(r'\centering')
60
print(r'\includegraphics[width=0.85\textwidth]{' + filename + '}')
61
print(r'\caption{' + caption + '}')
62
print(r'\end{figure}')
63
plt.close()
64
\end{pycode}
65
66
\section{Data Generation}
67
\begin{pycode}
68
# Generate classification dataset
69
def make_classification_data(n_samples, n_features, n_classes, random_state=42):
70
np.random.seed(random_state)
71
X = np.random.randn(n_samples, n_features)
72
# Create class structure
73
y = np.zeros(n_samples, dtype=int)
74
for i in range(n_classes):
75
mask = (i * n_samples // n_classes <= np.arange(n_samples)) & \
76
(np.arange(n_samples) < (i+1) * n_samples // n_classes)
77
y[mask] = i
78
X[mask] += np.random.randn(n_features) * 0.5
79
# Shuffle
80
idx = np.random.permutation(n_samples)
81
return X[idx], y[idx]
82
83
def make_moons_data(n_samples, noise=0.1, random_state=42):
84
np.random.seed(random_state)
85
n_samples_out = n_samples // 2
86
n_samples_in = n_samples - n_samples_out
87
outer_circ_x = np.cos(np.linspace(0, np.pi, n_samples_out))
88
outer_circ_y = np.sin(np.linspace(0, np.pi, n_samples_out))
89
inner_circ_x = 1 - np.cos(np.linspace(0, np.pi, n_samples_in))
90
inner_circ_y = 1 - np.sin(np.linspace(0, np.pi, n_samples_in)) - 0.5
91
X = np.vstack([np.append(outer_circ_x, inner_circ_x),
92
np.append(outer_circ_y, inner_circ_y)]).T
93
y = np.hstack([np.zeros(n_samples_out), np.ones(n_samples_in)]).astype(int)
94
X += np.random.randn(*X.shape) * noise
95
return X, y
96
97
# Multi-class classification
98
X, y = make_classification_data(500, 10, 3)
99
# Split data
100
n_train = 400
101
X_train, X_test = X[:n_train], X[n_train:]
102
y_train, y_test = y[:n_train], y[n_train:]
103
104
n_samples = len(X)
105
n_features = X.shape[1]
106
n_classes = len(np.unique(y))
107
108
# 2D dataset for visualization
109
X_2d, y_2d = make_moons_data(300, noise=0.25)
110
X_2d_train, X_2d_test = X_2d[:240], X_2d[240:]
111
y_2d_train, y_2d_test = y_2d[:240], y_2d[240:]
112
113
# Plot 2D data
114
fig, ax = plt.subplots(figsize=(8, 6))
115
scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap='RdYlBu',
116
alpha=0.7, edgecolors='black', s=50)
117
ax.set_xlabel('Feature 1')
118
ax.set_ylabel('Feature 2')
119
ax.set_title('Two Moons Classification Dataset')
120
ax.grid(True, alpha=0.3)
121
plt.colorbar(scatter, label='Class')
122
save_plot('dt_data.pdf', 'Two-dimensional classification dataset for decision tree visualization.')
123
\end{pycode}
124
125
Dataset characteristics: $n = \py{n_samples}$ samples, $p = \py{n_features}$ features, $C = \py{n_classes}$ classes.
126
127
\section{Impurity Measures}
128
\begin{pycode}
129
def entropy(y):
130
"""Calculate entropy of a label array."""
131
if len(y) == 0:
132
return 0
133
counts = np.bincount(y.astype(int))
134
probs = counts[counts > 0] / len(y)
135
return -np.sum(probs * np.log2(probs))
136
137
def gini(y):
138
"""Calculate Gini impurity of a label array."""
139
if len(y) == 0:
140
return 0
141
counts = np.bincount(y.astype(int))
142
probs = counts[counts > 0] / len(y)
143
return 1 - np.sum(probs**2)
144
145
def misclassification(y):
146
"""Calculate misclassification error."""
147
if len(y) == 0:
148
return 0
149
counts = np.bincount(y.astype(int))
150
return 1 - np.max(counts) / len(y)
151
152
# Compare impurity measures
153
p = np.linspace(0.01, 0.99, 100)
154
entropy_vals = -p * np.log2(p) - (1-p) * np.log2(1-p)
155
gini_vals = 2 * p * (1 - p)
156
misc_vals = 1 - np.maximum(p, 1-p)
157
158
fig, ax = plt.subplots(figsize=(8, 5))
159
ax.plot(p, entropy_vals, 'b-', linewidth=2, label='Entropy')
160
ax.plot(p, gini_vals, 'r--', linewidth=2, label='Gini Impurity')
161
ax.plot(p, misc_vals, 'g:', linewidth=2, label='Misclassification')
162
ax.set_xlabel('Probability of Class 1')
163
ax.set_ylabel('Impurity Measure')
164
ax.set_title('Comparison of Impurity Measures (Binary Classification)')
165
ax.legend()
166
ax.grid(True, alpha=0.3)
167
save_plot('dt_impurity.pdf', 'Comparison of entropy, Gini impurity, and misclassification error.')
168
\end{pycode}
169
170
\section{Decision Tree Implementation}
171
\begin{pycode}
172
class DecisionTreeNode:
173
def __init__(self, depth=0):
174
self.depth = depth
175
self.feature_idx = None
176
self.threshold = None
177
self.left = None
178
self.right = None
179
self.is_leaf = False
180
self.prediction = None
181
self.samples = 0
182
self.impurity = 0
183
184
class DecisionTreeClassifier:
185
def __init__(self, max_depth=5, min_samples_split=2, criterion='gini'):
186
self.max_depth = max_depth
187
self.min_samples_split = min_samples_split
188
self.criterion = criterion
189
self.root = None
190
self.n_classes = None
191
self.feature_importances_ = None
192
193
def _impurity(self, y):
194
if self.criterion == 'gini':
195
return gini(y)
196
else:
197
return entropy(y)
198
199
def _best_split(self, X, y):
200
best_gain = -1
201
best_feature = None
202
best_threshold = None
203
204
n_samples, n_features = X.shape
205
parent_impurity = self._impurity(y)
206
207
for feature_idx in range(n_features):
208
thresholds = np.unique(X[:, feature_idx])
209
210
for threshold in thresholds:
211
left_mask = X[:, feature_idx] <= threshold
212
right_mask = ~left_mask
213
214
if np.sum(left_mask) < 1 or np.sum(right_mask) < 1:
215
continue
216
217
left_impurity = self._impurity(y[left_mask])
218
right_impurity = self._impurity(y[right_mask])
219
220
n_left = np.sum(left_mask)
221
n_right = np.sum(right_mask)
222
223
weighted_impurity = (n_left * left_impurity +
224
n_right * right_impurity) / n_samples
225
gain = parent_impurity - weighted_impurity
226
227
if gain > best_gain:
228
best_gain = gain
229
best_feature = feature_idx
230
best_threshold = threshold
231
232
return best_feature, best_threshold, best_gain
233
234
def _build_tree(self, X, y, depth=0):
235
node = DecisionTreeNode(depth)
236
node.samples = len(y)
237
node.impurity = self._impurity(y)
238
239
# Check stopping criteria
240
if (depth >= self.max_depth or
241
len(y) < self.min_samples_split or
242
len(np.unique(y)) == 1):
243
node.is_leaf = True
244
node.prediction = Counter(y.astype(int)).most_common(1)[0][0]
245
return node
246
247
# Find best split
248
feature_idx, threshold, gain = self._best_split(X, y)
249
250
if feature_idx is None or gain <= 0:
251
node.is_leaf = True
252
node.prediction = Counter(y.astype(int)).most_common(1)[0][0]
253
return node
254
255
# Split data
256
left_mask = X[:, feature_idx] <= threshold
257
right_mask = ~left_mask
258
259
node.feature_idx = feature_idx
260
node.threshold = threshold
261
262
# Update feature importance
263
if self.feature_importances_ is not None:
264
self.feature_importances_[feature_idx] += gain * len(y)
265
266
# Recursively build children
267
node.left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
268
node.right = self._build_tree(X[right_mask], y[right_mask], depth + 1)
269
270
return node
271
272
def fit(self, X, y):
273
self.n_classes = len(np.unique(y))
274
self.feature_importances_ = np.zeros(X.shape[1])
275
self.root = self._build_tree(X, y)
276
# Normalize feature importances
277
if np.sum(self.feature_importances_) > 0:
278
self.feature_importances_ /= np.sum(self.feature_importances_)
279
return self
280
281
def _predict_sample(self, x, node):
282
if node.is_leaf:
283
return node.prediction
284
if x[node.feature_idx] <= node.threshold:
285
return self._predict_sample(x, node.left)
286
else:
287
return self._predict_sample(x, node.right)
288
289
def predict(self, X):
290
return np.array([self._predict_sample(x, self.root) for x in X])
291
292
def score(self, X, y):
293
return np.mean(self.predict(X) == y)
294
295
# Train tree on 2D data
296
tree = DecisionTreeClassifier(max_depth=5, criterion='gini')
297
tree.fit(X_2d_train, y_2d_train)
298
299
train_acc = tree.score(X_2d_train, y_2d_train)
300
test_acc = tree.score(X_2d_test, y_2d_test)
301
\end{pycode}
302
303
\section{Decision Boundary Visualization}
304
\begin{pycode}
305
def plot_decision_boundary(model, X, y, ax, title):
306
"""Plot decision boundary for 2D data."""
307
h = 0.02
308
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
309
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
310
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
311
np.arange(y_min, y_max, h))
312
313
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
314
Z = Z.reshape(xx.shape)
315
316
ax.contourf(xx, yy, Z, alpha=0.4, cmap='RdYlBu')
317
ax.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu',
318
edgecolors='black', s=50, alpha=0.8)
319
ax.set_xlabel('Feature 1')
320
ax.set_ylabel('Feature 2')
321
ax.set_title(title)
322
323
# Compare different depths
324
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
325
depths = [1, 3, 5, 10]
326
327
for ax, depth in zip(axes.flat, depths):
328
tree_temp = DecisionTreeClassifier(max_depth=depth, criterion='gini')
329
tree_temp.fit(X_2d_train, y_2d_train)
330
acc = tree_temp.score(X_2d_test, y_2d_test)
331
plot_decision_boundary(tree_temp, X_2d, y_2d, ax,
332
f'Depth = {depth}, Accuracy = {acc:.2f}')
333
334
plt.tight_layout()
335
save_plot('dt_boundaries.pdf', 'Decision boundaries for different tree depths showing overfitting.')
336
\end{pycode}
337
338
Training accuracy: \py{f"{train_acc:.3f}"}, Test accuracy: \py{f"{test_acc:.3f}"}.
339
340
\section{Pruning Analysis}
341
\begin{pycode}
342
# Analyze effect of max_depth
343
depths = range(1, 16)
344
train_scores = []
345
test_scores = []
346
347
for depth in depths:
348
tree_temp = DecisionTreeClassifier(max_depth=depth, criterion='gini')
349
tree_temp.fit(X_train, y_train)
350
train_scores.append(tree_temp.score(X_train, y_train))
351
test_scores.append(tree_temp.score(X_test, y_test))
352
353
# Find optimal depth
354
best_depth = list(depths)[np.argmax(test_scores)]
355
best_test_acc = max(test_scores)
356
357
fig, ax = plt.subplots(figsize=(8, 5))
358
ax.plot(list(depths), train_scores, 'b-o', label='Training', linewidth=2, markersize=6)
359
ax.plot(list(depths), test_scores, 'r-s', label='Test', linewidth=2, markersize=6)
360
ax.axvline(best_depth, color='gray', linestyle='--', alpha=0.7,
361
label=f'Best depth = {best_depth}')
362
ax.set_xlabel('Maximum Depth')
363
ax.set_ylabel('Accuracy')
364
ax.set_title('Effect of Tree Depth on Performance')
365
ax.legend()
366
ax.grid(True, alpha=0.3)
367
save_plot('dt_pruning.pdf', 'Training and test accuracy as function of tree depth.')
368
\end{pycode}
369
370
Optimal depth: \py{best_depth} with test accuracy \py{f"{best_test_acc:.3f}"}.
371
372
\section{Feature Importance}
373
\begin{pycode}
374
# Train final model with optimal depth
375
final_tree = DecisionTreeClassifier(max_depth=best_depth, criterion='gini')
376
final_tree.fit(X_train, y_train)
377
378
# Get feature importances
379
importances = final_tree.feature_importances_
380
indices = np.argsort(importances)[::-1]
381
382
fig, ax = plt.subplots(figsize=(10, 5))
383
ax.bar(range(len(importances)), importances[indices],
384
color='steelblue', alpha=0.8)
385
ax.set_xticks(range(len(importances)))
386
ax.set_xticklabels([f'F{i}' for i in indices])
387
ax.set_xlabel('Feature')
388
ax.set_ylabel('Importance')
389
ax.set_title('Feature Importance (Information Gain)')
390
ax.grid(True, alpha=0.3, axis='y')
391
save_plot('dt_importance.pdf', 'Feature importance scores based on total information gain.')
392
393
# Top 3 features
394
top_features = indices[:3]
395
top_importances = importances[top_features]
396
\end{pycode}
397
398
Top 3 features: \py{f"F{top_features[0]}"} (\py{f"{top_importances[0]:.3f}"}), \py{f"F{top_features[1]}"} (\py{f"{top_importances[1]:.3f}"}), \py{f"F{top_features[2]}"} (\py{f"{top_importances[2]:.3f}"}).
399
400
\section{Gini vs Entropy Comparison}
401
\begin{pycode}
402
# Compare splitting criteria
403
gini_scores = []
404
entropy_scores = []
405
406
for depth in depths:
407
tree_gini = DecisionTreeClassifier(max_depth=depth, criterion='gini')
408
tree_gini.fit(X_train, y_train)
409
gini_scores.append(tree_gini.score(X_test, y_test))
410
411
tree_entropy = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
412
tree_entropy.fit(X_train, y_train)
413
entropy_scores.append(tree_entropy.score(X_test, y_test))
414
415
fig, ax = plt.subplots(figsize=(8, 5))
416
ax.plot(list(depths), gini_scores, 'b-o', label='Gini', linewidth=2, markersize=6)
417
ax.plot(list(depths), entropy_scores, 'r-s', label='Entropy', linewidth=2, markersize=6)
418
ax.set_xlabel('Maximum Depth')
419
ax.set_ylabel('Test Accuracy')
420
ax.set_title('Comparison of Splitting Criteria')
421
ax.legend()
422
ax.grid(True, alpha=0.3)
423
save_plot('dt_criteria.pdf', 'Test accuracy comparison between Gini impurity and entropy.')
424
425
# Best scores for each criterion
426
best_gini = max(gini_scores)
427
best_entropy = max(entropy_scores)
428
\end{pycode}
429
430
\section{Results Summary}
431
\begin{table}[H]
432
\centering
433
\caption{Decision Tree Performance Summary}
434
\begin{tabular}{lr}
435
\toprule
436
Metric & Value \\
437
\midrule
438
Dataset Size & \py{n_samples} \\
439
Number of Features & \py{n_features} \\
440
Number of Classes & \py{n_classes} \\
441
Optimal Tree Depth & \py{best_depth} \\
442
Best Test Accuracy (Gini) & \py{f"{best_gini:.3f}"} \\
443
Best Test Accuracy (Entropy) & \py{f"{best_entropy:.3f}"} \\
444
\bottomrule
445
\end{tabular}
446
\end{table}
447
448
\begin{pycode}
449
print(r'\begin{table}[H]')
450
print(r'\centering')
451
print(r'\caption{Feature Importance Ranking}')
452
print(r'\begin{tabular}{ccc}')
453
print(r'\toprule')
454
print(r'Rank & Feature & Importance \\')
455
print(r'\midrule')
456
for i in range(min(5, len(indices))):
457
print(f'{i+1} & F{indices[i]} & {importances[indices[i]]:.4f} \\\\')
458
print(r'\bottomrule')
459
print(r'\end{tabular}')
460
print(r'\end{table}')
461
\end{pycode}
462
463
\section{Conclusion}
464
This analysis demonstrated:
465
\begin{itemize}
466
\item Decision tree construction using Gini impurity and entropy
467
\item The bias-variance tradeoff controlled by tree depth
468
\item Feature importance computation via information gain
469
\item Hyperparameter tuning (max depth, min samples split)
470
\item Visualization of decision boundaries in 2D
471
\end{itemize}
472
473
The optimal tree depth of \py{best_depth} balances model complexity with generalization, achieving \py{f"{best_test_acc:.1%}"} test accuracy.
474
475
\end{document}
476
477