CoCalc -- decision

GitHub Repository: Ok-landscape/computational-pipeline
Path: blob/main/latex-templates/templates/machine-learning/decision_tree.tex
⁵¹ views
unlisted
1
\documentclass[a4paper, 11pt]{article}
2
\usepackage[utf8]{inputenc}
3
\usepackage[T1]{fontenc}
4
\usepackage{amsmath, amssymb}
5
\usepackage{graphicx}
6
\usepackage{booktabs}
7
\usepackage{siunitx}
8
\usepackage{float}
9
\usepackage{geometry}
10
\geometry{margin=1in}
11
\usepackage[makestderr]{pythontex}
12

13
\title{Decision Trees: Theory and Implementation}
14
\author{Machine Learning Foundations}
15
\date{\today}
16

17
\begin{document}
18
\maketitle
19

20
\begin{abstract}
21
This document presents a comprehensive analysis of decision tree algorithms for classification and regression. We explore information gain, Gini impurity, and variance reduction as splitting criteria, implement tree construction from scratch, examine pruning techniques, and analyze feature importance measures.
22
\end{abstract}
23

24
\section{Introduction}
25
Decision trees partition the feature space recursively using axis-aligned splits. For classification, a node's impurity can be measured using:
26

27
\textbf{Entropy:}
28
\begin{equation}
29
H(S) = -\sum_{c=1}^{C} p_c \log_2 p_c
30
\end{equation}
31

32
\textbf{Gini Impurity:}
33
\begin{equation}
34
G(S) = 1 - \sum_{c=1}^{C} p_c^2
35
\end{equation}
36

37
where $p_c$ is the proportion of class $c$ in set $S$.
38

39
\textbf{Information Gain:}
40
\begin{equation}
41
IG(S, A) = H(S) - \sum_{v \in \text{values}(A)} \frac{|S_v|}{|S|} H(S_v)
42
\end{equation}
43

44
\section{Computational Environment}
45
\begin{pycode}
46
import numpy as np
47
import matplotlib.pyplot as plt
48
from collections import Counter
49
import warnings
50
warnings.filterwarnings('ignore')
51

52
plt.rc('text', usetex=True)
53
plt.rc('font', family='serif')
54
np.random.seed(42)
55

56
def save_plot(filename, caption):
57
    plt.savefig(filename, bbox_inches='tight', dpi=150)
58
    print(r'\begin{figure}[H]')
59
    print(r'\centering')
60
    print(r'\includegraphics[width=0.85\textwidth]{' + filename + '}')
61
    print(r'\caption{' + caption + '}')
62
    print(r'\end{figure}')
63
    plt.close()
64
\end{pycode}
65

66
\section{Data Generation}
67
\begin{pycode}
68
# Generate classification dataset
69
def make_classification_data(n_samples, n_features, n_classes, random_state=42):
70
    np.random.seed(random_state)
71
    X = np.random.randn(n_samples, n_features)
72
    # Create class structure
73
    y = np.zeros(n_samples, dtype=int)
74
    for i in range(n_classes):
75
        mask = (i * n_samples // n_classes <= np.arange(n_samples)) & \
76
               (np.arange(n_samples) < (i+1) * n_samples // n_classes)
77
        y[mask] = i
78
        X[mask] += np.random.randn(n_features) * 0.5
79
    # Shuffle
80
    idx = np.random.permutation(n_samples)
81
    return X[idx], y[idx]
82

83
def make_moons_data(n_samples, noise=0.1, random_state=42):
84
    np.random.seed(random_state)
85
    n_samples_out = n_samples // 2
86
    n_samples_in = n_samples - n_samples_out
87
    outer_circ_x = np.cos(np.linspace(0, np.pi, n_samples_out))
88
    outer_circ_y = np.sin(np.linspace(0, np.pi, n_samples_out))
89
    inner_circ_x = 1 - np.cos(np.linspace(0, np.pi, n_samples_in))
90
    inner_circ_y = 1 - np.sin(np.linspace(0, np.pi, n_samples_in)) - 0.5
91
    X = np.vstack([np.append(outer_circ_x, inner_circ_x),
92
                   np.append(outer_circ_y, inner_circ_y)]).T
93
    y = np.hstack([np.zeros(n_samples_out), np.ones(n_samples_in)]).astype(int)
94
    X += np.random.randn(*X.shape) * noise
95
    return X, y
96

97
# Multi-class classification
98
X, y = make_classification_data(500, 10, 3)
99
# Split data
100
n_train = 400
101
X_train, X_test = X[:n_train], X[n_train:]
102
y_train, y_test = y[:n_train], y[n_train:]
103

104
n_samples = len(X)
105
n_features = X.shape[1]
106
n_classes = len(np.unique(y))
107

108
# 2D dataset for visualization
109
X_2d, y_2d = make_moons_data(300, noise=0.25)
110
X_2d_train, X_2d_test = X_2d[:240], X_2d[240:]
111
y_2d_train, y_2d_test = y_2d[:240], y_2d[240:]
112

113
# Plot 2D data
114
fig, ax = plt.subplots(figsize=(8, 6))
115
scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap='RdYlBu',
116
                     alpha=0.7, edgecolors='black', s=50)
117
ax.set_xlabel('Feature 1')
118
ax.set_ylabel('Feature 2')
119
ax.set_title('Two Moons Classification Dataset')
120
ax.grid(True, alpha=0.3)
121
plt.colorbar(scatter, label='Class')
122
save_plot('dt_data.pdf', 'Two-dimensional classification dataset for decision tree visualization.')
123
\end{pycode}
124

125
Dataset characteristics: $n = \py{n_samples}$ samples, $p = \py{n_features}$ features, $C = \py{n_classes}$ classes.
126

127
\section{Impurity Measures}
128
\begin{pycode}
129
def entropy(y):
130
    """Calculate entropy of a label array."""
131
    if len(y) == 0:
132
        return 0
133
    counts = np.bincount(y.astype(int))
134
    probs = counts[counts > 0] / len(y)
135
    return -np.sum(probs * np.log2(probs))
136

137
def gini(y):
138
    """Calculate Gini impurity of a label array."""
139
    if len(y) == 0:
140
        return 0
141
    counts = np.bincount(y.astype(int))
142
    probs = counts[counts > 0] / len(y)
143
    return 1 - np.sum(probs**2)
144

145
def misclassification(y):
146
    """Calculate misclassification error."""
147
    if len(y) == 0:
148
        return 0
149
    counts = np.bincount(y.astype(int))
150
    return 1 - np.max(counts) / len(y)
151

152
# Compare impurity measures
153
p = np.linspace(0.01, 0.99, 100)
154
entropy_vals = -p * np.log2(p) - (1-p) * np.log2(1-p)
155
gini_vals = 2 * p * (1 - p)
156
misc_vals = 1 - np.maximum(p, 1-p)
157

158
fig, ax = plt.subplots(figsize=(8, 5))
159
ax.plot(p, entropy_vals, 'b-', linewidth=2, label='Entropy')
160
ax.plot(p, gini_vals, 'r--', linewidth=2, label='Gini Impurity')
161
ax.plot(p, misc_vals, 'g:', linewidth=2, label='Misclassification')
162
ax.set_xlabel('Probability of Class 1')
163
ax.set_ylabel('Impurity Measure')
164
ax.set_title('Comparison of Impurity Measures (Binary Classification)')
165
ax.legend()
166
ax.grid(True, alpha=0.3)
167
save_plot('dt_impurity.pdf', 'Comparison of entropy, Gini impurity, and misclassification error.')
168
\end{pycode}
169

170
\section{Decision Tree Implementation}
171
\begin{pycode}
172
class DecisionTreeNode:
173
    def __init__(self, depth=0):
174
        self.depth = depth
175
        self.feature_idx = None
176
        self.threshold = None
177
        self.left = None
178
        self.right = None
179
        self.is_leaf = False
180
        self.prediction = None
181
        self.samples = 0
182
        self.impurity = 0
183

184
class DecisionTreeClassifier:
185
    def __init__(self, max_depth=5, min_samples_split=2, criterion='gini'):
186
        self.max_depth = max_depth
187
        self.min_samples_split = min_samples_split
188
        self.criterion = criterion
189
        self.root = None
190
        self.n_classes = None
191
        self.feature_importances_ = None
192

193
    def _impurity(self, y):
194
        if self.criterion == 'gini':
195
            return gini(y)
196
        else:
197
            return entropy(y)
198

199
    def _best_split(self, X, y):
200
        best_gain = -1
201
        best_feature = None
202
        best_threshold = None
203

204
        n_samples, n_features = X.shape
205
        parent_impurity = self._impurity(y)
206

207
        for feature_idx in range(n_features):
208
            thresholds = np.unique(X[:, feature_idx])
209

210
            for threshold in thresholds:
211
                left_mask = X[:, feature_idx] <= threshold
212
                right_mask = ~left_mask
213

214
                if np.sum(left_mask) < 1 or np.sum(right_mask) < 1:
215
                    continue
216

217
                left_impurity = self._impurity(y[left_mask])
218
                right_impurity = self._impurity(y[right_mask])
219

220
                n_left = np.sum(left_mask)
221
                n_right = np.sum(right_mask)
222

223
                weighted_impurity = (n_left * left_impurity +
224
                                     n_right * right_impurity) / n_samples
225
                gain = parent_impurity - weighted_impurity
226

227
                if gain > best_gain:
228
                    best_gain = gain
229
                    best_feature = feature_idx
230
                    best_threshold = threshold
231

232
        return best_feature, best_threshold, best_gain
233

234
    def _build_tree(self, X, y, depth=0):
235
        node = DecisionTreeNode(depth)
236
        node.samples = len(y)
237
        node.impurity = self._impurity(y)
238

239
        # Check stopping criteria
240
        if (depth >= self.max_depth or
241
            len(y) < self.min_samples_split or
242
            len(np.unique(y)) == 1):
243
            node.is_leaf = True
244
            node.prediction = Counter(y.astype(int)).most_common(1)[0][0]
245
            return node
246

247
        # Find best split
248
        feature_idx, threshold, gain = self._best_split(X, y)
249

250
        if feature_idx is None or gain <= 0:
251
            node.is_leaf = True
252
            node.prediction = Counter(y.astype(int)).most_common(1)[0][0]
253
            return node
254

255
        # Split data
256
        left_mask = X[:, feature_idx] <= threshold
257
        right_mask = ~left_mask
258

259
        node.feature_idx = feature_idx
260
        node.threshold = threshold
261

262
        # Update feature importance
263
        if self.feature_importances_ is not None:
264
            self.feature_importances_[feature_idx] += gain * len(y)
265

266
        # Recursively build children
267
        node.left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
268
        node.right = self._build_tree(X[right_mask], y[right_mask], depth + 1)
269

270
        return node
271

272
    def fit(self, X, y):
273
        self.n_classes = len(np.unique(y))
274
        self.feature_importances_ = np.zeros(X.shape[1])
275
        self.root = self._build_tree(X, y)
276
        # Normalize feature importances
277
        if np.sum(self.feature_importances_) > 0:
278
            self.feature_importances_ /= np.sum(self.feature_importances_)
279
        return self
280

281
    def _predict_sample(self, x, node):
282
        if node.is_leaf:
283
            return node.prediction
284
        if x[node.feature_idx] <= node.threshold:
285
            return self._predict_sample(x, node.left)
286
        else:
287
            return self._predict_sample(x, node.right)
288

289
    def predict(self, X):
290
        return np.array([self._predict_sample(x, self.root) for x in X])
291

292
    def score(self, X, y):
293
        return np.mean(self.predict(X) == y)
294

295
# Train tree on 2D data
296
tree = DecisionTreeClassifier(max_depth=5, criterion='gini')
297
tree.fit(X_2d_train, y_2d_train)
298

299
train_acc = tree.score(X_2d_train, y_2d_train)
300
test_acc = tree.score(X_2d_test, y_2d_test)
301
\end{pycode}
302

303
\section{Decision Boundary Visualization}
304
\begin{pycode}
305
def plot_decision_boundary(model, X, y, ax, title):
306
    """Plot decision boundary for 2D data."""
307
    h = 0.02
308
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
309
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
310
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
311
                         np.arange(y_min, y_max, h))
312

313
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
314
    Z = Z.reshape(xx.shape)
315

316
    ax.contourf(xx, yy, Z, alpha=0.4, cmap='RdYlBu')
317
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu',
318
               edgecolors='black', s=50, alpha=0.8)
319
    ax.set_xlabel('Feature 1')
320
    ax.set_ylabel('Feature 2')
321
    ax.set_title(title)
322

323
# Compare different depths
324
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
325
depths = [1, 3, 5, 10]
326

327
for ax, depth in zip(axes.flat, depths):
328
    tree_temp = DecisionTreeClassifier(max_depth=depth, criterion='gini')
329
    tree_temp.fit(X_2d_train, y_2d_train)
330
    acc = tree_temp.score(X_2d_test, y_2d_test)
331
    plot_decision_boundary(tree_temp, X_2d, y_2d, ax,
332
                          f'Depth = {depth}, Accuracy = {acc:.2f}')
333

334
plt.tight_layout()
335
save_plot('dt_boundaries.pdf', 'Decision boundaries for different tree depths showing overfitting.')
336
\end{pycode}
337

338
Training accuracy: \py{f"{train_acc:.3f}"}, Test accuracy: \py{f"{test_acc:.3f}"}.
339

340
\section{Pruning Analysis}
341
\begin{pycode}
342
# Analyze effect of max_depth
343
depths = range(1, 16)
344
train_scores = []
345
test_scores = []
346

347
for depth in depths:
348
    tree_temp = DecisionTreeClassifier(max_depth=depth, criterion='gini')
349
    tree_temp.fit(X_train, y_train)
350
    train_scores.append(tree_temp.score(X_train, y_train))
351
    test_scores.append(tree_temp.score(X_test, y_test))
352

353
# Find optimal depth
354
best_depth = list(depths)[np.argmax(test_scores)]
355
best_test_acc = max(test_scores)
356

357
fig, ax = plt.subplots(figsize=(8, 5))
358
ax.plot(list(depths), train_scores, 'b-o', label='Training', linewidth=2, markersize=6)
359
ax.plot(list(depths), test_scores, 'r-s', label='Test', linewidth=2, markersize=6)
360
ax.axvline(best_depth, color='gray', linestyle='--', alpha=0.7,
361
           label=f'Best depth = {best_depth}')
362
ax.set_xlabel('Maximum Depth')
363
ax.set_ylabel('Accuracy')
364
ax.set_title('Effect of Tree Depth on Performance')
365
ax.legend()
366
ax.grid(True, alpha=0.3)
367
save_plot('dt_pruning.pdf', 'Training and test accuracy as function of tree depth.')
368
\end{pycode}
369

370
Optimal depth: \py{best_depth} with test accuracy \py{f"{best_test_acc:.3f}"}.
371

372
\section{Feature Importance}
373
\begin{pycode}
374
# Train final model with optimal depth
375
final_tree = DecisionTreeClassifier(max_depth=best_depth, criterion='gini')
376
final_tree.fit(X_train, y_train)
377

378
# Get feature importances
379
importances = final_tree.feature_importances_
380
indices = np.argsort(importances)[::-1]
381

382
fig, ax = plt.subplots(figsize=(10, 5))
383
ax.bar(range(len(importances)), importances[indices],
384
       color='steelblue', alpha=0.8)
385
ax.set_xticks(range(len(importances)))
386
ax.set_xticklabels([f'F{i}' for i in indices])
387
ax.set_xlabel('Feature')
388
ax.set_ylabel('Importance')
389
ax.set_title('Feature Importance (Information Gain)')
390
ax.grid(True, alpha=0.3, axis='y')
391
save_plot('dt_importance.pdf', 'Feature importance scores based on total information gain.')
392

393
# Top 3 features
394
top_features = indices[:3]
395
top_importances = importances[top_features]
396
\end{pycode}
397

398
Top 3 features: \py{f"F{top_features[0]}"} (\py{f"{top_importances[0]:.3f}"}), \py{f"F{top_features[1]}"} (\py{f"{top_importances[1]:.3f}"}), \py{f"F{top_features[2]}"} (\py{f"{top_importances[2]:.3f}"}).
399

400
\section{Gini vs Entropy Comparison}
401
\begin{pycode}
402
# Compare splitting criteria
403
gini_scores = []
404
entropy_scores = []
405

406
for depth in depths:
407
    tree_gini = DecisionTreeClassifier(max_depth=depth, criterion='gini')
408
    tree_gini.fit(X_train, y_train)
409
    gini_scores.append(tree_gini.score(X_test, y_test))
410

411
    tree_entropy = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
412
    tree_entropy.fit(X_train, y_train)
413
    entropy_scores.append(tree_entropy.score(X_test, y_test))
414

415
fig, ax = plt.subplots(figsize=(8, 5))
416
ax.plot(list(depths), gini_scores, 'b-o', label='Gini', linewidth=2, markersize=6)
417
ax.plot(list(depths), entropy_scores, 'r-s', label='Entropy', linewidth=2, markersize=6)
418
ax.set_xlabel('Maximum Depth')
419
ax.set_ylabel('Test Accuracy')
420
ax.set_title('Comparison of Splitting Criteria')
421
ax.legend()
422
ax.grid(True, alpha=0.3)
423
save_plot('dt_criteria.pdf', 'Test accuracy comparison between Gini impurity and entropy.')
424

425
# Best scores for each criterion
426
best_gini = max(gini_scores)
427
best_entropy = max(entropy_scores)
428
\end{pycode}
429

430
\section{Results Summary}
431
\begin{table}[H]
432
\centering
433
\caption{Decision Tree Performance Summary}
434
\begin{tabular}{lr}
435
\toprule
436
Metric & Value \\
437
\midrule
438
Dataset Size & \py{n_samples} \\
439
Number of Features & \py{n_features} \\
440
Number of Classes & \py{n_classes} \\
441
Optimal Tree Depth & \py{best_depth} \\
442
Best Test Accuracy (Gini) & \py{f"{best_gini:.3f}"} \\
443
Best Test Accuracy (Entropy) & \py{f"{best_entropy:.3f}"} \\
444
\bottomrule
445
\end{tabular}
446
\end{table}
447

448
\begin{pycode}
449
print(r'\begin{table}[H]')
450
print(r'\centering')
451
print(r'\caption{Feature Importance Ranking}')
452
print(r'\begin{tabular}{ccc}')
453
print(r'\toprule')
454
print(r'Rank & Feature & Importance \\')
455
print(r'\midrule')
456
for i in range(min(5, len(indices))):
457
    print(f'{i+1} & F{indices[i]} & {importances[indices[i]]:.4f} \\\\')
458
print(r'\bottomrule')
459
print(r'\end{tabular}')
460
print(r'\end{table}')
461
\end{pycode}
462

463
\section{Conclusion}
464
This analysis demonstrated:
465
\begin{itemize}
466
    \item Decision tree construction using Gini impurity and entropy
467
    \item The bias-variance tradeoff controlled by tree depth
468
    \item Feature importance computation via information gain
469
    \item Hyperparameter tuning (max depth, min samples split)
470
    \item Visualization of decision boundaries in 2D
471
\end{itemize}
472

473
The optimal tree depth of \py{best_depth} balances model complexity with generalization, achieving \py{f"{best_test_acc:.1%}"} test accuracy.
474

475
\end{document}
476

477
Product

Resources

Company