Path: blob/main/latex-templates/templates/machine-learning/decision_tree.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage{float}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage[makestderr]{pythontex}

\title{Decision Trees: Theory and Implementation}
\author{Machine Learning Foundations}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
This document presents an analysis of decision tree algorithms for classification. We explore information gain and Gini impurity as splitting criteria, implement tree construction from scratch, examine depth-based pruning, and analyze feature importance measures.
\end{abstract}

\section{Introduction}
Decision trees partition the feature space recursively using axis-aligned splits. For classification, a node's impurity can be measured using:

\textbf{Entropy:}
\begin{equation}
H(S) = -\sum_{c=1}^{C} p_c \log_2 p_c
\end{equation}

\textbf{Gini Impurity:}
\begin{equation}
G(S) = 1 - \sum_{c=1}^{C} p_c^2
\end{equation}

where $p_c$ is the proportion of class $c$ in set $S$.

\textbf{Information Gain:}
\begin{equation}
IG(S, A) = H(S) - \sum_{v \in \text{values}(A)} \frac{|S_v|}{|S|} H(S_v)
\end{equation}

where $S_v$ is the subset of $S$ taking value $v$ for attribute $A$; for continuous features, the implementation below uses binary threshold splits of the form $x_j \le t$.
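
As a small worked example of these quantities (illustrative counts only, not drawn from the dataset generated below), consider a parent node with $10$ samples, $6$ of class~1 and $4$ of class~0, split into a left child containing $4$ class-1 samples and a right child containing the remaining $2$ class-1 and $4$ class-0 samples:
\begin{align}
H(S) &= -0.6 \log_2 0.6 - 0.4 \log_2 0.4 \approx 0.971, \\
H(S_L) &= 0, \qquad H(S_R) = -\tfrac{1}{3}\log_2\tfrac{1}{3} - \tfrac{2}{3}\log_2\tfrac{2}{3} \approx 0.918, \\
IG &= 0.971 - \left(\tfrac{4}{10} \cdot 0 + \tfrac{6}{10} \cdot 0.918\right) \approx 0.420.
\end{align}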

\section{Computational Environment}
\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[H]')
    print(r'\centering')
    print(r'\includegraphics[width=0.85\textwidth]{' + filename + '}')
    print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{Data Generation}
\begin{pycode}
# Generate classification dataset
def make_classification_data(n_samples, n_features, n_classes, random_state=42):
    np.random.seed(random_state)
    X = np.random.randn(n_samples, n_features)
    # Create class structure
    y = np.zeros(n_samples, dtype=int)
    for i in range(n_classes):
        mask = (i * n_samples // n_classes <= np.arange(n_samples)) & \
               (np.arange(n_samples) < (i + 1) * n_samples // n_classes)
        y[mask] = i
        X[mask] += np.random.randn(n_features) * 0.5
    # Shuffle
    idx = np.random.permutation(n_samples)
    return X[idx], y[idx]

def make_moons_data(n_samples, noise=0.1, random_state=42):
    np.random.seed(random_state)
    n_samples_out = n_samples // 2
    n_samples_in = n_samples - n_samples_out
    outer_circ_x = np.cos(np.linspace(0, np.pi, n_samples_out))
    outer_circ_y = np.sin(np.linspace(0, np.pi, n_samples_out))
    inner_circ_x = 1 - np.cos(np.linspace(0, np.pi, n_samples_in))
    inner_circ_y = 1 - np.sin(np.linspace(0, np.pi, n_samples_in)) - 0.5
    X = np.vstack([np.append(outer_circ_x, inner_circ_x),
                   np.append(outer_circ_y, inner_circ_y)]).T
    y = np.hstack([np.zeros(n_samples_out), np.ones(n_samples_in)]).astype(int)
    X += np.random.randn(*X.shape) * noise
    return X, y

# Multi-class classification
X, y = make_classification_data(500, 10, 3)
# Split data
n_train = 400
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

n_samples = len(X)
n_features = X.shape[1]
n_classes = len(np.unique(y))

# 2D dataset for visualization
X_2d, y_2d = make_moons_data(300, noise=0.25)
X_2d_train, X_2d_test = X_2d[:240], X_2d[240:]
y_2d_train, y_2d_test = y_2d[:240], y_2d[240:]

# Plot 2D data
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap='RdYlBu',
                     alpha=0.7, edgecolors='black', s=50)
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('Two Moons Classification Dataset')
ax.grid(True, alpha=0.3)
plt.colorbar(scatter, label='Class')
save_plot('dt_data.pdf', 'Two-dimensional classification dataset for decision tree visualization.')
\end{pycode}

Dataset characteristics: $n = \py{n_samples}$ samples, $p = \py{n_features}$ features, $C = \py{n_classes}$ classes.

\section{Impurity Measures}
\begin{pycode}
def entropy(y):
    """Calculate entropy of a label array."""
    if len(y) == 0:
        return 0
    counts = np.bincount(y.astype(int))
    probs = counts[counts > 0] / len(y)
    return -np.sum(probs * np.log2(probs))

def gini(y):
    """Calculate Gini impurity of a label array."""
    if len(y) == 0:
        return 0
    counts = np.bincount(y.astype(int))
    probs = counts[counts > 0] / len(y)
    return 1 - np.sum(probs**2)

def misclassification(y):
    """Calculate misclassification error."""
    if len(y) == 0:
        return 0
    counts = np.bincount(y.astype(int))
    return 1 - np.max(counts) / len(y)

# Compare impurity measures
p = np.linspace(0.01, 0.99, 100)
entropy_vals = -p * np.log2(p) - (1 - p) * np.log2(1 - p)
gini_vals = 2 * p * (1 - p)
misc_vals = 1 - np.maximum(p, 1 - p)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(p, entropy_vals, 'b-', linewidth=2, label='Entropy')
ax.plot(p, gini_vals, 'r--', linewidth=2, label='Gini Impurity')
ax.plot(p, misc_vals, 'g:', linewidth=2, label='Misclassification')
ax.set_xlabel('Probability of Class 1')
ax.set_ylabel('Impurity Measure')
ax.set_title('Comparison of Impurity Measures (Binary Classification)')
ax.legend()
ax.grid(True, alpha=0.3)
save_plot('dt_impurity.pdf', 'Comparison of entropy, Gini impurity, and misclassification error.')
\end{pycode}
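
These functions can be checked against the worked example from the introduction; the short block below (an optional sanity check, reusing the \texttt{entropy} function defined above) recomputes the same split numerically.

\begin{pycode}
# Optional sanity check: reproduce the worked split from the introduction.
y_parent = np.array([1] * 6 + [0] * 4)   # 6 samples of class 1, 4 of class 0
y_left = np.array([1] * 4)               # pure left child
y_right = np.array([1] * 2 + [0] * 4)    # mixed right child
h_parent = entropy(y_parent)
ig_check = h_parent - (len(y_left) * entropy(y_left)
                       + len(y_right) * entropy(y_right)) / len(y_parent)
\end{pycode}

The computed values, $H(S) = \py{f"{h_parent:.3f}"}$ and $IG = \py{f"{ig_check:.3f}"}$, agree with the hand calculation.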

\section{Decision Tree Implementation}
\begin{pycode}
class DecisionTreeNode:
    def __init__(self, depth=0):
        self.depth = depth
        self.feature_idx = None
        self.threshold = None
        self.left = None
        self.right = None
        self.is_leaf = False
        self.prediction = None
        self.samples = 0
        self.impurity = 0

class DecisionTreeClassifier:
    def __init__(self, max_depth=5, min_samples_split=2, criterion='gini'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.root = None
        self.n_classes = None
        self.feature_importances_ = None

    def _impurity(self, y):
        if self.criterion == 'gini':
            return gini(y)
        else:
            return entropy(y)

    def _best_split(self, X, y):
        best_gain = -1
        best_feature = None
        best_threshold = None

        n_samples, n_features = X.shape
        parent_impurity = self._impurity(y)

        for feature_idx in range(n_features):
            thresholds = np.unique(X[:, feature_idx])

            for threshold in thresholds:
                left_mask = X[:, feature_idx] <= threshold
                right_mask = ~left_mask

                if np.sum(left_mask) < 1 or np.sum(right_mask) < 1:
                    continue

                left_impurity = self._impurity(y[left_mask])
                right_impurity = self._impurity(y[right_mask])

                n_left = np.sum(left_mask)
                n_right = np.sum(right_mask)

                weighted_impurity = (n_left * left_impurity +
                                     n_right * right_impurity) / n_samples
                gain = parent_impurity - weighted_impurity

                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_idx
                    best_threshold = threshold

        return best_feature, best_threshold, best_gain

    def _build_tree(self, X, y, depth=0):
        node = DecisionTreeNode(depth)
        node.samples = len(y)
        node.impurity = self._impurity(y)

        # Check stopping criteria
        if (depth >= self.max_depth or
                len(y) < self.min_samples_split or
                len(np.unique(y)) == 1):
            node.is_leaf = True
            node.prediction = Counter(y.astype(int)).most_common(1)[0][0]
            return node

        # Find best split
        feature_idx, threshold, gain = self._best_split(X, y)

        if feature_idx is None or gain <= 0:
            node.is_leaf = True
            node.prediction = Counter(y.astype(int)).most_common(1)[0][0]
            return node

        # Split data
        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask

        node.feature_idx = feature_idx
        node.threshold = threshold

        # Update feature importance
        if self.feature_importances_ is not None:
            self.feature_importances_[feature_idx] += gain * len(y)

        # Recursively build children
        node.left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        node.right = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return node

    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.feature_importances_ = np.zeros(X.shape[1])
        self.root = self._build_tree(X, y)
        # Normalize feature importances
        if np.sum(self.feature_importances_) > 0:
            self.feature_importances_ /= np.sum(self.feature_importances_)
        return self

    def _predict_sample(self, x, node):
        if node.is_leaf:
            return node.prediction
        if x[node.feature_idx] <= node.threshold:
            return self._predict_sample(x, node.left)
        else:
            return self._predict_sample(x, node.right)

    def predict(self, X):
        return np.array([self._predict_sample(x, self.root) for x in X])

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

# Train tree on 2D data
tree = DecisionTreeClassifier(max_depth=5, criterion='gini')
tree.fit(X_2d_train, y_2d_train)

train_acc = tree.score(X_2d_train, y_2d_train)
test_acc = tree.score(X_2d_test, y_2d_test)
\end{pycode}
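
To make the fitted model easier to inspect, the following small helper (an optional sketch added here; \texttt{tree\_stats} is not part of the implementation above) traverses the node structure and reports tree size.

\begin{pycode}
# Optional helper sketch: count leaves and measure the realised depth
# of a fitted tree by recursing over the DecisionTreeNode structure.
def tree_stats(node):
    """Return (number of leaves, maximum depth) of the subtree at node."""
    if node.is_leaf:
        return 1, node.depth
    left_leaves, left_depth = tree_stats(node.left)
    right_leaves, right_depth = tree_stats(node.right)
    return left_leaves + right_leaves, max(left_depth, right_depth)

n_leaves, realised_depth = tree_stats(tree.root)
\end{pycode}

The depth-limited tree fitted above ends up with \py{n_leaves} leaves and an actual depth of \py{realised_depth}.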

\section{Decision Boundary Visualization}
\begin{pycode}
def plot_decision_boundary(model, X, y, ax, title):
    """Plot decision boundary for 2D data."""
    h = 0.02
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.4, cmap='RdYlBu')
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu',
               edgecolors='black', s=50, alpha=0.8)
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.set_title(title)

# Compare different depths
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
depths = [1, 3, 5, 10]

for ax, depth in zip(axes.flat, depths):
    tree_temp = DecisionTreeClassifier(max_depth=depth, criterion='gini')
    tree_temp.fit(X_2d_train, y_2d_train)
    acc = tree_temp.score(X_2d_test, y_2d_test)
    plot_decision_boundary(tree_temp, X_2d, y_2d, ax,
                           f'Depth = {depth}, Accuracy = {acc:.2f}')

plt.tight_layout()
save_plot('dt_boundaries.pdf', 'Decision boundaries for different tree depths showing overfitting.')
\end{pycode}

Training accuracy: \py{f"{train_acc:.3f}"}, test accuracy: \py{f"{test_acc:.3f}"}.

\section{Pruning Analysis}
Limiting the maximum depth acts as a pre-pruning control on model complexity; we compare training and test accuracy on the multi-class dataset as the depth limit grows.

\begin{pycode}
# Analyze effect of max_depth
depths = range(1, 16)
train_scores = []
test_scores = []

for depth in depths:
    tree_temp = DecisionTreeClassifier(max_depth=depth, criterion='gini')
    tree_temp.fit(X_train, y_train)
    train_scores.append(tree_temp.score(X_train, y_train))
    test_scores.append(tree_temp.score(X_test, y_test))

# Find optimal depth
best_depth = list(depths)[np.argmax(test_scores)]
best_test_acc = max(test_scores)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(list(depths), train_scores, 'b-o', label='Training', linewidth=2, markersize=6)
ax.plot(list(depths), test_scores, 'r-s', label='Test', linewidth=2, markersize=6)
ax.axvline(best_depth, color='gray', linestyle='--', alpha=0.7,
           label=f'Best depth = {best_depth}')
ax.set_xlabel('Maximum Depth')
ax.set_ylabel('Accuracy')
ax.set_title('Effect of Tree Depth on Performance')
ax.legend()
ax.grid(True, alpha=0.3)
save_plot('dt_pruning.pdf', 'Training and test accuracy as a function of tree depth.')
\end{pycode}

Optimal depth: \py{best_depth} with test accuracy \py{f"{best_test_acc:.3f}"}.

\section{Feature Importance}
\begin{pycode}
# Train final model with optimal depth
final_tree = DecisionTreeClassifier(max_depth=best_depth, criterion='gini')
final_tree.fit(X_train, y_train)

# Get feature importances
importances = final_tree.feature_importances_
indices = np.argsort(importances)[::-1]

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(range(len(importances)), importances[indices],
       color='steelblue', alpha=0.8)
ax.set_xticks(range(len(importances)))
ax.set_xticklabels([f'F{i}' for i in indices])
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
ax.set_title('Feature Importance (Impurity Decrease)')
ax.grid(True, alpha=0.3, axis='y')
save_plot('dt_importance.pdf', 'Feature importance scores based on total impurity decrease at each split.')

# Top 3 features
top_features = indices[:3]
top_importances = importances[top_features]
\end{pycode}

Top 3 features: \py{f"F{top_features[0]}"} (\py{f"{top_importances[0]:.3f}"}), \py{f"F{top_features[1]}"} (\py{f"{top_importances[1]:.3f}"}), \py{f"F{top_features[2]}"} (\py{f"{top_importances[2]:.3f}"}).
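
The impurity-based scores above can be cross-checked with a simple permutation test; the sketch below (an optional addition, with \texttt{perm\_importance} and \texttt{perm\_top} introduced only for this check) shuffles one test-set column at a time and records the drop in accuracy of the fitted tree.

\begin{pycode}
# Optional cross-check: permutation importance on the held-out set.
rng = np.random.default_rng(0)
baseline = final_tree.score(X_test, y_test)
perm_importance = np.zeros(n_features)
for j in range(n_features):
    X_perm = X_test.copy()
    X_perm[:, j] = rng.permutation(X_perm[:, j])   # break the feature-target link
    perm_importance[j] = baseline - final_tree.score(X_perm, y_test)
perm_top = int(np.argmax(perm_importance))
\end{pycode}

Under this check, feature \py{f"F{perm_top}"} shows the largest accuracy drop (\py{f"{perm_importance[perm_top]:.3f}"}) when permuted.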

\section{Gini vs Entropy Comparison}
\begin{pycode}
# Compare splitting criteria
gini_scores = []
entropy_scores = []

for depth in depths:
    tree_gini = DecisionTreeClassifier(max_depth=depth, criterion='gini')
    tree_gini.fit(X_train, y_train)
    gini_scores.append(tree_gini.score(X_test, y_test))

    tree_entropy = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
    tree_entropy.fit(X_train, y_train)
    entropy_scores.append(tree_entropy.score(X_test, y_test))

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(list(depths), gini_scores, 'b-o', label='Gini', linewidth=2, markersize=6)
ax.plot(list(depths), entropy_scores, 'r-s', label='Entropy', linewidth=2, markersize=6)
ax.set_xlabel('Maximum Depth')
ax.set_ylabel('Test Accuracy')
ax.set_title('Comparison of Splitting Criteria')
ax.legend()
ax.grid(True, alpha=0.3)
save_plot('dt_criteria.pdf', 'Test accuracy comparison between Gini impurity and entropy.')

# Best scores for each criterion
best_gini = max(gini_scores)
best_entropy = max(entropy_scores)
\end{pycode}

\section{Results Summary}
\begin{table}[H]
\centering
\caption{Decision Tree Performance Summary}
\begin{tabular}{lr}
\toprule
Metric & Value \\
\midrule
Dataset Size & \py{n_samples} \\
Number of Features & \py{n_features} \\
Number of Classes & \py{n_classes} \\
Optimal Tree Depth & \py{best_depth} \\
Best Test Accuracy (Gini) & \py{f"{best_gini:.3f}"} \\
Best Test Accuracy (Entropy) & \py{f"{best_entropy:.3f}"} \\
\bottomrule
\end{tabular}
\end{table}

\begin{pycode}
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Feature Importance Ranking}')
print(r'\begin{tabular}{ccc}')
print(r'\toprule')
print(r'Rank & Feature & Importance \\')
print(r'\midrule')
for i in range(min(5, len(indices))):
    print(f'{i+1} & F{indices[i]} & {importances[indices[i]]:.4f} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\section{Conclusion}
This analysis demonstrated:
\begin{itemize}
\item Decision tree construction using Gini impurity and entropy
\item The bias-variance tradeoff controlled by tree depth
\item Feature importance computation via total impurity decrease
\item Hyperparameter tuning (maximum depth, minimum samples per split)
\item Visualization of decision boundaries in 2D
\end{itemize}

The optimal tree depth of \py{best_depth} balances model complexity with generalization, achieving a test accuracy of \py{f"{100 * best_test_acc:.1f}"}\%.

\end{document}