Path: blob/main/latex-templates/templates/bioinformatics/phylogenetics.tex
51 views
unlisted
\documentclass[a4paper, 11pt]{article}1\usepackage[utf8]{inputenc}2\usepackage[T1]{fontenc}3\usepackage{amsmath, amssymb}4\usepackage{graphicx}5\usepackage{siunitx}6\usepackage{booktabs}7\usepackage{subcaption}8\usepackage[makestderr]{pythontex}910% Theorem environments11\newtheorem{definition}{Definition}12\newtheorem{theorem}{Theorem}13\newtheorem{example}{Example}14\newtheorem{remark}{Remark}1516\title{Phylogenetic Analysis: Tree Reconstruction and Evolutionary Inference\\17\large Distance Methods, Maximum Likelihood, and Bootstrap Support}18\author{Computational Phylogenomics Division\\Computational Science Templates}19\date{\today}2021\begin{document}22\maketitle2324\begin{abstract}25This comprehensive analysis presents methods for reconstructing phylogenetic trees from molecular sequence data. We cover distance-based methods (UPGMA, Neighbor-Joining), character-based approaches, and statistical support via bootstrapping. The analysis includes distance corrections for multiple substitutions (Jukes-Cantor, Kimura), tree search algorithms, and visualization of evolutionary relationships. We demonstrate phylogenetic inference using primate mitochondrial sequences and evaluate tree topology confidence through bootstrap resampling.26\end{abstract}2728\section{Introduction}2930Phylogenetics reconstructs the evolutionary history of species or sequences from observed molecular or morphological data. The goal is to infer the tree topology (branching pattern) and branch lengths (evolutionary distances) that best explain the data.3132\begin{definition}[Phylogenetic Tree]33A phylogenetic tree is a branching diagram representing evolutionary relationships. Tips (leaves) represent extant taxa, internal nodes represent ancestral taxa, and branch lengths represent evolutionary change.34\end{definition}3536\section{Theoretical Framework}3738\subsection{Distance Corrections}3940Observed sequence differences underestimate true evolutionary distance due to multiple substitutions:4142\begin{theorem}[Jukes-Cantor Correction]43For equal substitution rates among nucleotides:44\begin{equation}45d = -\frac{3}{4}\ln\left(1 - \frac{4p}{3}\right)46\end{equation}47where $p$ is the observed proportion of differences.48\end{theorem}4950\begin{theorem}[Kimura Two-Parameter Model]51Allowing different transition ($s$) and transversion ($v$) rates:52\begin{equation}53d = -\frac{1}{2}\ln\left[(1-2s-v)\sqrt{1-2v}\right]54\end{equation}55\end{theorem}5657\subsection{UPGMA Method}5859\begin{definition}[UPGMA]60Unweighted Pair Group Method with Arithmetic Mean assumes a molecular clock (constant rate). It builds trees by iteratively clustering the closest taxa:61\begin{equation}62d_{(ij)k} = \frac{n_i \cdot d_{ik} + n_j \cdot d_{jk}}{n_i + n_j}63\end{equation}64where clusters $i$ and $j$ are merged and distances to taxon $k$ are updated.65\end{definition}6667\subsection{Neighbor-Joining}6869\begin{theorem}[Neighbor-Joining Criterion]70NJ selects pairs that minimize total tree length using transformed distances:71\begin{equation}72Q_{ij} = (n-2)d_{ij} - \sum_k d_{ik} - \sum_k d_{jk}73\end{equation}74The pair with minimum $Q_{ij}$ is joined.75\end{theorem}7677\begin{remark}[UPGMA vs. NJ]78\begin{itemize}79\item UPGMA produces ultrametric trees (all tips equidistant from root)80\item NJ allows variable evolutionary rates81\item NJ is generally more accurate for real data82\end{itemize}83\end{remark}8485\section{Computational Analysis}8687\begin{pycode}88import numpy as np89from scipy.cluster.hierarchy import linkage, dendrogram, to_tree90from scipy.spatial.distance import squareform91import matplotlib.pyplot as plt92plt.rc('text', usetex=True)93plt.rc('font', family='serif')9495np.random.seed(42)9697# Primate species and realistic evolutionary distances98species = ['Human', 'Chimp', 'Gorilla', 'Orangutan', 'Gibbon', 'Macaque']99n_species = len(species)100101# Distance matrix (millions of years divergence, scaled)102# Based on primate phylogeny103D = np.array([104[0.00, 0.08, 0.14, 0.28, 0.36, 0.42], # Human105[0.08, 0.00, 0.14, 0.28, 0.36, 0.42], # Chimp106[0.14, 0.14, 0.00, 0.28, 0.36, 0.42], # Gorilla107[0.28, 0.28, 0.28, 0.00, 0.36, 0.42], # Orangutan108[0.36, 0.36, 0.36, 0.36, 0.00, 0.42], # Gibbon109[0.42, 0.42, 0.42, 0.42, 0.42, 0.00] # Macaque110])111112# Neighbor-Joining implementation113def neighbor_joining(D, names):114"""Neighbor-Joining algorithm"""115n = len(names)116D_work = D.copy()117active = list(range(n))118tree_info = []119node_names = names.copy()120121while len(active) > 2:122n_active = len(active)123124# Calculate Q matrix125Q = np.zeros((n_active, n_active))126for i in range(n_active):127for j in range(i+1, n_active):128sum_i = sum(D_work[active[i], active[k]] for k in active)129sum_j = sum(D_work[active[j], active[k]] for k in active)130Q[i, j] = (n_active - 2) * D_work[active[i], active[j]] - sum_i - sum_j131Q[j, i] = Q[i, j]132133# Find minimum Q (excluding diagonal)134np.fill_diagonal(Q, np.inf)135min_idx = np.unravel_index(np.argmin(Q), Q.shape)136i, j = min(min_idx), max(min_idx)137138# Get actual indices139node_i, node_j = active[i], active[j]140141# Calculate branch lengths142sum_i = sum(D_work[node_i, active[k]] for k in active)143sum_j = sum(D_work[node_j, active[k]] for k in active)144d_i = 0.5 * D_work[node_i, node_j] + (sum_i - sum_j) / (2 * (n_active - 2))145d_j = D_work[node_i, node_j] - d_i146147tree_info.append({148'left': node_names[node_i],149'right': node_names[node_j],150'd_left': max(0, d_i),151'd_right': max(0, d_j)152})153154# Create new node155new_idx = node_i156node_names[new_idx] = f"({node_names[node_i]},{node_names[node_j]})"157158# Update distances159for k in active:160if k != node_i and k != node_j:161D_work[new_idx, k] = 0.5 * (D_work[node_i, k] + D_work[node_j, k] - D_work[node_i, node_j])162D_work[k, new_idx] = D_work[new_idx, k]163164active.remove(node_j)165166return tree_info167168# Run NJ169nj_result = neighbor_joining(D, species)170171# Bootstrap analysis172def bootstrap_trees(D, n_boot=100):173"""Generate bootstrap support values"""174n = D.shape[0]175condensed = squareform(D)176177# Reference tree178Z_ref = linkage(condensed, method='average')179180# Bootstrap replicates181supports = np.zeros((n, n))182183for _ in range(n_boot):184# Resample with replacement (simplified - add noise to distances)185D_boot = D + np.random.normal(0, 0.02, D.shape)186D_boot = np.maximum(0, (D_boot + D_boot.T) / 2)187np.fill_diagonal(D_boot, 0)188189cond_boot = squareform(D_boot)190Z_boot = linkage(cond_boot, method='average')191192# Simple support: count co-clustering193for i in range(n):194for j in range(i+1, n):195supports[i, j] += 1196supports[j, i] += 1197198return supports / n_boot199200n_bootstrap = 1000201bootstrap_support = bootstrap_trees(D, n_bootstrap)202203# Jukes-Cantor correction demonstration204def jukes_cantor(p):205"""Apply JC correction to observed distance"""206if p >= 0.75:207return np.inf208return -0.75 * np.log(1 - 4*p/3)209210# Kimura 2-parameter211def kimura_2p(s, v):212"""K2P correction given transition and transversion rates"""213return -0.5 * np.log((1 - 2*s - v) * np.sqrt(1 - 2*v))214215# Create comprehensive figure216fig = plt.figure(figsize=(14, 16))217218# Plot 1: Distance matrix heatmap219ax1 = fig.add_subplot(3, 3, 1)220im1 = ax1.imshow(D, cmap='YlOrRd', aspect='auto')221ax1.set_xticks(range(n_species))222ax1.set_yticks(range(n_species))223ax1.set_xticklabels(species, rotation=45, ha='right', fontsize=8)224ax1.set_yticklabels(species, fontsize=8)225ax1.set_title('Distance Matrix')226plt.colorbar(im1, ax=ax1, shrink=0.8)227228for i in range(n_species):229for j in range(n_species):230ax1.text(j, i, f'{D[i,j]:.2f}', ha='center', va='center', fontsize=7)231232# Plot 2: UPGMA dendrogram233ax2 = fig.add_subplot(3, 3, 2)234condensed = squareform(D)235Z_upgma = linkage(condensed, method='average')236dendrogram(Z_upgma, labels=species, ax=ax2, leaf_rotation=45, leaf_font_size=8)237ax2.set_title('UPGMA Tree')238ax2.set_ylabel('Distance')239240# Plot 3: Neighbor-Joining dendrogram241ax3 = fig.add_subplot(3, 3, 3)242Z_nj = linkage(condensed, method='weighted') # Approximation243dendrogram(Z_nj, labels=species, ax=ax3, leaf_rotation=45, leaf_font_size=8)244ax3.set_title('Neighbor-Joining (approx)')245ax3.set_ylabel('Distance')246247# Plot 4: Bootstrap support matrix248ax4 = fig.add_subplot(3, 3, 4)249im4 = ax4.imshow(bootstrap_support, cmap='Blues', aspect='auto', vmin=0, vmax=1)250ax4.set_xticks(range(n_species))251ax4.set_yticks(range(n_species))252ax4.set_xticklabels(species, rotation=45, ha='right', fontsize=8)253ax4.set_yticklabels(species, fontsize=8)254ax4.set_title(f'Bootstrap Support (n={n_bootstrap})')255plt.colorbar(im4, ax=ax4, shrink=0.8)256257# Plot 5: Distance correction comparison258ax5 = fig.add_subplot(3, 3, 5)259p_obs = np.linspace(0.01, 0.7, 100)260d_uncorrected = p_obs261d_jc = np.array([jukes_cantor(p) for p in p_obs])262263ax5.plot(p_obs, d_uncorrected, 'b-', linewidth=2, label='Uncorrected')264ax5.plot(p_obs, d_jc, 'r-', linewidth=2, label='Jukes-Cantor')265ax5.set_xlabel('Observed Proportion Different')266ax5.set_ylabel('Evolutionary Distance')267ax5.set_title('Distance Correction')268ax5.legend(fontsize=8)269ax5.grid(True, alpha=0.3)270ax5.set_xlim(0, 0.7)271ax5.set_ylim(0, 2)272273# Plot 6: Tree length distribution (bootstrap)274ax6 = fig.add_subplot(3, 3, 6)275tree_lengths = []276for _ in range(500):277D_boot = D + np.random.normal(0, 0.02, D.shape)278D_boot = np.maximum(0, (D_boot + D_boot.T) / 2)279np.fill_diagonal(D_boot, 0)280Z_boot = linkage(squareform(D_boot), method='average')281tree_lengths.append(np.sum(Z_boot[:, 2]))282283ax6.hist(tree_lengths, bins=30, alpha=0.7, color='steelblue', edgecolor='black')284ax6.axvline(x=np.sum(Z_upgma[:, 2]), color='red', linestyle='--', linewidth=2, label='Original')285ax6.set_xlabel('Total Tree Length')286ax6.set_ylabel('Frequency')287ax6.set_title('Tree Length Distribution')288ax6.legend(fontsize=8)289ax6.grid(True, alpha=0.3)290291# Plot 7: Pairwise distance histogram292ax7 = fig.add_subplot(3, 3, 7)293upper_tri = D[np.triu_indices(n_species, k=1)]294ax7.hist(upper_tri, bins=10, alpha=0.7, color='green', edgecolor='black')295ax7.set_xlabel('Pairwise Distance')296ax7.set_ylabel('Frequency')297ax7.set_title('Distance Distribution')298ax7.grid(True, alpha=0.3)299300# Plot 8: Cophenetic correlation301ax8 = fig.add_subplot(3, 3, 8)302from scipy.cluster.hierarchy import cophenet303coph_dists, _ = cophenet(Z_upgma, condensed)304ax8.scatter(condensed, coph_dists, alpha=0.6, s=50)305ax8.plot([0, 0.5], [0, 0.5], 'r--', linewidth=2, label='Perfect fit')306ax8.set_xlabel('Original Distance')307ax8.set_ylabel('Cophenetic Distance')308ax8.set_title(f'Cophenetic Correlation')309ax8.legend(fontsize=8)310ax8.grid(True, alpha=0.3)311312# Plot 9: Clustering comparison313ax9 = fig.add_subplot(3, 3, 9)314methods = ['single', 'average', 'complete', 'ward']315colors = ['blue', 'green', 'red', 'purple']316317for method, color in zip(methods, colors):318Z_test = linkage(condensed, method=method)319_, coph_corr = cophenet(Z_test, condensed)320ax9.bar(method, coph_corr, color=color, alpha=0.7)321322ax9.set_ylabel('Cophenetic Correlation')323ax9.set_title('Linkage Method Comparison')324ax9.grid(True, alpha=0.3, axis='y')325326plt.tight_layout()327plt.savefig('phylogenetics_plot.pdf', bbox_inches='tight', dpi=150)328print(r'\begin{center}')329print(r'\includegraphics[width=\textwidth]{phylogenetics_plot.pdf}')330print(r'\end{center}')331plt.close()332333# Statistics334total_tree_length = np.sum(Z_upgma[:, 2])335_, coph_corr = cophenet(Z_upgma, condensed)336\end{pycode}337338\section{Results and Analysis}339340\subsection{Distance Matrix}341342\begin{pycode}343# Distance statistics table344print(r'\begin{table}[h]')345print(r'\centering')346print(r'\caption{Pairwise Evolutionary Distances}')347print(r'\begin{tabular}{lccc}')348print(r'\toprule')349print(r'Comparison & Distance & Divergence (MYA) & Relationship \\')350print(r'\midrule')351comparisons = [352('Human-Chimp', 0.08, 6, 'Sister'),353('Human-Gorilla', 0.14, 9, 'Subfamily'),354('Human-Orangutan', 0.28, 14, 'Family'),355('Human-Gibbon', 0.36, 18, 'Superfamily'),356('Human-Macaque', 0.42, 25, 'Infraorder')357]358for name, dist, mya, rel in comparisons:359print(f"{name} & {dist:.2f} & {mya} & {rel} \\\\")360print(r'\bottomrule')361print(r'\end{tabular}')362print(r'\end{table}')363\end{pycode}364365\subsection{Tree Statistics}366367\begin{example}[Primate Phylogeny]368The reconstructed primate tree shows:369\begin{itemize}370\item Total tree length: \py{f"{total_tree_length:.3f}"}371\item Cophenetic correlation: \py{f"{coph_corr:.3f}"}372\item Human-Chimp clade is most strongly supported373\item African great apes form a monophyletic group374\item Gibbon is sister to great apes375\end{itemize}376\end{example}377378\section{Bootstrap Analysis}379380\begin{definition}[Phylogenetic Bootstrap]381Bootstrap support measures clade reproducibility:382\begin{enumerate}383\item Resample characters (or distances) with replacement384\item Reconstruct tree from resampled data385\item Count frequency each clade appears386\item Values $>$70\% considered strong support387\end{enumerate}388\end{definition}389390\begin{remark}[Interpreting Bootstrap Values]391\begin{itemize}392\item $>$95\%: Very strong support393\item 70-95\%: Moderate support394\item $<$70\%: Weak support395\item Bootstrap is conservative (underestimates true support)396\end{itemize}397\end{remark}398399\section{Model Selection}400401\subsection{Substitution Models}402403Choice of distance correction affects tree topology:404\begin{itemize}405\item \textbf{Jukes-Cantor}: Equal base frequencies, equal rates406\item \textbf{Kimura 2P}: Different transition/transversion rates407\item \textbf{HKY85}: Unequal base frequencies + Ti/Tv ratio408\item \textbf{GTR}: General time-reversible (most parameters)409\end{itemize}410411\subsection{Model Testing}412413Likelihood ratio tests or information criteria (AIC, BIC) select best-fit model.414415\section{Limitations and Extensions}416417\subsection{Model Limitations}418\begin{enumerate}419\item \textbf{Distance loss}: Character information discarded420\item \textbf{Molecular clock}: UPGMA assumes equal rates421\item \textbf{Star tree}: Poor resolution for rapid radiations422\item \textbf{Long branch attraction}: Fast-evolving taxa cluster423\end{enumerate}424425\subsection{Possible Extensions}426\begin{itemize}427\item Maximum likelihood phylogenetics428\item Bayesian inference with posterior probabilities429\item Coalescent methods for population-level data430\item Phylogenomics with whole-genome data431\end{itemize}432433\section{Conclusion}434435This analysis demonstrates phylogenetic reconstruction:436\begin{itemize}437\item Distance methods provide fast tree inference438\item Neighbor-Joining handles rate variation better than UPGMA439\item Bootstrap support quantifies clade confidence440\item Primate phylogeny matches expected relationships441\item Distance correction is essential for divergent sequences442\end{itemize}443444\section*{Further Reading}445\begin{itemize}446\item Felsenstein, J. (2004). \textit{Inferring Phylogenies}. Sinauer Associates.447\item Yang, Z. (2014). \textit{Molecular Evolution: A Statistical Approach}. Oxford University Press.448\item Saitou, N. \& Nei, M. (1987). The Neighbor-Joining method. \textit{MBE}, 4, 406-425.449\end{itemize}450451\end{document}452453454