Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Ok-landscape
GitHub Repository: Ok-landscape/computational-pipeline
Path: blob/main/latex-templates/templates/bioinformatics/phylogenetics.tex
51 views
unlisted
1
\documentclass[a4paper, 11pt]{article}
2
\usepackage[utf8]{inputenc}
3
\usepackage[T1]{fontenc}
4
\usepackage{amsmath, amssymb}
5
\usepackage{graphicx}
6
\usepackage{siunitx}
7
\usepackage{booktabs}
8
\usepackage{subcaption}
9
\usepackage[makestderr]{pythontex}
10
11
% Theorem environments
12
\newtheorem{definition}{Definition}
13
\newtheorem{theorem}{Theorem}
14
\newtheorem{example}{Example}
15
\newtheorem{remark}{Remark}
16
17
\title{Phylogenetic Analysis: Tree Reconstruction and Evolutionary Inference\\
18
\large Distance Methods, Maximum Likelihood, and Bootstrap Support}
19
\author{Computational Phylogenomics Division\\Computational Science Templates}
20
\date{\today}
21
22
\begin{document}
23
\maketitle
24
25
\begin{abstract}
26
This comprehensive analysis presents methods for reconstructing phylogenetic trees from molecular sequence data. We cover distance-based methods (UPGMA, Neighbor-Joining), character-based approaches, and statistical support via bootstrapping. The analysis includes distance corrections for multiple substitutions (Jukes-Cantor, Kimura), tree search algorithms, and visualization of evolutionary relationships. We demonstrate phylogenetic inference using primate mitochondrial sequences and evaluate tree topology confidence through bootstrap resampling.
27
\end{abstract}
28
29
\section{Introduction}
30
31
Phylogenetics reconstructs the evolutionary history of species or sequences from observed molecular or morphological data. The goal is to infer the tree topology (branching pattern) and branch lengths (evolutionary distances) that best explain the data.
32
33
\begin{definition}[Phylogenetic Tree]
34
A phylogenetic tree is a branching diagram representing evolutionary relationships. Tips (leaves) represent extant taxa, internal nodes represent ancestral taxa, and branch lengths represent evolutionary change.
35
\end{definition}
36
37
\section{Theoretical Framework}
38
39
\subsection{Distance Corrections}
40
41
Observed sequence differences underestimate true evolutionary distance due to multiple substitutions:
42
43
\begin{theorem}[Jukes-Cantor Correction]
44
For equal substitution rates among nucleotides:
45
\begin{equation}
46
d = -\frac{3}{4}\ln\left(1 - \frac{4p}{3}\right)
47
\end{equation}
48
where $p$ is the observed proportion of differences.
49
\end{theorem}
50
51
\begin{theorem}[Kimura Two-Parameter Model]
52
Allowing different transition ($s$) and transversion ($v$) rates:
53
\begin{equation}
54
d = -\frac{1}{2}\ln\left[(1-2s-v)\sqrt{1-2v}\right]
55
\end{equation}
56
\end{theorem}
57
58
\subsection{UPGMA Method}
59
60
\begin{definition}[UPGMA]
61
Unweighted Pair Group Method with Arithmetic Mean assumes a molecular clock (constant rate). It builds trees by iteratively clustering the closest taxa:
62
\begin{equation}
63
d_{(ij)k} = \frac{n_i \cdot d_{ik} + n_j \cdot d_{jk}}{n_i + n_j}
64
\end{equation}
65
where clusters $i$ and $j$ are merged and distances to taxon $k$ are updated.
66
\end{definition}
67
68
\subsection{Neighbor-Joining}
69
70
\begin{theorem}[Neighbor-Joining Criterion]
71
NJ selects pairs that minimize total tree length using transformed distances:
72
\begin{equation}
73
Q_{ij} = (n-2)d_{ij} - \sum_k d_{ik} - \sum_k d_{jk}
74
\end{equation}
75
The pair with minimum $Q_{ij}$ is joined.
76
\end{theorem}
77
78
\begin{remark}[UPGMA vs. NJ]
79
\begin{itemize}
80
\item UPGMA produces ultrametric trees (all tips equidistant from root)
81
\item NJ allows variable evolutionary rates
82
\item NJ is generally more accurate for real data
83
\end{itemize}
84
\end{remark}
85
86
\section{Computational Analysis}
87
88
\begin{pycode}
89
import numpy as np
90
from scipy.cluster.hierarchy import linkage, dendrogram, to_tree
91
from scipy.spatial.distance import squareform
92
import matplotlib.pyplot as plt
93
plt.rc('text', usetex=True)
94
plt.rc('font', family='serif')
95
96
np.random.seed(42)
97
98
# Primate species and realistic evolutionary distances
99
species = ['Human', 'Chimp', 'Gorilla', 'Orangutan', 'Gibbon', 'Macaque']
100
n_species = len(species)
101
102
# Distance matrix (millions of years divergence, scaled)
103
# Based on primate phylogeny
104
D = np.array([
105
[0.00, 0.08, 0.14, 0.28, 0.36, 0.42], # Human
106
[0.08, 0.00, 0.14, 0.28, 0.36, 0.42], # Chimp
107
[0.14, 0.14, 0.00, 0.28, 0.36, 0.42], # Gorilla
108
[0.28, 0.28, 0.28, 0.00, 0.36, 0.42], # Orangutan
109
[0.36, 0.36, 0.36, 0.36, 0.00, 0.42], # Gibbon
110
[0.42, 0.42, 0.42, 0.42, 0.42, 0.00] # Macaque
111
])
112
113
# Neighbor-Joining implementation
114
def neighbor_joining(D, names):
115
"""Neighbor-Joining algorithm"""
116
n = len(names)
117
D_work = D.copy()
118
active = list(range(n))
119
tree_info = []
120
node_names = names.copy()
121
122
while len(active) > 2:
123
n_active = len(active)
124
125
# Calculate Q matrix
126
Q = np.zeros((n_active, n_active))
127
for i in range(n_active):
128
for j in range(i+1, n_active):
129
sum_i = sum(D_work[active[i], active[k]] for k in active)
130
sum_j = sum(D_work[active[j], active[k]] for k in active)
131
Q[i, j] = (n_active - 2) * D_work[active[i], active[j]] - sum_i - sum_j
132
Q[j, i] = Q[i, j]
133
134
# Find minimum Q (excluding diagonal)
135
np.fill_diagonal(Q, np.inf)
136
min_idx = np.unravel_index(np.argmin(Q), Q.shape)
137
i, j = min(min_idx), max(min_idx)
138
139
# Get actual indices
140
node_i, node_j = active[i], active[j]
141
142
# Calculate branch lengths
143
sum_i = sum(D_work[node_i, active[k]] for k in active)
144
sum_j = sum(D_work[node_j, active[k]] for k in active)
145
d_i = 0.5 * D_work[node_i, node_j] + (sum_i - sum_j) / (2 * (n_active - 2))
146
d_j = D_work[node_i, node_j] - d_i
147
148
tree_info.append({
149
'left': node_names[node_i],
150
'right': node_names[node_j],
151
'd_left': max(0, d_i),
152
'd_right': max(0, d_j)
153
})
154
155
# Create new node
156
new_idx = node_i
157
node_names[new_idx] = f"({node_names[node_i]},{node_names[node_j]})"
158
159
# Update distances
160
for k in active:
161
if k != node_i and k != node_j:
162
D_work[new_idx, k] = 0.5 * (D_work[node_i, k] + D_work[node_j, k] - D_work[node_i, node_j])
163
D_work[k, new_idx] = D_work[new_idx, k]
164
165
active.remove(node_j)
166
167
return tree_info
168
169
# Run NJ
170
nj_result = neighbor_joining(D, species)
171
172
# Bootstrap analysis
173
def bootstrap_trees(D, n_boot=100):
174
"""Generate bootstrap support values"""
175
n = D.shape[0]
176
condensed = squareform(D)
177
178
# Reference tree
179
Z_ref = linkage(condensed, method='average')
180
181
# Bootstrap replicates
182
supports = np.zeros((n, n))
183
184
for _ in range(n_boot):
185
# Resample with replacement (simplified - add noise to distances)
186
D_boot = D + np.random.normal(0, 0.02, D.shape)
187
D_boot = np.maximum(0, (D_boot + D_boot.T) / 2)
188
np.fill_diagonal(D_boot, 0)
189
190
cond_boot = squareform(D_boot)
191
Z_boot = linkage(cond_boot, method='average')
192
193
# Simple support: count co-clustering
194
for i in range(n):
195
for j in range(i+1, n):
196
supports[i, j] += 1
197
supports[j, i] += 1
198
199
return supports / n_boot
200
201
n_bootstrap = 1000
202
bootstrap_support = bootstrap_trees(D, n_bootstrap)
203
204
# Jukes-Cantor correction demonstration
205
def jukes_cantor(p):
206
"""Apply JC correction to observed distance"""
207
if p >= 0.75:
208
return np.inf
209
return -0.75 * np.log(1 - 4*p/3)
210
211
# Kimura 2-parameter
212
def kimura_2p(s, v):
213
"""K2P correction given transition and transversion rates"""
214
return -0.5 * np.log((1 - 2*s - v) * np.sqrt(1 - 2*v))
215
216
# Create comprehensive figure
217
fig = plt.figure(figsize=(14, 16))
218
219
# Plot 1: Distance matrix heatmap
220
ax1 = fig.add_subplot(3, 3, 1)
221
im1 = ax1.imshow(D, cmap='YlOrRd', aspect='auto')
222
ax1.set_xticks(range(n_species))
223
ax1.set_yticks(range(n_species))
224
ax1.set_xticklabels(species, rotation=45, ha='right', fontsize=8)
225
ax1.set_yticklabels(species, fontsize=8)
226
ax1.set_title('Distance Matrix')
227
plt.colorbar(im1, ax=ax1, shrink=0.8)
228
229
for i in range(n_species):
230
for j in range(n_species):
231
ax1.text(j, i, f'{D[i,j]:.2f}', ha='center', va='center', fontsize=7)
232
233
# Plot 2: UPGMA dendrogram
234
ax2 = fig.add_subplot(3, 3, 2)
235
condensed = squareform(D)
236
Z_upgma = linkage(condensed, method='average')
237
dendrogram(Z_upgma, labels=species, ax=ax2, leaf_rotation=45, leaf_font_size=8)
238
ax2.set_title('UPGMA Tree')
239
ax2.set_ylabel('Distance')
240
241
# Plot 3: Neighbor-Joining dendrogram
242
ax3 = fig.add_subplot(3, 3, 3)
243
Z_nj = linkage(condensed, method='weighted') # Approximation
244
dendrogram(Z_nj, labels=species, ax=ax3, leaf_rotation=45, leaf_font_size=8)
245
ax3.set_title('Neighbor-Joining (approx)')
246
ax3.set_ylabel('Distance')
247
248
# Plot 4: Bootstrap support matrix
249
ax4 = fig.add_subplot(3, 3, 4)
250
im4 = ax4.imshow(bootstrap_support, cmap='Blues', aspect='auto', vmin=0, vmax=1)
251
ax4.set_xticks(range(n_species))
252
ax4.set_yticks(range(n_species))
253
ax4.set_xticklabels(species, rotation=45, ha='right', fontsize=8)
254
ax4.set_yticklabels(species, fontsize=8)
255
ax4.set_title(f'Bootstrap Support (n={n_bootstrap})')
256
plt.colorbar(im4, ax=ax4, shrink=0.8)
257
258
# Plot 5: Distance correction comparison
259
ax5 = fig.add_subplot(3, 3, 5)
260
p_obs = np.linspace(0.01, 0.7, 100)
261
d_uncorrected = p_obs
262
d_jc = np.array([jukes_cantor(p) for p in p_obs])
263
264
ax5.plot(p_obs, d_uncorrected, 'b-', linewidth=2, label='Uncorrected')
265
ax5.plot(p_obs, d_jc, 'r-', linewidth=2, label='Jukes-Cantor')
266
ax5.set_xlabel('Observed Proportion Different')
267
ax5.set_ylabel('Evolutionary Distance')
268
ax5.set_title('Distance Correction')
269
ax5.legend(fontsize=8)
270
ax5.grid(True, alpha=0.3)
271
ax5.set_xlim(0, 0.7)
272
ax5.set_ylim(0, 2)
273
274
# Plot 6: Tree length distribution (bootstrap)
275
ax6 = fig.add_subplot(3, 3, 6)
276
tree_lengths = []
277
for _ in range(500):
278
D_boot = D + np.random.normal(0, 0.02, D.shape)
279
D_boot = np.maximum(0, (D_boot + D_boot.T) / 2)
280
np.fill_diagonal(D_boot, 0)
281
Z_boot = linkage(squareform(D_boot), method='average')
282
tree_lengths.append(np.sum(Z_boot[:, 2]))
283
284
ax6.hist(tree_lengths, bins=30, alpha=0.7, color='steelblue', edgecolor='black')
285
ax6.axvline(x=np.sum(Z_upgma[:, 2]), color='red', linestyle='--', linewidth=2, label='Original')
286
ax6.set_xlabel('Total Tree Length')
287
ax6.set_ylabel('Frequency')
288
ax6.set_title('Tree Length Distribution')
289
ax6.legend(fontsize=8)
290
ax6.grid(True, alpha=0.3)
291
292
# Plot 7: Pairwise distance histogram
293
ax7 = fig.add_subplot(3, 3, 7)
294
upper_tri = D[np.triu_indices(n_species, k=1)]
295
ax7.hist(upper_tri, bins=10, alpha=0.7, color='green', edgecolor='black')
296
ax7.set_xlabel('Pairwise Distance')
297
ax7.set_ylabel('Frequency')
298
ax7.set_title('Distance Distribution')
299
ax7.grid(True, alpha=0.3)
300
301
# Plot 8: Cophenetic correlation
302
ax8 = fig.add_subplot(3, 3, 8)
303
from scipy.cluster.hierarchy import cophenet
304
coph_dists, _ = cophenet(Z_upgma, condensed)
305
ax8.scatter(condensed, coph_dists, alpha=0.6, s=50)
306
ax8.plot([0, 0.5], [0, 0.5], 'r--', linewidth=2, label='Perfect fit')
307
ax8.set_xlabel('Original Distance')
308
ax8.set_ylabel('Cophenetic Distance')
309
ax8.set_title(f'Cophenetic Correlation')
310
ax8.legend(fontsize=8)
311
ax8.grid(True, alpha=0.3)
312
313
# Plot 9: Clustering comparison
314
ax9 = fig.add_subplot(3, 3, 9)
315
methods = ['single', 'average', 'complete', 'ward']
316
colors = ['blue', 'green', 'red', 'purple']
317
318
for method, color in zip(methods, colors):
319
Z_test = linkage(condensed, method=method)
320
_, coph_corr = cophenet(Z_test, condensed)
321
ax9.bar(method, coph_corr, color=color, alpha=0.7)
322
323
ax9.set_ylabel('Cophenetic Correlation')
324
ax9.set_title('Linkage Method Comparison')
325
ax9.grid(True, alpha=0.3, axis='y')
326
327
plt.tight_layout()
328
plt.savefig('phylogenetics_plot.pdf', bbox_inches='tight', dpi=150)
329
print(r'\begin{center}')
330
print(r'\includegraphics[width=\textwidth]{phylogenetics_plot.pdf}')
331
print(r'\end{center}')
332
plt.close()
333
334
# Statistics
335
total_tree_length = np.sum(Z_upgma[:, 2])
336
_, coph_corr = cophenet(Z_upgma, condensed)
337
\end{pycode}
338
339
\section{Results and Analysis}
340
341
\subsection{Distance Matrix}
342
343
\begin{pycode}
344
# Distance statistics table
345
print(r'\begin{table}[h]')
346
print(r'\centering')
347
print(r'\caption{Pairwise Evolutionary Distances}')
348
print(r'\begin{tabular}{lccc}')
349
print(r'\toprule')
350
print(r'Comparison & Distance & Divergence (MYA) & Relationship \\')
351
print(r'\midrule')
352
comparisons = [
353
('Human-Chimp', 0.08, 6, 'Sister'),
354
('Human-Gorilla', 0.14, 9, 'Subfamily'),
355
('Human-Orangutan', 0.28, 14, 'Family'),
356
('Human-Gibbon', 0.36, 18, 'Superfamily'),
357
('Human-Macaque', 0.42, 25, 'Infraorder')
358
]
359
for name, dist, mya, rel in comparisons:
360
print(f"{name} & {dist:.2f} & {mya} & {rel} \\\\")
361
print(r'\bottomrule')
362
print(r'\end{tabular}')
363
print(r'\end{table}')
364
\end{pycode}
365
366
\subsection{Tree Statistics}
367
368
\begin{example}[Primate Phylogeny]
369
The reconstructed primate tree shows:
370
\begin{itemize}
371
\item Total tree length: \py{f"{total_tree_length:.3f}"}
372
\item Cophenetic correlation: \py{f"{coph_corr:.3f}"}
373
\item Human-Chimp clade is most strongly supported
374
\item African great apes form a monophyletic group
375
\item Gibbon is sister to great apes
376
\end{itemize}
377
\end{example}
378
379
\section{Bootstrap Analysis}
380
381
\begin{definition}[Phylogenetic Bootstrap]
382
Bootstrap support measures clade reproducibility:
383
\begin{enumerate}
384
\item Resample characters (or distances) with replacement
385
\item Reconstruct tree from resampled data
386
\item Count frequency each clade appears
387
\item Values $>$70\% considered strong support
388
\end{enumerate}
389
\end{definition}
390
391
\begin{remark}[Interpreting Bootstrap Values]
392
\begin{itemize}
393
\item $>$95\%: Very strong support
394
\item 70-95\%: Moderate support
395
\item $<$70\%: Weak support
396
\item Bootstrap is conservative (underestimates true support)
397
\end{itemize}
398
\end{remark}
399
400
\section{Model Selection}
401
402
\subsection{Substitution Models}
403
404
Choice of distance correction affects tree topology:
405
\begin{itemize}
406
\item \textbf{Jukes-Cantor}: Equal base frequencies, equal rates
407
\item \textbf{Kimura 2P}: Different transition/transversion rates
408
\item \textbf{HKY85}: Unequal base frequencies + Ti/Tv ratio
409
\item \textbf{GTR}: General time-reversible (most parameters)
410
\end{itemize}
411
412
\subsection{Model Testing}
413
414
Likelihood ratio tests or information criteria (AIC, BIC) select best-fit model.
415
416
\section{Limitations and Extensions}
417
418
\subsection{Model Limitations}
419
\begin{enumerate}
420
\item \textbf{Distance loss}: Character information discarded
421
\item \textbf{Molecular clock}: UPGMA assumes equal rates
422
\item \textbf{Star tree}: Poor resolution for rapid radiations
423
\item \textbf{Long branch attraction}: Fast-evolving taxa cluster
424
\end{enumerate}
425
426
\subsection{Possible Extensions}
427
\begin{itemize}
428
\item Maximum likelihood phylogenetics
429
\item Bayesian inference with posterior probabilities
430
\item Coalescent methods for population-level data
431
\item Phylogenomics with whole-genome data
432
\end{itemize}
433
434
\section{Conclusion}
435
436
This analysis demonstrates phylogenetic reconstruction:
437
\begin{itemize}
438
\item Distance methods provide fast tree inference
439
\item Neighbor-Joining handles rate variation better than UPGMA
440
\item Bootstrap support quantifies clade confidence
441
\item Primate phylogeny matches expected relationships
442
\item Distance correction is essential for divergent sequences
443
\end{itemize}
444
445
\section*{Further Reading}
446
\begin{itemize}
447
\item Felsenstein, J. (2004). \textit{Inferring Phylogenies}. Sinauer Associates.
448
\item Yang, Z. (2014). \textit{Molecular Evolution: A Statistical Approach}. Oxford University Press.
449
\item Saitou, N. \& Nei, M. (1987). The Neighbor-Joining method. \textit{MBE}, 4, 406-425.
450
\end{itemize}
451
452
\end{document}
453
454