Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Ok-landscape
GitHub Repository: Ok-landscape/computational-pipeline
Path: blob/main/latex-templates/templates/bioinformatics/sequence_alignment.tex
51 views
unlisted
1
\documentclass[a4paper, 11pt]{article}
2
\usepackage[utf8]{inputenc}
3
\usepackage[T1]{fontenc}
4
\usepackage{amsmath, amssymb}
5
\usepackage{graphicx}
6
\usepackage{siunitx}
7
\usepackage{booktabs}
8
\usepackage{subcaption}
9
\usepackage[makestderr]{pythontex}
10
11
% Theorem environments
12
\newtheorem{definition}{Definition}
13
\newtheorem{theorem}{Theorem}
14
\newtheorem{example}{Example}
15
\newtheorem{remark}{Remark}
16
17
\title{Sequence Alignment: Dynamic Programming and Scoring Matrices\\
18
\large Global and Local Alignment with Statistical Significance}
19
\author{Computational Genomics Division\\Computational Science Templates}
20
\date{\today}
21
22
\begin{document}
23
\maketitle
24
25
\begin{abstract}
26
This comprehensive analysis presents algorithms for pairwise sequence alignment. We implement the Needleman-Wunsch algorithm for global alignment and Smith-Waterman for local alignment using dynamic programming. The analysis covers scoring matrices (BLOSUM, PAM), gap penalties, traceback procedures, and statistical significance assessment. We demonstrate alignment on nucleotide and protein sequences, visualize scoring matrices, and evaluate alignment quality through comparison with random sequence distributions.
27
\end{abstract}
28
29
\section{Introduction}
30
31
Sequence alignment is fundamental to bioinformatics, enabling comparison of DNA, RNA, and protein sequences to infer homology, identify conserved regions, and predict function. Optimal alignment algorithms use dynamic programming to efficiently find the best alignment among exponentially many possibilities.
32
33
\begin{definition}[Sequence Alignment]
34
An alignment of two sequences places them in a matrix to maximize similarity, allowing gaps (insertions/deletions) to optimize the correspondence. Each position is either a match, mismatch, or gap.
35
\end{definition}
36
37
\section{Theoretical Framework}
38
39
\subsection{Global Alignment}
40
41
\begin{theorem}[Needleman-Wunsch Algorithm]
42
The optimal global alignment score is computed by the recurrence:
43
\begin{equation}
44
F(i,j) = \max\begin{cases}
45
F(i-1, j-1) + s(a_i, b_j) & \text{(match/mismatch)}\\
46
F(i-1, j) + d & \text{(gap in sequence B)}\\
47
F(i, j-1) + d & \text{(gap in sequence A)}
48
\end{cases}
49
\end{equation}
50
where $s(a_i, b_j)$ is the substitution score and $d$ is the gap penalty.
51
\end{theorem}
52
53
\subsection{Local Alignment}
54
55
\begin{theorem}[Smith-Waterman Algorithm]
56
Local alignment finds the best matching subsequences:
57
\begin{equation}
58
F(i,j) = \max\begin{cases}
59
0 & \text{(restart)}\\
60
F(i-1, j-1) + s(a_i, b_j)\\
61
F(i-1, j) + d\\
62
F(i, j-1) + d
63
\end{cases}
64
\end{equation}
65
The key difference from global alignment is the zero option that allows starting fresh.
66
\end{theorem}
67
68
\subsection{Gap Penalties}
69
70
\begin{definition}[Affine Gap Penalty]
71
A more realistic gap model penalizes gap opening differently from extension:
72
\begin{equation}
73
\gamma(g) = d + (g-1) \cdot e
74
\end{equation}
75
where $d$ is the gap opening penalty and $e$ is the extension penalty.
76
\end{definition}
77
78
\begin{remark}[Typical Values]
79
\begin{itemize}
80
\item Linear gap: $d = -2$ to $-8$
81
\item Affine: $d = -10$ to $-12$, $e = -1$ to $-2$
82
\end{itemize}
83
\end{remark}
84
85
\section{Computational Analysis}
86
87
\begin{pycode}
88
import numpy as np
89
import matplotlib.pyplot as plt
90
plt.rc('text', usetex=True)
91
plt.rc('font', family='serif')
92
93
np.random.seed(42)
94
95
# Scoring matrices
96
BLOSUM62 = {
97
('A', 'A'): 4, ('A', 'R'): -1, ('A', 'N'): -2, ('A', 'D'): -2, ('A', 'C'): 0,
98
('R', 'R'): 5, ('R', 'N'): 0, ('R', 'D'): -2, ('R', 'C'): -3,
99
('N', 'N'): 6, ('N', 'D'): 1, ('N', 'C'): -3,
100
('D', 'D'): 6, ('D', 'C'): -3,
101
('C', 'C'): 9
102
}
103
104
# Simple scoring for nucleotides
105
def score_nt(a, b, match=2, mismatch=-1):
106
return match if a == b else mismatch
107
108
def score_aa(a, b, match=4, mismatch=-2):
109
"""Simplified amino acid scoring"""
110
if a == b:
111
return match
112
# Some similar residues
113
similar = [('I', 'L'), ('I', 'V'), ('L', 'V'), ('F', 'Y'), ('D', 'E'), ('K', 'R')]
114
if (a, b) in similar or (b, a) in similar:
115
return 1
116
return mismatch
117
118
def needleman_wunsch(seq1, seq2, score_func, gap=-2):
119
"""Global alignment using Needleman-Wunsch"""
120
m, n = len(seq1), len(seq2)
121
F = np.zeros((m+1, n+1))
122
traceback = np.zeros((m+1, n+1), dtype=int) # 0: diag, 1: up, 2: left
123
124
# Initialize
125
for i in range(m+1):
126
F[i, 0] = i * gap
127
for j in range(n+1):
128
F[0, j] = j * gap
129
130
# Fill matrix
131
for i in range(1, m+1):
132
for j in range(1, n+1):
133
match = F[i-1, j-1] + score_func(seq1[i-1], seq2[j-1])
134
delete = F[i-1, j] + gap
135
insert = F[i, j-1] + gap
136
137
F[i, j] = max(match, delete, insert)
138
139
if F[i, j] == match:
140
traceback[i, j] = 0
141
elif F[i, j] == delete:
142
traceback[i, j] = 1
143
else:
144
traceback[i, j] = 2
145
146
return F, traceback
147
148
def smith_waterman(seq1, seq2, score_func, gap=-2):
149
"""Local alignment using Smith-Waterman"""
150
m, n = len(seq1), len(seq2)
151
F = np.zeros((m+1, n+1))
152
153
max_score = 0
154
max_pos = (0, 0)
155
156
for i in range(1, m+1):
157
for j in range(1, n+1):
158
match = F[i-1, j-1] + score_func(seq1[i-1], seq2[j-1])
159
delete = F[i-1, j] + gap
160
insert = F[i, j-1] + gap
161
F[i, j] = max(0, match, delete, insert)
162
163
if F[i, j] > max_score:
164
max_score = F[i, j]
165
max_pos = (i, j)
166
167
return F, max_score, max_pos
168
169
def get_alignment(seq1, seq2, traceback):
170
"""Reconstruct alignment from traceback"""
171
align1, align2 = '', ''
172
i, j = len(seq1), len(seq2)
173
174
while i > 0 or j > 0:
175
if i > 0 and j > 0 and traceback[i, j] == 0:
176
align1 = seq1[i-1] + align1
177
align2 = seq2[j-1] + align2
178
i -= 1
179
j -= 1
180
elif i > 0 and (j == 0 or traceback[i, j] == 1):
181
align1 = seq1[i-1] + align1
182
align2 = '-' + align2
183
i -= 1
184
else:
185
align1 = '-' + align1
186
align2 = seq2[j-1] + align2
187
j -= 1
188
189
return align1, align2
190
191
def calculate_identity(align1, align2):
192
"""Calculate percent identity"""
193
matches = sum(1 for a, b in zip(align1, align2) if a == b and a != '-')
194
length = len(align1)
195
return matches / length * 100 if length > 0 else 0
196
197
def dot_plot(seq1, seq2):
198
"""Create dot plot matrix"""
199
m, n = len(seq1), len(seq2)
200
dot = np.zeros((m, n))
201
for i in range(m):
202
for j in range(n):
203
if seq1[i] == seq2[j]:
204
dot[i, j] = 1
205
return dot
206
207
# Example sequences
208
# DNA sequences
209
dna1 = "AGTACGCATGC"
210
dna2 = "TATGCATGAC"
211
212
# Protein sequences
213
prot1 = "HEAGAWGHEE"
214
prot2 = "PAWHEAE"
215
216
# Run alignments
217
F_global_dna, tb_dna = needleman_wunsch(dna1, dna2, score_nt)
218
F_local_dna, max_local_dna, pos_dna = smith_waterman(dna1, dna2, score_nt)
219
align1_dna, align2_dna = get_alignment(dna1, dna2, tb_dna)
220
221
F_global_prot, tb_prot = needleman_wunsch(prot1, prot2, score_aa)
222
F_local_prot, max_local_prot, pos_prot = smith_waterman(prot1, prot2, score_aa)
223
align1_prot, align2_prot = get_alignment(prot1, prot2, tb_prot)
224
225
# Calculate statistics
226
global_score_dna = F_global_dna[-1, -1]
227
global_score_prot = F_global_prot[-1, -1]
228
identity_dna = calculate_identity(align1_dna, align2_dna)
229
identity_prot = calculate_identity(align1_prot, align2_prot)
230
231
# Statistical significance
232
n_random = 1000
233
random_scores_dna = []
234
random_scores_prot = []
235
236
for _ in range(n_random):
237
rand_dna = ''.join(np.random.choice(['A', 'C', 'G', 'T'], len(dna2)))
238
rand_prot = ''.join(np.random.choice(list('ACDEFGHIKLMNPQRSTVWY'), len(prot2)))
239
240
F_rand, _ = needleman_wunsch(dna1, rand_dna, score_nt)
241
random_scores_dna.append(F_rand[-1, -1])
242
243
F_rand_p, _ = needleman_wunsch(prot1, rand_prot, score_aa)
244
random_scores_prot.append(F_rand_p[-1, -1])
245
246
p_value_dna = np.sum(np.array(random_scores_dna) >= global_score_dna) / n_random
247
p_value_prot = np.sum(np.array(random_scores_prot) >= global_score_prot) / n_random
248
249
# Z-scores
250
z_dna = (global_score_dna - np.mean(random_scores_dna)) / np.std(random_scores_dna)
251
z_prot = (global_score_prot - np.mean(random_scores_prot)) / np.std(random_scores_prot)
252
253
# E-value approximation (simplified)
254
m, n = len(dna1), len(dna2)
255
db_size = 1e6 # hypothetical database size
256
lambda_param = 0.3 # typical value
257
K_param = 0.1
258
e_value_dna = K_param * m * db_size * np.exp(-lambda_param * global_score_dna)
259
260
# Create comprehensive figure
261
fig = plt.figure(figsize=(14, 16))
262
263
# Plot 1: Global alignment matrix (DNA)
264
ax1 = fig.add_subplot(3, 3, 1)
265
im1 = ax1.imshow(F_global_dna, cmap='viridis', aspect='auto')
266
ax1.set_xlabel('Sequence 2 (DNA)')
267
ax1.set_ylabel('Sequence 1 (DNA)')
268
ax1.set_title(f'Global Alignment (Score: {global_score_dna:.0f})')
269
ax1.set_xticks(range(len(dna2)+1))
270
ax1.set_yticks(range(len(dna1)+1))
271
ax1.set_xticklabels(['-'] + list(dna2), fontsize=7)
272
ax1.set_yticklabels(['-'] + list(dna1), fontsize=7)
273
plt.colorbar(im1, ax=ax1, shrink=0.8)
274
275
# Plot 2: Local alignment matrix (DNA)
276
ax2 = fig.add_subplot(3, 3, 2)
277
im2 = ax2.imshow(F_local_dna, cmap='viridis', aspect='auto')
278
ax2.plot(pos_dna[1], pos_dna[0], 'r*', markersize=15)
279
ax2.set_xlabel('Sequence 2')
280
ax2.set_ylabel('Sequence 1')
281
ax2.set_title(f'Local Alignment (Max: {max_local_dna:.0f})')
282
plt.colorbar(im2, ax=ax2, shrink=0.8)
283
284
# Plot 3: Dot plot
285
ax3 = fig.add_subplot(3, 3, 3)
286
dot = dot_plot(dna1, dna2)
287
ax3.imshow(dot, cmap='binary', aspect='auto')
288
ax3.set_xlabel('Sequence 2')
289
ax3.set_ylabel('Sequence 1')
290
ax3.set_title('Dot Plot')
291
ax3.set_xticks(range(len(dna2)))
292
ax3.set_yticks(range(len(dna1)))
293
ax3.set_xticklabels(list(dna2), fontsize=7)
294
ax3.set_yticklabels(list(dna1), fontsize=7)
295
296
# Plot 4: Score distribution (DNA)
297
ax4 = fig.add_subplot(3, 3, 4)
298
ax4.hist(random_scores_dna, bins=30, alpha=0.7, color='steelblue', edgecolor='black', density=True)
299
ax4.axvline(x=global_score_dna, color='red', linewidth=2, label=f'Score: {global_score_dna:.0f}')
300
ax4.axvline(x=np.mean(random_scores_dna), color='green', linestyle='--', linewidth=2, label='Mean')
301
ax4.set_xlabel('Alignment Score')
302
ax4.set_ylabel('Density')
303
ax4.set_title(f'Score Distribution (DNA, p={p_value_dna:.3f})')
304
ax4.legend(fontsize=7)
305
ax4.grid(True, alpha=0.3)
306
307
# Plot 5: Score distribution (Protein)
308
ax5 = fig.add_subplot(3, 3, 5)
309
ax5.hist(random_scores_prot, bins=30, alpha=0.7, color='green', edgecolor='black', density=True)
310
ax5.axvline(x=global_score_prot, color='red', linewidth=2, label=f'Score: {global_score_prot:.0f}')
311
ax5.set_xlabel('Alignment Score')
312
ax5.set_ylabel('Density')
313
ax5.set_title(f'Score Distribution (Protein, p={p_value_prot:.3f})')
314
ax5.legend(fontsize=7)
315
ax5.grid(True, alpha=0.3)
316
317
# Plot 6: Global alignment matrix (Protein)
318
ax6 = fig.add_subplot(3, 3, 6)
319
im6 = ax6.imshow(F_global_prot, cmap='plasma', aspect='auto')
320
ax6.set_xlabel('Sequence 2 (Protein)')
321
ax6.set_ylabel('Sequence 1 (Protein)')
322
ax6.set_title(f'Protein Alignment (Score: {global_score_prot:.0f})')
323
ax6.set_xticks(range(len(prot2)+1))
324
ax6.set_yticks(range(len(prot1)+1))
325
ax6.set_xticklabels(['-'] + list(prot2), fontsize=7)
326
ax6.set_yticklabels(['-'] + list(prot1), fontsize=7)
327
plt.colorbar(im6, ax=ax6, shrink=0.8)
328
329
# Plot 7: Gap penalty effect
330
ax7 = fig.add_subplot(3, 3, 7)
331
gap_penalties = [-1, -2, -4, -6, -8, -10]
332
scores_by_gap = []
333
for gap in gap_penalties:
334
F, _ = needleman_wunsch(dna1, dna2, score_nt, gap=gap)
335
scores_by_gap.append(F[-1, -1])
336
337
ax7.plot(gap_penalties, scores_by_gap, 'bo-', linewidth=2, markersize=8)
338
ax7.set_xlabel('Gap Penalty')
339
ax7.set_ylabel('Alignment Score')
340
ax7.set_title('Effect of Gap Penalty')
341
ax7.grid(True, alpha=0.3)
342
343
# Plot 8: Identity vs Length
344
ax8 = fig.add_subplot(3, 3, 8)
345
lengths = range(5, 101, 5)
346
identities = []
347
for L in lengths:
348
# Random sequences of length L
349
s1 = ''.join(np.random.choice(['A', 'C', 'G', 'T'], L))
350
s2 = ''.join(np.random.choice(['A', 'C', 'G', 'T'], L))
351
_, tb = needleman_wunsch(s1, s2, score_nt)
352
a1, a2 = get_alignment(s1, s2, tb)
353
identities.append(calculate_identity(a1, a2))
354
355
ax8.plot(lengths, identities, 'g.-', linewidth=1)
356
ax8.axhline(y=25, color='r', linestyle='--', alpha=0.7, label='Random (25\\%)')
357
ax8.set_xlabel('Sequence Length')
358
ax8.set_ylabel('Percent Identity')
359
ax8.set_title('Random Alignment Identity')
360
ax8.legend(fontsize=8)
361
ax8.grid(True, alpha=0.3)
362
363
# Plot 9: Scoring matrix heatmap (simplified BLOSUM)
364
ax9 = fig.add_subplot(3, 3, 9)
365
aas = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G']
366
n_aa = len(aas)
367
blosum_matrix = np.zeros((n_aa, n_aa))
368
for i, a1 in enumerate(aas):
369
for j, a2 in enumerate(aas):
370
blosum_matrix[i, j] = score_aa(a1, a2)
371
372
im9 = ax9.imshow(blosum_matrix, cmap='RdBu', aspect='auto', vmin=-4, vmax=4)
373
ax9.set_xticks(range(n_aa))
374
ax9.set_yticks(range(n_aa))
375
ax9.set_xticklabels(aas)
376
ax9.set_yticklabels(aas)
377
ax9.set_title('Substitution Matrix')
378
plt.colorbar(im9, ax=ax9, shrink=0.8)
379
380
plt.tight_layout()
381
plt.savefig('sequence_alignment_plot.pdf', bbox_inches='tight', dpi=150)
382
print(r'\begin{center}')
383
print(r'\includegraphics[width=\textwidth]{sequence_alignment_plot.pdf}')
384
print(r'\end{center}')
385
plt.close()
386
\end{pycode}
387
388
\section{Results and Analysis}
389
390
\subsection{Alignment Results}
391
392
\begin{pycode}
393
# Results table
394
print(r'\begin{table}[h]')
395
print(r'\centering')
396
print(r'\caption{Sequence Alignment Results}')
397
print(r'\begin{tabular}{lccccc}')
398
print(r'\toprule')
399
print(r'Alignment & Global Score & Local Score & Identity (\\%) & Z-score & p-value \\')
400
print(r'\midrule')
401
print(f"DNA & {global_score_dna:.0f} & {max_local_dna:.0f} & {identity_dna:.1f} & {z_dna:.2f} & {p_value_dna:.3f} \\\\")
402
print(f"Protein & {global_score_prot:.0f} & {max_local_prot:.0f} & {identity_prot:.1f} & {z_prot:.2f} & {p_value_prot:.3f} \\\\")
403
print(r'\bottomrule')
404
print(r'\end{tabular}')
405
print(r'\end{table}')
406
\end{pycode}
407
408
\subsection{Sequence Details}
409
410
\begin{example}[DNA Alignment]
411
Sequences:
412
\begin{itemize}
413
\item Sequence 1: \texttt{\py{f"{dna1}"}}
414
\item Sequence 2: \texttt{\py{f"{dna2}"}}
415
\item Aligned 1: \texttt{\py{f"{align1_dna}"}}
416
\item Aligned 2: \texttt{\py{f"{align2_dna}"}}
417
\end{itemize}
418
\end{example}
419
420
\begin{example}[Protein Alignment]
421
Sequences:
422
\begin{itemize}
423
\item Sequence 1: \texttt{\py{f"{prot1}"}}
424
\item Sequence 2: \texttt{\py{f"{prot2}"}}
425
\item Aligned 1: \texttt{\py{f"{align1_prot}"}}
426
\item Aligned 2: \texttt{\py{f"{align2_prot}"}}
427
\end{itemize}
428
\end{example}
429
430
\section{Statistical Significance}
431
432
\subsection{Z-score and P-value}
433
434
\begin{remark}[Significance Assessment]
435
Alignment significance is evaluated by comparing to random sequences:
436
\begin{itemize}
437
\item \textbf{Z-score}: Number of standard deviations above mean
438
\item \textbf{P-value}: Probability of score by chance
439
\item Z $>$ 5 typically indicates significant similarity
440
\item P $<$ 0.01 suggests true homology
441
\end{itemize}
442
\end{remark}
443
444
\subsection{E-value}
445
446
The expected number of alignments with score $\geq S$ by chance:
447
\begin{equation}
448
E = K \cdot m \cdot n \cdot e^{-\lambda S}
449
\end{equation}
450
451
\section{Scoring Matrices}
452
453
\subsection{BLOSUM Series}
454
455
BLOcks SUbstitution Matrices derived from aligned blocks:
456
\begin{itemize}
457
\item BLOSUM62: Most widely used, 62\% identity threshold
458
\item BLOSUM80: For closely related sequences
459
\item BLOSUM45: For distantly related sequences
460
\end{itemize}
461
462
\subsection{PAM Series}
463
464
Point Accepted Mutation matrices based on evolutionary models:
465
\begin{itemize}
466
\item PAM1: 1\% expected mutations
467
\item PAM250: Distant relationships
468
\end{itemize}
469
470
\section{Limitations and Extensions}
471
472
\subsection{Model Limitations}
473
\begin{enumerate}
474
\item \textbf{Pairwise only}: No multiple sequence alignment
475
\item \textbf{Linear gap}: Affine penalties more realistic
476
\item \textbf{Global/local}: No semi-global option shown
477
\item \textbf{No profiles}: Sequence-to-sequence only
478
\end{enumerate}
479
480
\subsection{Possible Extensions}
481
\begin{itemize}
482
\item Affine gap penalties
483
\item Multiple sequence alignment (ClustalW, MUSCLE)
484
\item Profile HMMs for remote homology
485
\item BLAST heuristics for database search
486
\end{itemize}
487
488
\section{Conclusion}
489
490
This analysis demonstrates sequence alignment fundamentals:
491
\begin{itemize}
492
\item Needleman-Wunsch provides optimal global alignment
493
\item Smith-Waterman finds best local similarities
494
\item DNA alignment: score = \py{f"{global_score_dna:.0f}"}, identity = \py{f"{identity_dna:.0f}"}\%
495
\item Protein alignment: score = \py{f"{global_score_prot:.0f}"}, identity = \py{f"{identity_prot:.0f}"}\%
496
\item Statistical significance evaluated by Z-score and p-value
497
\end{itemize}
498
499
\section*{Further Reading}
500
\begin{itemize}
501
\item Durbin, R. et al. (1998). \textit{Biological Sequence Analysis}. Cambridge University Press.
502
\item Altschul, S. F. et al. (1990). Basic local alignment search tool. \textit{J. Mol. Biol.}, 215, 403-410.
503
\item Henikoff, S. \& Henikoff, J. G. (1992). Amino acid substitution matrices from protein blocks. \textit{PNAS}, 89, 10915-10919.
504
\end{itemize}
505
506
\end{document}
507
508