CoCalc -- sequence_alignment.tex

GitHub Repository: Ok-landscape/computational-pipeline
Path: blob/main/latex-templates/templates/bioinformatics/sequence_alignment.tex
⁵¹ views
unlisted
1
\documentclass[a4paper, 11pt]{article}
2
\usepackage[utf8]{inputenc}
3
\usepackage[T1]{fontenc}
4
\usepackage{amsmath, amssymb}
5
\usepackage{graphicx}
6
\usepackage{siunitx}
7
\usepackage{booktabs}
8
\usepackage{subcaption}
9
\usepackage[makestderr]{pythontex}
10

11
% Theorem environments
12
\newtheorem{definition}{Definition}
13
\newtheorem{theorem}{Theorem}
14
\newtheorem{example}{Example}
15
\newtheorem{remark}{Remark}
16

17
\title{Sequence Alignment: Dynamic Programming and Scoring Matrices\\
18
\large Global and Local Alignment with Statistical Significance}
19
\author{Computational Genomics Division\\Computational Science Templates}
20
\date{\today}
21

22
\begin{document}
23
\maketitle
24

25
\begin{abstract}
26
This comprehensive analysis presents algorithms for pairwise sequence alignment. We implement the Needleman-Wunsch algorithm for global alignment and Smith-Waterman for local alignment using dynamic programming. The analysis covers scoring matrices (BLOSUM, PAM), gap penalties, traceback procedures, and statistical significance assessment. We demonstrate alignment on nucleotide and protein sequences, visualize scoring matrices, and evaluate alignment quality through comparison with random sequence distributions.
27
\end{abstract}
28

29
\section{Introduction}
30

31
Sequence alignment is fundamental to bioinformatics, enabling comparison of DNA, RNA, and protein sequences to infer homology, identify conserved regions, and predict function. Optimal alignment algorithms use dynamic programming to efficiently find the best alignment among exponentially many possibilities.
32

33
\begin{definition}[Sequence Alignment]
34
An alignment of two sequences places them in a matrix to maximize similarity, allowing gaps (insertions/deletions) to optimize the correspondence. Each position is either a match, mismatch, or gap.
35
\end{definition}
36

37
\section{Theoretical Framework}
38

39
\subsection{Global Alignment}
40

41
\begin{theorem}[Needleman-Wunsch Algorithm]
42
The optimal global alignment score is computed by the recurrence:
43
\begin{equation}
44
F(i,j) = \max\begin{cases}
45
F(i-1, j-1) + s(a_i, b_j) & \text{(match/mismatch)}\\
46
F(i-1, j) + d & \text{(gap in sequence B)}\\
47
F(i, j-1) + d & \text{(gap in sequence A)}
48
\end{cases}
49
\end{equation}
50
where $s(a_i, b_j)$ is the substitution score and $d$ is the gap penalty.
51
\end{theorem}
52

53
\subsection{Local Alignment}
54

55
\begin{theorem}[Smith-Waterman Algorithm]
56
Local alignment finds the best matching subsequences:
57
\begin{equation}
58
F(i,j) = \max\begin{cases}
59
0 & \text{(restart)}\\
60
F(i-1, j-1) + s(a_i, b_j)\\
61
F(i-1, j) + d\\
62
F(i, j-1) + d
63
\end{cases}
64
\end{equation}
65
The key difference from global alignment is the zero option that allows starting fresh.
66
\end{theorem}
67

68
\subsection{Gap Penalties}
69

70
\begin{definition}[Affine Gap Penalty]
71
A more realistic gap model penalizes gap opening differently from extension:
72
\begin{equation}
73
\gamma(g) = d + (g-1) \cdot e
74
\end{equation}
75
where $d$ is the gap opening penalty and $e$ is the extension penalty.
76
\end{definition}
77

78
\begin{remark}[Typical Values]
79
\begin{itemize}
80
    \item Linear gap: $d = -2$ to $-8$
81
    \item Affine: $d = -10$ to $-12$, $e = -1$ to $-2$
82
\end{itemize}
83
\end{remark}
84

85
\section{Computational Analysis}
86

87
\begin{pycode}
88
import numpy as np
89
import matplotlib.pyplot as plt
90
plt.rc('text', usetex=True)
91
plt.rc('font', family='serif')
92

93
np.random.seed(42)
94

95
# Scoring matrices
96
BLOSUM62 = {
97
    ('A', 'A'): 4, ('A', 'R'): -1, ('A', 'N'): -2, ('A', 'D'): -2, ('A', 'C'): 0,
98
    ('R', 'R'): 5, ('R', 'N'): 0, ('R', 'D'): -2, ('R', 'C'): -3,
99
    ('N', 'N'): 6, ('N', 'D'): 1, ('N', 'C'): -3,
100
    ('D', 'D'): 6, ('D', 'C'): -3,
101
    ('C', 'C'): 9
102
}
103

104
# Simple scoring for nucleotides
105
def score_nt(a, b, match=2, mismatch=-1):
106
    return match if a == b else mismatch
107

108
def score_aa(a, b, match=4, mismatch=-2):
109
    """Simplified amino acid scoring"""
110
    if a == b:
111
        return match
112
    # Some similar residues
113
    similar = [('I', 'L'), ('I', 'V'), ('L', 'V'), ('F', 'Y'), ('D', 'E'), ('K', 'R')]
114
    if (a, b) in similar or (b, a) in similar:
115
        return 1
116
    return mismatch
117

118
def needleman_wunsch(seq1, seq2, score_func, gap=-2):
119
    """Global alignment using Needleman-Wunsch"""
120
    m, n = len(seq1), len(seq2)
121
    F = np.zeros((m+1, n+1))
122
    traceback = np.zeros((m+1, n+1), dtype=int)  # 0: diag, 1: up, 2: left
123

124
    # Initialize
125
    for i in range(m+1):
126
        F[i, 0] = i * gap
127
    for j in range(n+1):
128
        F[0, j] = j * gap
129

130
    # Fill matrix
131
    for i in range(1, m+1):
132
        for j in range(1, n+1):
133
            match = F[i-1, j-1] + score_func(seq1[i-1], seq2[j-1])
134
            delete = F[i-1, j] + gap
135
            insert = F[i, j-1] + gap
136

137
            F[i, j] = max(match, delete, insert)
138

139
            if F[i, j] == match:
140
                traceback[i, j] = 0
141
            elif F[i, j] == delete:
142
                traceback[i, j] = 1
143
            else:
144
                traceback[i, j] = 2
145

146
    return F, traceback
147

148
def smith_waterman(seq1, seq2, score_func, gap=-2):
149
    """Local alignment using Smith-Waterman"""
150
    m, n = len(seq1), len(seq2)
151
    F = np.zeros((m+1, n+1))
152

153
    max_score = 0
154
    max_pos = (0, 0)
155

156
    for i in range(1, m+1):
157
        for j in range(1, n+1):
158
            match = F[i-1, j-1] + score_func(seq1[i-1], seq2[j-1])
159
            delete = F[i-1, j] + gap
160
            insert = F[i, j-1] + gap
161
            F[i, j] = max(0, match, delete, insert)
162

163
            if F[i, j] > max_score:
164
                max_score = F[i, j]
165
                max_pos = (i, j)
166

167
    return F, max_score, max_pos
168

169
def get_alignment(seq1, seq2, traceback):
170
    """Reconstruct alignment from traceback"""
171
    align1, align2 = '', ''
172
    i, j = len(seq1), len(seq2)
173

174
    while i > 0 or j > 0:
175
        if i > 0 and j > 0 and traceback[i, j] == 0:
176
            align1 = seq1[i-1] + align1
177
            align2 = seq2[j-1] + align2
178
            i -= 1
179
            j -= 1
180
        elif i > 0 and (j == 0 or traceback[i, j] == 1):
181
            align1 = seq1[i-1] + align1
182
            align2 = '-' + align2
183
            i -= 1
184
        else:
185
            align1 = '-' + align1
186
            align2 = seq2[j-1] + align2
187
            j -= 1
188

189
    return align1, align2
190

191
def calculate_identity(align1, align2):
192
    """Calculate percent identity"""
193
    matches = sum(1 for a, b in zip(align1, align2) if a == b and a != '-')
194
    length = len(align1)
195
    return matches / length * 100 if length > 0 else 0
196

197
def dot_plot(seq1, seq2):
198
    """Create dot plot matrix"""
199
    m, n = len(seq1), len(seq2)
200
    dot = np.zeros((m, n))
201
    for i in range(m):
202
        for j in range(n):
203
            if seq1[i] == seq2[j]:
204
                dot[i, j] = 1
205
    return dot
206

207
# Example sequences
208
# DNA sequences
209
dna1 = "AGTACGCATGC"
210
dna2 = "TATGCATGAC"
211

212
# Protein sequences
213
prot1 = "HEAGAWGHEE"
214
prot2 = "PAWHEAE"
215

216
# Run alignments
217
F_global_dna, tb_dna = needleman_wunsch(dna1, dna2, score_nt)
218
F_local_dna, max_local_dna, pos_dna = smith_waterman(dna1, dna2, score_nt)
219
align1_dna, align2_dna = get_alignment(dna1, dna2, tb_dna)
220

221
F_global_prot, tb_prot = needleman_wunsch(prot1, prot2, score_aa)
222
F_local_prot, max_local_prot, pos_prot = smith_waterman(prot1, prot2, score_aa)
223
align1_prot, align2_prot = get_alignment(prot1, prot2, tb_prot)
224

225
# Calculate statistics
226
global_score_dna = F_global_dna[-1, -1]
227
global_score_prot = F_global_prot[-1, -1]
228
identity_dna = calculate_identity(align1_dna, align2_dna)
229
identity_prot = calculate_identity(align1_prot, align2_prot)
230

231
# Statistical significance
232
n_random = 1000
233
random_scores_dna = []
234
random_scores_prot = []
235

236
for _ in range(n_random):
237
    rand_dna = ''.join(np.random.choice(['A', 'C', 'G', 'T'], len(dna2)))
238
    rand_prot = ''.join(np.random.choice(list('ACDEFGHIKLMNPQRSTVWY'), len(prot2)))
239

240
    F_rand, _ = needleman_wunsch(dna1, rand_dna, score_nt)
241
    random_scores_dna.append(F_rand[-1, -1])
242

243
    F_rand_p, _ = needleman_wunsch(prot1, rand_prot, score_aa)
244
    random_scores_prot.append(F_rand_p[-1, -1])
245

246
p_value_dna = np.sum(np.array(random_scores_dna) >= global_score_dna) / n_random
247
p_value_prot = np.sum(np.array(random_scores_prot) >= global_score_prot) / n_random
248

249
# Z-scores
250
z_dna = (global_score_dna - np.mean(random_scores_dna)) / np.std(random_scores_dna)
251
z_prot = (global_score_prot - np.mean(random_scores_prot)) / np.std(random_scores_prot)
252

253
# E-value approximation (simplified)
254
m, n = len(dna1), len(dna2)
255
db_size = 1e6  # hypothetical database size
256
lambda_param = 0.3  # typical value
257
K_param = 0.1
258
e_value_dna = K_param * m * db_size * np.exp(-lambda_param * global_score_dna)
259

260
# Create comprehensive figure
261
fig = plt.figure(figsize=(14, 16))
262

263
# Plot 1: Global alignment matrix (DNA)
264
ax1 = fig.add_subplot(3, 3, 1)
265
im1 = ax1.imshow(F_global_dna, cmap='viridis', aspect='auto')
266
ax1.set_xlabel('Sequence 2 (DNA)')
267
ax1.set_ylabel('Sequence 1 (DNA)')
268
ax1.set_title(f'Global Alignment (Score: {global_score_dna:.0f})')
269
ax1.set_xticks(range(len(dna2)+1))
270
ax1.set_yticks(range(len(dna1)+1))
271
ax1.set_xticklabels(['-'] + list(dna2), fontsize=7)
272
ax1.set_yticklabels(['-'] + list(dna1), fontsize=7)
273
plt.colorbar(im1, ax=ax1, shrink=0.8)
274

275
# Plot 2: Local alignment matrix (DNA)
276
ax2 = fig.add_subplot(3, 3, 2)
277
im2 = ax2.imshow(F_local_dna, cmap='viridis', aspect='auto')
278
ax2.plot(pos_dna[1], pos_dna[0], 'r*', markersize=15)
279
ax2.set_xlabel('Sequence 2')
280
ax2.set_ylabel('Sequence 1')
281
ax2.set_title(f'Local Alignment (Max: {max_local_dna:.0f})')
282
plt.colorbar(im2, ax=ax2, shrink=0.8)
283

284
# Plot 3: Dot plot
285
ax3 = fig.add_subplot(3, 3, 3)
286
dot = dot_plot(dna1, dna2)
287
ax3.imshow(dot, cmap='binary', aspect='auto')
288
ax3.set_xlabel('Sequence 2')
289
ax3.set_ylabel('Sequence 1')
290
ax3.set_title('Dot Plot')
291
ax3.set_xticks(range(len(dna2)))
292
ax3.set_yticks(range(len(dna1)))
293
ax3.set_xticklabels(list(dna2), fontsize=7)
294
ax3.set_yticklabels(list(dna1), fontsize=7)
295

296
# Plot 4: Score distribution (DNA)
297
ax4 = fig.add_subplot(3, 3, 4)
298
ax4.hist(random_scores_dna, bins=30, alpha=0.7, color='steelblue', edgecolor='black', density=True)
299
ax4.axvline(x=global_score_dna, color='red', linewidth=2, label=f'Score: {global_score_dna:.0f}')
300
ax4.axvline(x=np.mean(random_scores_dna), color='green', linestyle='--', linewidth=2, label='Mean')
301
ax4.set_xlabel('Alignment Score')
302
ax4.set_ylabel('Density')
303
ax4.set_title(f'Score Distribution (DNA, p={p_value_dna:.3f})')
304
ax4.legend(fontsize=7)
305
ax4.grid(True, alpha=0.3)
306

307
# Plot 5: Score distribution (Protein)
308
ax5 = fig.add_subplot(3, 3, 5)
309
ax5.hist(random_scores_prot, bins=30, alpha=0.7, color='green', edgecolor='black', density=True)
310
ax5.axvline(x=global_score_prot, color='red', linewidth=2, label=f'Score: {global_score_prot:.0f}')
311
ax5.set_xlabel('Alignment Score')
312
ax5.set_ylabel('Density')
313
ax5.set_title(f'Score Distribution (Protein, p={p_value_prot:.3f})')
314
ax5.legend(fontsize=7)
315
ax5.grid(True, alpha=0.3)
316

317
# Plot 6: Global alignment matrix (Protein)
318
ax6 = fig.add_subplot(3, 3, 6)
319
im6 = ax6.imshow(F_global_prot, cmap='plasma', aspect='auto')
320
ax6.set_xlabel('Sequence 2 (Protein)')
321
ax6.set_ylabel('Sequence 1 (Protein)')
322
ax6.set_title(f'Protein Alignment (Score: {global_score_prot:.0f})')
323
ax6.set_xticks(range(len(prot2)+1))
324
ax6.set_yticks(range(len(prot1)+1))
325
ax6.set_xticklabels(['-'] + list(prot2), fontsize=7)
326
ax6.set_yticklabels(['-'] + list(prot1), fontsize=7)
327
plt.colorbar(im6, ax=ax6, shrink=0.8)
328

329
# Plot 7: Gap penalty effect
330
ax7 = fig.add_subplot(3, 3, 7)
331
gap_penalties = [-1, -2, -4, -6, -8, -10]
332
scores_by_gap = []
333
for gap in gap_penalties:
334
    F, _ = needleman_wunsch(dna1, dna2, score_nt, gap=gap)
335
    scores_by_gap.append(F[-1, -1])
336

337
ax7.plot(gap_penalties, scores_by_gap, 'bo-', linewidth=2, markersize=8)
338
ax7.set_xlabel('Gap Penalty')
339
ax7.set_ylabel('Alignment Score')
340
ax7.set_title('Effect of Gap Penalty')
341
ax7.grid(True, alpha=0.3)
342

343
# Plot 8: Identity vs Length
344
ax8 = fig.add_subplot(3, 3, 8)
345
lengths = range(5, 101, 5)
346
identities = []
347
for L in lengths:
348
    # Random sequences of length L
349
    s1 = ''.join(np.random.choice(['A', 'C', 'G', 'T'], L))
350
    s2 = ''.join(np.random.choice(['A', 'C', 'G', 'T'], L))
351
    _, tb = needleman_wunsch(s1, s2, score_nt)
352
    a1, a2 = get_alignment(s1, s2, tb)
353
    identities.append(calculate_identity(a1, a2))
354

355
ax8.plot(lengths, identities, 'g.-', linewidth=1)
356
ax8.axhline(y=25, color='r', linestyle='--', alpha=0.7, label='Random (25\\%)')
357
ax8.set_xlabel('Sequence Length')
358
ax8.set_ylabel('Percent Identity')
359
ax8.set_title('Random Alignment Identity')
360
ax8.legend(fontsize=8)
361
ax8.grid(True, alpha=0.3)
362

363
# Plot 9: Scoring matrix heatmap (simplified BLOSUM)
364
ax9 = fig.add_subplot(3, 3, 9)
365
aas = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G']
366
n_aa = len(aas)
367
blosum_matrix = np.zeros((n_aa, n_aa))
368
for i, a1 in enumerate(aas):
369
    for j, a2 in enumerate(aas):
370
        blosum_matrix[i, j] = score_aa(a1, a2)
371

372
im9 = ax9.imshow(blosum_matrix, cmap='RdBu', aspect='auto', vmin=-4, vmax=4)
373
ax9.set_xticks(range(n_aa))
374
ax9.set_yticks(range(n_aa))
375
ax9.set_xticklabels(aas)
376
ax9.set_yticklabels(aas)
377
ax9.set_title('Substitution Matrix')
378
plt.colorbar(im9, ax=ax9, shrink=0.8)
379

380
plt.tight_layout()
381
plt.savefig('sequence_alignment_plot.pdf', bbox_inches='tight', dpi=150)
382
print(r'\begin{center}')
383
print(r'\includegraphics[width=\textwidth]{sequence_alignment_plot.pdf}')
384
print(r'\end{center}')
385
plt.close()
386
\end{pycode}
387

388
\section{Results and Analysis}
389

390
\subsection{Alignment Results}
391

392
\begin{pycode}
393
# Results table
394
print(r'\begin{table}[h]')
395
print(r'\centering')
396
print(r'\caption{Sequence Alignment Results}')
397
print(r'\begin{tabular}{lccccc}')
398
print(r'\toprule')
399
print(r'Alignment & Global Score & Local Score & Identity (\\%) & Z-score & p-value \\')
400
print(r'\midrule')
401
print(f"DNA & {global_score_dna:.0f} & {max_local_dna:.0f} & {identity_dna:.1f} & {z_dna:.2f} & {p_value_dna:.3f} \\\\")
402
print(f"Protein & {global_score_prot:.0f} & {max_local_prot:.0f} & {identity_prot:.1f} & {z_prot:.2f} & {p_value_prot:.3f} \\\\")
403
print(r'\bottomrule')
404
print(r'\end{tabular}')
405
print(r'\end{table}')
406
\end{pycode}
407

408
\subsection{Sequence Details}
409

410
\begin{example}[DNA Alignment]
411
Sequences:
412
\begin{itemize}
413
    \item Sequence 1: \texttt{\py{f"{dna1}"}}
414
    \item Sequence 2: \texttt{\py{f"{dna2}"}}
415
    \item Aligned 1: \texttt{\py{f"{align1_dna}"}}
416
    \item Aligned 2: \texttt{\py{f"{align2_dna}"}}
417
\end{itemize}
418
\end{example}
419

420
\begin{example}[Protein Alignment]
421
Sequences:
422
\begin{itemize}
423
    \item Sequence 1: \texttt{\py{f"{prot1}"}}
424
    \item Sequence 2: \texttt{\py{f"{prot2}"}}
425
    \item Aligned 1: \texttt{\py{f"{align1_prot}"}}
426
    \item Aligned 2: \texttt{\py{f"{align2_prot}"}}
427
\end{itemize}
428
\end{example}
429

430
\section{Statistical Significance}
431

432
\subsection{Z-score and P-value}
433

434
\begin{remark}[Significance Assessment]
435
Alignment significance is evaluated by comparing to random sequences:
436
\begin{itemize}
437
    \item \textbf{Z-score}: Number of standard deviations above mean
438
    \item \textbf{P-value}: Probability of score by chance
439
    \item Z $>$ 5 typically indicates significant similarity
440
    \item P $<$ 0.01 suggests true homology
441
\end{itemize}
442
\end{remark}
443

444
\subsection{E-value}
445

446
The expected number of alignments with score $\geq S$ by chance:
447
\begin{equation}
448
E = K \cdot m \cdot n \cdot e^{-\lambda S}
449
\end{equation}
450

451
\section{Scoring Matrices}
452

453
\subsection{BLOSUM Series}
454

455
BLOcks SUbstitution Matrices derived from aligned blocks:
456
\begin{itemize}
457
    \item BLOSUM62: Most widely used, 62\% identity threshold
458
    \item BLOSUM80: For closely related sequences
459
    \item BLOSUM45: For distantly related sequences
460
\end{itemize}
461

462
\subsection{PAM Series}
463

464
Point Accepted Mutation matrices based on evolutionary models:
465
\begin{itemize}
466
    \item PAM1: 1\% expected mutations
467
    \item PAM250: Distant relationships
468
\end{itemize}
469

470
\section{Limitations and Extensions}
471

472
\subsection{Model Limitations}
473
\begin{enumerate}
474
    \item \textbf{Pairwise only}: No multiple sequence alignment
475
    \item \textbf{Linear gap}: Affine penalties more realistic
476
    \item \textbf{Global/local}: No semi-global option shown
477
    \item \textbf{No profiles}: Sequence-to-sequence only
478
\end{enumerate}
479

480
\subsection{Possible Extensions}
481
\begin{itemize}
482
    \item Affine gap penalties
483
    \item Multiple sequence alignment (ClustalW, MUSCLE)
484
    \item Profile HMMs for remote homology
485
    \item BLAST heuristics for database search
486
\end{itemize}
487

488
\section{Conclusion}
489

490
This analysis demonstrates sequence alignment fundamentals:
491
\begin{itemize}
492
    \item Needleman-Wunsch provides optimal global alignment
493
    \item Smith-Waterman finds best local similarities
494
    \item DNA alignment: score = \py{f"{global_score_dna:.0f}"}, identity = \py{f"{identity_dna:.0f}"}\%
495
    \item Protein alignment: score = \py{f"{global_score_prot:.0f}"}, identity = \py{f"{identity_prot:.0f}"}\%
496
    \item Statistical significance evaluated by Z-score and p-value
497
\end{itemize}
498

499
\section*{Further Reading}
500
\begin{itemize}
501
    \item Durbin, R. et al. (1998). \textit{Biological Sequence Analysis}. Cambridge University Press.
502
    \item Altschul, S. F. et al. (1990). Basic local alignment search tool. \textit{J. Mol. Biol.}, 215, 403-410.
503
    \item Henikoff, S. \& Henikoff, J. G. (1992). Amino acid substitution matrices from protein blocks. \textit{PNAS}, 89, 10915-10919.
504
\end{itemize}
505

506
\end{document}
507

508
Product

Resources

Company