Path: blob/main/latex-templates/templates/bioinformatics/sequence_alignment.tex
51 views
unlisted
\documentclass[a4paper, 11pt]{article}1\usepackage[utf8]{inputenc}2\usepackage[T1]{fontenc}3\usepackage{amsmath, amssymb}4\usepackage{graphicx}5\usepackage{siunitx}6\usepackage{booktabs}7\usepackage{subcaption}8\usepackage[makestderr]{pythontex}910% Theorem environments11\newtheorem{definition}{Definition}12\newtheorem{theorem}{Theorem}13\newtheorem{example}{Example}14\newtheorem{remark}{Remark}1516\title{Sequence Alignment: Dynamic Programming and Scoring Matrices\\17\large Global and Local Alignment with Statistical Significance}18\author{Computational Genomics Division\\Computational Science Templates}19\date{\today}2021\begin{document}22\maketitle2324\begin{abstract}25This comprehensive analysis presents algorithms for pairwise sequence alignment. We implement the Needleman-Wunsch algorithm for global alignment and Smith-Waterman for local alignment using dynamic programming. The analysis covers scoring matrices (BLOSUM, PAM), gap penalties, traceback procedures, and statistical significance assessment. We demonstrate alignment on nucleotide and protein sequences, visualize scoring matrices, and evaluate alignment quality through comparison with random sequence distributions.26\end{abstract}2728\section{Introduction}2930Sequence alignment is fundamental to bioinformatics, enabling comparison of DNA, RNA, and protein sequences to infer homology, identify conserved regions, and predict function. Optimal alignment algorithms use dynamic programming to efficiently find the best alignment among exponentially many possibilities.3132\begin{definition}[Sequence Alignment]33An alignment of two sequences places them in a matrix to maximize similarity, allowing gaps (insertions/deletions) to optimize the correspondence. Each position is either a match, mismatch, or gap.34\end{definition}3536\section{Theoretical Framework}3738\subsection{Global Alignment}3940\begin{theorem}[Needleman-Wunsch Algorithm]41The optimal global alignment score is computed by the recurrence:42\begin{equation}43F(i,j) = \max\begin{cases}44F(i-1, j-1) + s(a_i, b_j) & \text{(match/mismatch)}\\45F(i-1, j) + d & \text{(gap in sequence B)}\\46F(i, j-1) + d & \text{(gap in sequence A)}47\end{cases}48\end{equation}49where $s(a_i, b_j)$ is the substitution score and $d$ is the gap penalty.50\end{theorem}5152\subsection{Local Alignment}5354\begin{theorem}[Smith-Waterman Algorithm]55Local alignment finds the best matching subsequences:56\begin{equation}57F(i,j) = \max\begin{cases}580 & \text{(restart)}\\59F(i-1, j-1) + s(a_i, b_j)\\60F(i-1, j) + d\\61F(i, j-1) + d62\end{cases}63\end{equation}64The key difference from global alignment is the zero option that allows starting fresh.65\end{theorem}6667\subsection{Gap Penalties}6869\begin{definition}[Affine Gap Penalty]70A more realistic gap model penalizes gap opening differently from extension:71\begin{equation}72\gamma(g) = d + (g-1) \cdot e73\end{equation}74where $d$ is the gap opening penalty and $e$ is the extension penalty.75\end{definition}7677\begin{remark}[Typical Values]78\begin{itemize}79\item Linear gap: $d = -2$ to $-8$80\item Affine: $d = -10$ to $-12$, $e = -1$ to $-2$81\end{itemize}82\end{remark}8384\section{Computational Analysis}8586\begin{pycode}87import numpy as np88import matplotlib.pyplot as plt89plt.rc('text', usetex=True)90plt.rc('font', family='serif')9192np.random.seed(42)9394# Scoring matrices95BLOSUM62 = {96('A', 'A'): 4, ('A', 'R'): -1, ('A', 'N'): -2, ('A', 'D'): -2, ('A', 'C'): 0,97('R', 'R'): 5, ('R', 'N'): 0, ('R', 'D'): -2, ('R', 'C'): -3,98('N', 'N'): 6, ('N', 'D'): 1, ('N', 'C'): -3,99('D', 'D'): 6, ('D', 'C'): -3,100('C', 'C'): 9101}102103# Simple scoring for nucleotides104def score_nt(a, b, match=2, mismatch=-1):105return match if a == b else mismatch106107def score_aa(a, b, match=4, mismatch=-2):108"""Simplified amino acid scoring"""109if a == b:110return match111# Some similar residues112similar = [('I', 'L'), ('I', 'V'), ('L', 'V'), ('F', 'Y'), ('D', 'E'), ('K', 'R')]113if (a, b) in similar or (b, a) in similar:114return 1115return mismatch116117def needleman_wunsch(seq1, seq2, score_func, gap=-2):118"""Global alignment using Needleman-Wunsch"""119m, n = len(seq1), len(seq2)120F = np.zeros((m+1, n+1))121traceback = np.zeros((m+1, n+1), dtype=int) # 0: diag, 1: up, 2: left122123# Initialize124for i in range(m+1):125F[i, 0] = i * gap126for j in range(n+1):127F[0, j] = j * gap128129# Fill matrix130for i in range(1, m+1):131for j in range(1, n+1):132match = F[i-1, j-1] + score_func(seq1[i-1], seq2[j-1])133delete = F[i-1, j] + gap134insert = F[i, j-1] + gap135136F[i, j] = max(match, delete, insert)137138if F[i, j] == match:139traceback[i, j] = 0140elif F[i, j] == delete:141traceback[i, j] = 1142else:143traceback[i, j] = 2144145return F, traceback146147def smith_waterman(seq1, seq2, score_func, gap=-2):148"""Local alignment using Smith-Waterman"""149m, n = len(seq1), len(seq2)150F = np.zeros((m+1, n+1))151152max_score = 0153max_pos = (0, 0)154155for i in range(1, m+1):156for j in range(1, n+1):157match = F[i-1, j-1] + score_func(seq1[i-1], seq2[j-1])158delete = F[i-1, j] + gap159insert = F[i, j-1] + gap160F[i, j] = max(0, match, delete, insert)161162if F[i, j] > max_score:163max_score = F[i, j]164max_pos = (i, j)165166return F, max_score, max_pos167168def get_alignment(seq1, seq2, traceback):169"""Reconstruct alignment from traceback"""170align1, align2 = '', ''171i, j = len(seq1), len(seq2)172173while i > 0 or j > 0:174if i > 0 and j > 0 and traceback[i, j] == 0:175align1 = seq1[i-1] + align1176align2 = seq2[j-1] + align2177i -= 1178j -= 1179elif i > 0 and (j == 0 or traceback[i, j] == 1):180align1 = seq1[i-1] + align1181align2 = '-' + align2182i -= 1183else:184align1 = '-' + align1185align2 = seq2[j-1] + align2186j -= 1187188return align1, align2189190def calculate_identity(align1, align2):191"""Calculate percent identity"""192matches = sum(1 for a, b in zip(align1, align2) if a == b and a != '-')193length = len(align1)194return matches / length * 100 if length > 0 else 0195196def dot_plot(seq1, seq2):197"""Create dot plot matrix"""198m, n = len(seq1), len(seq2)199dot = np.zeros((m, n))200for i in range(m):201for j in range(n):202if seq1[i] == seq2[j]:203dot[i, j] = 1204return dot205206# Example sequences207# DNA sequences208dna1 = "AGTACGCATGC"209dna2 = "TATGCATGAC"210211# Protein sequences212prot1 = "HEAGAWGHEE"213prot2 = "PAWHEAE"214215# Run alignments216F_global_dna, tb_dna = needleman_wunsch(dna1, dna2, score_nt)217F_local_dna, max_local_dna, pos_dna = smith_waterman(dna1, dna2, score_nt)218align1_dna, align2_dna = get_alignment(dna1, dna2, tb_dna)219220F_global_prot, tb_prot = needleman_wunsch(prot1, prot2, score_aa)221F_local_prot, max_local_prot, pos_prot = smith_waterman(prot1, prot2, score_aa)222align1_prot, align2_prot = get_alignment(prot1, prot2, tb_prot)223224# Calculate statistics225global_score_dna = F_global_dna[-1, -1]226global_score_prot = F_global_prot[-1, -1]227identity_dna = calculate_identity(align1_dna, align2_dna)228identity_prot = calculate_identity(align1_prot, align2_prot)229230# Statistical significance231n_random = 1000232random_scores_dna = []233random_scores_prot = []234235for _ in range(n_random):236rand_dna = ''.join(np.random.choice(['A', 'C', 'G', 'T'], len(dna2)))237rand_prot = ''.join(np.random.choice(list('ACDEFGHIKLMNPQRSTVWY'), len(prot2)))238239F_rand, _ = needleman_wunsch(dna1, rand_dna, score_nt)240random_scores_dna.append(F_rand[-1, -1])241242F_rand_p, _ = needleman_wunsch(prot1, rand_prot, score_aa)243random_scores_prot.append(F_rand_p[-1, -1])244245p_value_dna = np.sum(np.array(random_scores_dna) >= global_score_dna) / n_random246p_value_prot = np.sum(np.array(random_scores_prot) >= global_score_prot) / n_random247248# Z-scores249z_dna = (global_score_dna - np.mean(random_scores_dna)) / np.std(random_scores_dna)250z_prot = (global_score_prot - np.mean(random_scores_prot)) / np.std(random_scores_prot)251252# E-value approximation (simplified)253m, n = len(dna1), len(dna2)254db_size = 1e6 # hypothetical database size255lambda_param = 0.3 # typical value256K_param = 0.1257e_value_dna = K_param * m * db_size * np.exp(-lambda_param * global_score_dna)258259# Create comprehensive figure260fig = plt.figure(figsize=(14, 16))261262# Plot 1: Global alignment matrix (DNA)263ax1 = fig.add_subplot(3, 3, 1)264im1 = ax1.imshow(F_global_dna, cmap='viridis', aspect='auto')265ax1.set_xlabel('Sequence 2 (DNA)')266ax1.set_ylabel('Sequence 1 (DNA)')267ax1.set_title(f'Global Alignment (Score: {global_score_dna:.0f})')268ax1.set_xticks(range(len(dna2)+1))269ax1.set_yticks(range(len(dna1)+1))270ax1.set_xticklabels(['-'] + list(dna2), fontsize=7)271ax1.set_yticklabels(['-'] + list(dna1), fontsize=7)272plt.colorbar(im1, ax=ax1, shrink=0.8)273274# Plot 2: Local alignment matrix (DNA)275ax2 = fig.add_subplot(3, 3, 2)276im2 = ax2.imshow(F_local_dna, cmap='viridis', aspect='auto')277ax2.plot(pos_dna[1], pos_dna[0], 'r*', markersize=15)278ax2.set_xlabel('Sequence 2')279ax2.set_ylabel('Sequence 1')280ax2.set_title(f'Local Alignment (Max: {max_local_dna:.0f})')281plt.colorbar(im2, ax=ax2, shrink=0.8)282283# Plot 3: Dot plot284ax3 = fig.add_subplot(3, 3, 3)285dot = dot_plot(dna1, dna2)286ax3.imshow(dot, cmap='binary', aspect='auto')287ax3.set_xlabel('Sequence 2')288ax3.set_ylabel('Sequence 1')289ax3.set_title('Dot Plot')290ax3.set_xticks(range(len(dna2)))291ax3.set_yticks(range(len(dna1)))292ax3.set_xticklabels(list(dna2), fontsize=7)293ax3.set_yticklabels(list(dna1), fontsize=7)294295# Plot 4: Score distribution (DNA)296ax4 = fig.add_subplot(3, 3, 4)297ax4.hist(random_scores_dna, bins=30, alpha=0.7, color='steelblue', edgecolor='black', density=True)298ax4.axvline(x=global_score_dna, color='red', linewidth=2, label=f'Score: {global_score_dna:.0f}')299ax4.axvline(x=np.mean(random_scores_dna), color='green', linestyle='--', linewidth=2, label='Mean')300ax4.set_xlabel('Alignment Score')301ax4.set_ylabel('Density')302ax4.set_title(f'Score Distribution (DNA, p={p_value_dna:.3f})')303ax4.legend(fontsize=7)304ax4.grid(True, alpha=0.3)305306# Plot 5: Score distribution (Protein)307ax5 = fig.add_subplot(3, 3, 5)308ax5.hist(random_scores_prot, bins=30, alpha=0.7, color='green', edgecolor='black', density=True)309ax5.axvline(x=global_score_prot, color='red', linewidth=2, label=f'Score: {global_score_prot:.0f}')310ax5.set_xlabel('Alignment Score')311ax5.set_ylabel('Density')312ax5.set_title(f'Score Distribution (Protein, p={p_value_prot:.3f})')313ax5.legend(fontsize=7)314ax5.grid(True, alpha=0.3)315316# Plot 6: Global alignment matrix (Protein)317ax6 = fig.add_subplot(3, 3, 6)318im6 = ax6.imshow(F_global_prot, cmap='plasma', aspect='auto')319ax6.set_xlabel('Sequence 2 (Protein)')320ax6.set_ylabel('Sequence 1 (Protein)')321ax6.set_title(f'Protein Alignment (Score: {global_score_prot:.0f})')322ax6.set_xticks(range(len(prot2)+1))323ax6.set_yticks(range(len(prot1)+1))324ax6.set_xticklabels(['-'] + list(prot2), fontsize=7)325ax6.set_yticklabels(['-'] + list(prot1), fontsize=7)326plt.colorbar(im6, ax=ax6, shrink=0.8)327328# Plot 7: Gap penalty effect329ax7 = fig.add_subplot(3, 3, 7)330gap_penalties = [-1, -2, -4, -6, -8, -10]331scores_by_gap = []332for gap in gap_penalties:333F, _ = needleman_wunsch(dna1, dna2, score_nt, gap=gap)334scores_by_gap.append(F[-1, -1])335336ax7.plot(gap_penalties, scores_by_gap, 'bo-', linewidth=2, markersize=8)337ax7.set_xlabel('Gap Penalty')338ax7.set_ylabel('Alignment Score')339ax7.set_title('Effect of Gap Penalty')340ax7.grid(True, alpha=0.3)341342# Plot 8: Identity vs Length343ax8 = fig.add_subplot(3, 3, 8)344lengths = range(5, 101, 5)345identities = []346for L in lengths:347# Random sequences of length L348s1 = ''.join(np.random.choice(['A', 'C', 'G', 'T'], L))349s2 = ''.join(np.random.choice(['A', 'C', 'G', 'T'], L))350_, tb = needleman_wunsch(s1, s2, score_nt)351a1, a2 = get_alignment(s1, s2, tb)352identities.append(calculate_identity(a1, a2))353354ax8.plot(lengths, identities, 'g.-', linewidth=1)355ax8.axhline(y=25, color='r', linestyle='--', alpha=0.7, label='Random (25\\%)')356ax8.set_xlabel('Sequence Length')357ax8.set_ylabel('Percent Identity')358ax8.set_title('Random Alignment Identity')359ax8.legend(fontsize=8)360ax8.grid(True, alpha=0.3)361362# Plot 9: Scoring matrix heatmap (simplified BLOSUM)363ax9 = fig.add_subplot(3, 3, 9)364aas = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G']365n_aa = len(aas)366blosum_matrix = np.zeros((n_aa, n_aa))367for i, a1 in enumerate(aas):368for j, a2 in enumerate(aas):369blosum_matrix[i, j] = score_aa(a1, a2)370371im9 = ax9.imshow(blosum_matrix, cmap='RdBu', aspect='auto', vmin=-4, vmax=4)372ax9.set_xticks(range(n_aa))373ax9.set_yticks(range(n_aa))374ax9.set_xticklabels(aas)375ax9.set_yticklabels(aas)376ax9.set_title('Substitution Matrix')377plt.colorbar(im9, ax=ax9, shrink=0.8)378379plt.tight_layout()380plt.savefig('sequence_alignment_plot.pdf', bbox_inches='tight', dpi=150)381print(r'\begin{center}')382print(r'\includegraphics[width=\textwidth]{sequence_alignment_plot.pdf}')383print(r'\end{center}')384plt.close()385\end{pycode}386387\section{Results and Analysis}388389\subsection{Alignment Results}390391\begin{pycode}392# Results table393print(r'\begin{table}[h]')394print(r'\centering')395print(r'\caption{Sequence Alignment Results}')396print(r'\begin{tabular}{lccccc}')397print(r'\toprule')398print(r'Alignment & Global Score & Local Score & Identity (\\%) & Z-score & p-value \\')399print(r'\midrule')400print(f"DNA & {global_score_dna:.0f} & {max_local_dna:.0f} & {identity_dna:.1f} & {z_dna:.2f} & {p_value_dna:.3f} \\\\")401print(f"Protein & {global_score_prot:.0f} & {max_local_prot:.0f} & {identity_prot:.1f} & {z_prot:.2f} & {p_value_prot:.3f} \\\\")402print(r'\bottomrule')403print(r'\end{tabular}')404print(r'\end{table}')405\end{pycode}406407\subsection{Sequence Details}408409\begin{example}[DNA Alignment]410Sequences:411\begin{itemize}412\item Sequence 1: \texttt{\py{f"{dna1}"}}413\item Sequence 2: \texttt{\py{f"{dna2}"}}414\item Aligned 1: \texttt{\py{f"{align1_dna}"}}415\item Aligned 2: \texttt{\py{f"{align2_dna}"}}416\end{itemize}417\end{example}418419\begin{example}[Protein Alignment]420Sequences:421\begin{itemize}422\item Sequence 1: \texttt{\py{f"{prot1}"}}423\item Sequence 2: \texttt{\py{f"{prot2}"}}424\item Aligned 1: \texttt{\py{f"{align1_prot}"}}425\item Aligned 2: \texttt{\py{f"{align2_prot}"}}426\end{itemize}427\end{example}428429\section{Statistical Significance}430431\subsection{Z-score and P-value}432433\begin{remark}[Significance Assessment]434Alignment significance is evaluated by comparing to random sequences:435\begin{itemize}436\item \textbf{Z-score}: Number of standard deviations above mean437\item \textbf{P-value}: Probability of score by chance438\item Z $>$ 5 typically indicates significant similarity439\item P $<$ 0.01 suggests true homology440\end{itemize}441\end{remark}442443\subsection{E-value}444445The expected number of alignments with score $\geq S$ by chance:446\begin{equation}447E = K \cdot m \cdot n \cdot e^{-\lambda S}448\end{equation}449450\section{Scoring Matrices}451452\subsection{BLOSUM Series}453454BLOcks SUbstitution Matrices derived from aligned blocks:455\begin{itemize}456\item BLOSUM62: Most widely used, 62\% identity threshold457\item BLOSUM80: For closely related sequences458\item BLOSUM45: For distantly related sequences459\end{itemize}460461\subsection{PAM Series}462463Point Accepted Mutation matrices based on evolutionary models:464\begin{itemize}465\item PAM1: 1\% expected mutations466\item PAM250: Distant relationships467\end{itemize}468469\section{Limitations and Extensions}470471\subsection{Model Limitations}472\begin{enumerate}473\item \textbf{Pairwise only}: No multiple sequence alignment474\item \textbf{Linear gap}: Affine penalties more realistic475\item \textbf{Global/local}: No semi-global option shown476\item \textbf{No profiles}: Sequence-to-sequence only477\end{enumerate}478479\subsection{Possible Extensions}480\begin{itemize}481\item Affine gap penalties482\item Multiple sequence alignment (ClustalW, MUSCLE)483\item Profile HMMs for remote homology484\item BLAST heuristics for database search485\end{itemize}486487\section{Conclusion}488489This analysis demonstrates sequence alignment fundamentals:490\begin{itemize}491\item Needleman-Wunsch provides optimal global alignment492\item Smith-Waterman finds best local similarities493\item DNA alignment: score = \py{f"{global_score_dna:.0f}"}, identity = \py{f"{identity_dna:.0f}"}\%494\item Protein alignment: score = \py{f"{global_score_prot:.0f}"}, identity = \py{f"{identity_prot:.0f}"}\%495\item Statistical significance evaluated by Z-score and p-value496\end{itemize}497498\section*{Further Reading}499\begin{itemize}500\item Durbin, R. et al. (1998). \textit{Biological Sequence Analysis}. Cambridge University Press.501\item Altschul, S. F. et al. (1990). Basic local alignment search tool. \textit{J. Mol. Biol.}, 215, 403-410.502\item Henikoff, S. \& Henikoff, J. G. (1992). Amino acid substitution matrices from protein blocks. \textit{PNAS}, 89, 10915-10919.503\end{itemize}504505\end{document}506507508