Path: blob/main/latex-templates/templates/bioinformatics/protein_structure.tex
51 views
unlisted
\documentclass[a4paper, 11pt]{article}1\usepackage[utf8]{inputenc}2\usepackage[T1]{fontenc}3\usepackage{amsmath, amssymb}4\usepackage{graphicx}5\usepackage{siunitx}6\usepackage{booktabs}7\usepackage{subcaption}8\usepackage[makestderr]{pythontex}910% Theorem environments11\newtheorem{definition}{Definition}12\newtheorem{theorem}{Theorem}13\newtheorem{example}{Example}14\newtheorem{remark}{Remark}1516\title{Protein Structure Analysis: From Backbone Geometry to Fold Recognition\\17\large Ramachandran Analysis, Secondary Structure, and Structural Comparison}18\author{Structural Bioinformatics Division\\Computational Science Templates}19\date{\today}2021\begin{document}22\maketitle2324\begin{abstract}25This comprehensive analysis presents methods for analyzing protein three-dimensional structure. We cover backbone dihedral angle analysis through Ramachandran plots, secondary structure prediction using propensity scales, structural comparison using RMSD and TM-score, and contact map analysis for fold topology. The analysis includes the geometric principles underlying protein conformation, statistical analysis of allowed regions in $\phi$-$\psi$ space, and methods for comparing protein structures to identify homologs and predict function.26\end{abstract}2728\section{Introduction}2930Protein structure determines function. Understanding the three-dimensional arrangement of amino acids reveals how proteins catalyze reactions, bind ligands, and interact with other molecules. Structural bioinformatics provides computational tools to analyze, compare, and predict protein structures.3132\begin{definition}[Protein Structure Hierarchy]33\begin{itemize}34\item \textbf{Primary}: Amino acid sequence35\item \textbf{Secondary}: Local structure ($\alpha$-helix, $\beta$-sheet, loops)36\item \textbf{Tertiary}: Complete 3D fold of a single chain37\item \textbf{Quaternary}: Multi-chain assembly38\end{itemize}39\end{definition}4041\section{Theoretical Framework}4243\subsection{Backbone Dihedral Angles}4445The protein backbone is defined by three repeating atoms: N-C$_\alpha$-C. Conformation is specified by dihedral angles:4647\begin{definition}[Backbone Dihedrals]48\begin{align}49\phi &: C_{i-1} - N_i - C_\alpha^i - C_i \\50\psi &: N_i - C_\alpha^i - C_i - N_{i+1} \\51\omega &: C_\alpha^i - C_i - N_{i+1} - C_\alpha^{i+1}52\end{align}53The peptide bond angle $\omega$ is typically $180^\circ$ (trans) or $0^\circ$ (cis, rare).54\end{definition}5556\subsection{Allowed Conformations}5758Steric clashes restrict $\phi$ and $\psi$ to specific regions:5960\begin{remark}[Ramachandran Regions]61\begin{itemize}62\item $\alpha$-helix: $\phi \approx -60^\circ$, $\psi \approx -45^\circ$63\item $\beta$-sheet: $\phi \approx -120^\circ$, $\psi \approx +130^\circ$64\item Left-handed helix: $\phi \approx +60^\circ$, $\psi \approx +45^\circ$ (glycine only)65\item Polyproline II: $\phi \approx -75^\circ$, $\psi \approx +145^\circ$66\end{itemize}67\end{remark}6869\subsection{Structural Comparison}7071\begin{theorem}[Root Mean Square Deviation]72RMSD measures structural similarity after optimal superposition:73\begin{equation}74\text{RMSD} = \sqrt{\frac{1}{N}\sum_{i=1}^{N}|\mathbf{r}_i^A - \mathbf{r}_i^B|^2}75\end{equation}76where $\mathbf{r}_i$ are atomic coordinates after alignment.77\end{theorem}7879\begin{definition}[TM-score]80A length-independent measure of structural similarity:81\begin{equation}82\text{TM-score} = \frac{1}{L_N}\sum_{i=1}^{L_{ali}}\frac{1}{1+(d_i/d_0)^2}83\end{equation}84where $d_0 = 1.24\sqrt[3]{L_N - 15} - 1.8$ \AA{} and $L_N$ is the target length.85\end{definition}8687\section{Computational Analysis}8889\begin{pycode}90import numpy as np91import matplotlib.pyplot as plt92from scipy import stats93plt.rc('text', usetex=True)94plt.rc('font', family='serif')9596np.random.seed(42)9798# Simulate realistic Ramachandran plot data99n_residues = 1000100101# Secondary structure populations based on PDB statistics102structures = {103'alpha': {'phi': (-63, 12), 'psi': (-42, 12), 'n': 400},104'beta': {'phi': (-119, 15), 'psi': (135, 18), 'n': 300},105'310': {'phi': (-60, 10), 'psi': (-26, 10), 'n': 50},106'pi': {'phi': (-57, 8), 'psi': (-70, 8), 'n': 20},107'ppII': {'phi': (-75, 10), 'psi': (145, 12), 'n': 80},108'left': {'phi': (60, 12), 'psi': (40, 12), 'n': 30},109'coil': {'phi': (0, 60), 'psi': (0, 60), 'n': 120}110}111112phi_all = []113psi_all = []114labels = []115116for struct, params in structures.items():117phi = np.random.normal(params['phi'][0], params['phi'][1], params['n'])118psi = np.random.normal(params['psi'][0], params['psi'][1], params['n'])119120# Wrap to [-180, 180]121phi = ((phi + 180) % 360) - 180122psi = ((psi + 180) % 360) - 180123124phi_all.extend(phi)125psi_all.extend(psi)126labels.extend([struct] * params['n'])127128phi_all = np.array(phi_all)129psi_all = np.array(psi_all)130131# Chou-Fasman propensity parameters132amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',133'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']134135aa_names = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile',136'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val']137138helix_prop = [1.42, 0.98, 0.67, 1.01, 0.70, 1.11, 1.51, 0.57,1391.00, 1.08, 1.21, 1.16, 1.45, 1.13, 0.57, 0.77,1400.83, 1.08, 0.69, 1.06]141142sheet_prop = [0.83, 0.93, 0.89, 0.54, 1.19, 1.10, 0.37, 0.75,1430.87, 1.60, 1.30, 0.74, 1.05, 1.38, 0.55, 0.75,1441.19, 1.37, 1.47, 1.70]145146turn_prop = [0.66, 0.95, 1.56, 1.46, 1.19, 0.98, 0.74, 1.56,1470.95, 0.47, 0.59, 1.01, 0.60, 0.60, 1.52, 1.43,1480.96, 0.96, 1.14, 0.50]149150# RMSD distribution simulation151def calculate_rmsd(coords1, coords2):152"""Calculate RMSD between two coordinate sets"""153return np.sqrt(np.mean(np.sum((coords1 - coords2)**2, axis=1)))154155def kabsch_rmsd(P, Q):156"""RMSD after optimal superposition (Kabsch algorithm)"""157# Center both structures158P_centered = P - np.mean(P, axis=0)159Q_centered = Q - np.mean(Q, axis=0)160161# Compute covariance matrix162H = P_centered.T @ Q_centered163164# SVD165U, S, Vt = np.linalg.svd(H)166167# Optimal rotation168R = Vt.T @ U.T169170# Handle reflection171if np.linalg.det(R) < 0:172Vt[-1, :] *= -1173R = Vt.T @ U.T174175# Apply rotation176Q_rotated = Q_centered @ R177178return np.sqrt(np.mean(np.sum((P_centered - Q_rotated)**2, axis=1)))179180# Generate RMSD distribution181n_comparisons = 500182rmsds = []183tm_scores = []184185for _ in range(n_comparisons):186n_atoms = np.random.randint(50, 200)187coords1 = np.random.randn(n_atoms, 3) * 20188perturbation = np.random.randn(n_atoms, 3) * np.random.uniform(0.5, 5)189coords2 = coords1 + perturbation190rmsds.append(kabsch_rmsd(coords1, coords2))191192# Approximate TM-score193d0 = 1.24 * (n_atoms - 15)**(1/3) - 1.8194d_i = np.sqrt(np.sum((coords1 - coords2)**2, axis=1))195tm = np.mean(1 / (1 + (d_i/d0)**2))196tm_scores.append(tm)197198# Contact map simulation199def generate_contact_map(n_residues, secondary_structure='mixed'):200"""Generate realistic contact map"""201contacts = np.zeros((n_residues, n_residues))202203# Local contacts (sequence neighbors)204for i in range(n_residues - 1):205contacts[i, i+1] = 1206contacts[i+1, i] = 1207208# Helical pattern (i, i+3/4)209for i in range(0, n_residues - 4, 5):210for j in range(min(20, n_residues - i)):211if j in [3, 4]:212contacts[i, i+j] = 1213contacts[i+j, i] = 1214215# Antiparallel beta sheet pattern216for i in range(0, n_residues//2):217j = n_residues - 1 - i218if j > i + 10:219contacts[i, j] = 1220contacts[j, i] = 1221222# Random long-range contacts223for _ in range(n_residues // 5):224i, j = np.random.randint(0, n_residues, 2)225if abs(i - j) > 8:226contacts[i, j] = 1227contacts[j, i] = 1228229return contacts230231n_res_map = 80232contact_map = generate_contact_map(n_res_map)233234# Create comprehensive figure235fig = plt.figure(figsize=(14, 16))236237# Plot 1: Ramachandran plot with density238ax1 = fig.add_subplot(3, 3, 1)239240# 2D histogram for density241H, xedges, yedges = np.histogram2d(phi_all, psi_all, bins=60, range=[[-180, 180], [-180, 180]])242extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]243ax1.imshow(H.T, extent=extent, origin='lower', cmap='YlOrRd', aspect='auto')244245ax1.set_xlabel('$\\phi$ (degrees)')246ax1.set_ylabel('$\\psi$ (degrees)')247ax1.set_title('Ramachandran Plot')248ax1.set_xlim([-180, 180])249ax1.set_ylim([-180, 180])250ax1.grid(True, alpha=0.3)251252# Plot 2: Ramachandran with scatter colored by structure253ax2 = fig.add_subplot(3, 3, 2)254colors = {'alpha': 'red', 'beta': 'blue', '310': 'orange', 'pi': 'purple',255'ppII': 'cyan', 'left': 'green', 'coil': 'gray'}256257for struct in structures.keys():258mask = [l == struct for l in labels]259ax2.scatter(phi_all[mask], psi_all[mask], c=colors[struct], s=5, alpha=0.5, label=struct)260261ax2.set_xlabel('$\\phi$ (degrees)')262ax2.set_ylabel('$\\psi$ (degrees)')263ax2.set_title('Secondary Structure Classification')264ax2.legend(fontsize=6, ncol=2, loc='upper right')265ax2.set_xlim([-180, 180])266ax2.set_ylim([-180, 180])267ax2.grid(True, alpha=0.3)268269# Plot 3: Propensity comparison270ax3 = fig.add_subplot(3, 3, 3)271x = np.arange(len(amino_acids))272width = 0.25273274ax3.bar(x - width, helix_prop, width, label='$\\alpha$-helix', color='red', alpha=0.7)275ax3.bar(x, sheet_prop, width, label='$\\beta$-sheet', color='blue', alpha=0.7)276ax3.bar(x + width, turn_prop, width, label='Turn', color='green', alpha=0.7)277ax3.axhline(y=1.0, color='black', linestyle='--', alpha=0.5)278279ax3.set_xlabel('Amino Acid')280ax3.set_ylabel('Propensity')281ax3.set_title('Chou-Fasman Propensities')282ax3.set_xticks(x)283ax3.set_xticklabels(amino_acids, fontsize=6)284ax3.legend(fontsize=7)285ax3.grid(True, alpha=0.3, axis='y')286287# Plot 4: RMSD distribution288ax4 = fig.add_subplot(3, 3, 4)289ax4.hist(rmsds, bins=30, alpha=0.7, color='steelblue', edgecolor='black')290ax4.axvline(x=np.mean(rmsds), color='red', linestyle='--', linewidth=2, label=f'Mean: {np.mean(rmsds):.1f}')291ax4.set_xlabel('RMSD (\\AA)')292ax4.set_ylabel('Frequency')293ax4.set_title('RMSD Distribution')294ax4.legend(fontsize=8)295ax4.grid(True, alpha=0.3)296297# Plot 5: TM-score distribution298ax5 = fig.add_subplot(3, 3, 5)299ax5.hist(tm_scores, bins=30, alpha=0.7, color='green', edgecolor='black')300ax5.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='Fold threshold')301ax5.axvline(x=0.17, color='orange', linestyle='--', linewidth=2, label='Random')302ax5.set_xlabel('TM-score')303ax5.set_ylabel('Frequency')304ax5.set_title('TM-score Distribution')305ax5.legend(fontsize=7)306ax5.grid(True, alpha=0.3)307308# Plot 6: Contact map309ax6 = fig.add_subplot(3, 3, 6)310ax6.imshow(contact_map, cmap='binary', origin='lower', aspect='equal')311ax6.set_xlabel('Residue $i$')312ax6.set_ylabel('Residue $j$')313ax6.set_title('Contact Map')314315# Plot 7: Contact density by distance316ax7 = fig.add_subplot(3, 3, 7)317distances = []318contact_fracs = []319for d in range(1, 40):320mask = np.abs(np.arange(n_res_map)[:, None] - np.arange(n_res_map)) == d321contacts_at_d = contact_map[mask]322distances.append(d)323contact_fracs.append(np.mean(contacts_at_d))324325ax7.bar(distances, contact_fracs, color='purple', alpha=0.7)326ax7.set_xlabel('Sequence Separation')327ax7.set_ylabel('Contact Frequency')328ax7.set_title('Contact vs. Sequence Distance')329ax7.grid(True, alpha=0.3, axis='y')330331# Plot 8: Helix formers vs breakers332ax8 = fig.add_subplot(3, 3, 8)333sorted_idx = np.argsort(helix_prop)[::-1]334colors_bar = ['red' if helix_prop[i] > 1.1 else 'blue' if helix_prop[i] < 0.9 else 'gray'335for i in sorted_idx]336ax8.barh([amino_acids[i] for i in sorted_idx],337[helix_prop[i] for i in sorted_idx], color=colors_bar, alpha=0.7)338ax8.axvline(x=1.0, color='black', linestyle='--', alpha=0.7)339ax8.set_xlabel('Helix Propensity')340ax8.set_title('Helix Formers and Breakers')341ax8.grid(True, alpha=0.3, axis='x')342343# Plot 9: Phi-psi outlier detection344ax9 = fig.add_subplot(3, 3, 9)345# Calculate Z-scores for detection of outliers346core_phi = phi_all[(labels == 'alpha') | (np.array(labels) == 'beta')]347core_psi = psi_all[(np.array(labels) == 'alpha') | (np.array(labels) == 'beta')]348349# Distance from nearest cluster center350helix_center = (-63, -42)351sheet_center = (-119, 135)352353dist_helix = np.sqrt((phi_all - helix_center[0])**2 + (psi_all - helix_center[1])**2)354dist_sheet = np.sqrt((phi_all - sheet_center[0])**2 + (psi_all - sheet_center[1])**2)355min_dist = np.minimum(dist_helix, dist_sheet)356357outliers = min_dist > 50358ax9.scatter(phi_all[~outliers], psi_all[~outliers], c='blue', s=3, alpha=0.3, label='Core')359ax9.scatter(phi_all[outliers], psi_all[outliers], c='red', s=10, alpha=0.7, label='Outlier')360ax9.set_xlabel('$\\phi$ (degrees)')361ax9.set_ylabel('$\\psi$ (degrees)')362ax9.set_title('Outlier Detection')363ax9.legend(fontsize=8)364ax9.set_xlim([-180, 180])365ax9.set_ylim([-180, 180])366ax9.grid(True, alpha=0.3)367368plt.tight_layout()369plt.savefig('protein_structure_plot.pdf', bbox_inches='tight', dpi=150)370print(r'\begin{center}')371print(r'\includegraphics[width=\textwidth]{protein_structure_plot.pdf}')372print(r'\end{center}')373plt.close()374375# Statistics376mean_rmsd = np.mean(rmsds)377mean_tm = np.mean(tm_scores)378helix_fraction = structures['alpha']['n'] / len(phi_all) * 100379sheet_fraction = structures['beta']['n'] / len(phi_all) * 100380n_outliers = np.sum(outliers)381\end{pycode}382383\section{Results and Analysis}384385\subsection{Ramachandran Statistics}386387\begin{pycode}388# Structure statistics table389print(r'\begin{table}[h]')390print(r'\centering')391print(r'\caption{Secondary Structure Distribution}')392print(r'\begin{tabular}{lcccc}')393print(r'\toprule')394print(r'Structure & Count & Fraction (\\%) & $\phi$ & $\psi$ \\')395print(r'\midrule')396for struct, params in structures.items():397frac = params['n'] / len(phi_all) * 100398print(f"{struct.capitalize()} & {params['n']} & {frac:.1f} & {params['phi'][0]}$^\\circ$ & {params['psi'][0]}$^\\circ$ \\\\")399print(r'\midrule')400print(f"Total & {len(phi_all)} & 100 & -- & -- \\\\")401print(r'\bottomrule')402print(r'\end{tabular}')403print(r'\end{table}')404\end{pycode}405406\subsection{Structural Comparison}407408\begin{example}[RMSD Analysis]409From \py{f"{n_comparisons}"} structural comparisons:410\begin{itemize}411\item Mean RMSD: \py{f"{mean_rmsd:.2f}"} \AA412\item Mean TM-score: \py{f"{mean_tm:.3f}"}413\item RMSD $<$ 2 \AA: typically same fold414\item TM-score $>$ 0.5: same fold (normalized)415\end{itemize}416\end{example}417418\subsection{Quality Assessment}419420\begin{pycode}421# Quality metrics422print(r'\begin{table}[h]')423print(r'\centering')424print(r'\caption{Structure Quality Indicators}')425print(r'\begin{tabular}{lc}')426print(r'\toprule')427print(r'Metric & Value \\')428print(r'\midrule')429core_frac = 100 - n_outliers/len(phi_all)*100430print(f"Core region residues & {core_frac:.1f}\\% \\\\")431print(f"Allowed region residues & {100-n_outliers/len(phi_all)*100:.1f}\\% \\\\")432print(f"Outliers (generously allowed) & {n_outliers} \\\\")433print(f"Total contacts & {int(np.sum(contact_map)/2)} \\\\")434print(r'\bottomrule')435print(r'\end{tabular}')436print(r'\end{table}')437\end{pycode}438439\section{Secondary Structure Prediction}440441\subsection{Chou-Fasman Algorithm}442443\begin{remark}[Prediction Steps]444\begin{enumerate}445\item Calculate propensity for each residue446\item Identify nucleation sites (consecutive high-propensity residues)447\item Extend regions until breaker residues448\item Resolve overlapping predictions449\end{enumerate}450Accuracy: $\sim$60-65\% (3-state)451\end{remark}452453\subsection{Modern Methods}454455Machine learning methods achieve higher accuracy:456\begin{itemize}457\item Neural networks: $\sim$75-80\%458\item PSIPRED (profile-based): $\sim$80\%459\item AlphaFold (end-to-end): $>$90\%460\end{itemize}461462\section{Limitations and Extensions}463464\subsection{Model Limitations}465\begin{enumerate}466\item \textbf{Static view}: No dynamics/flexibility467\item \textbf{Local propensity}: Ignores long-range interactions468\item \textbf{Simplified contacts}: Binary rather than distance-based469\item \textbf{No side chains}: Backbone-only analysis470\end{enumerate}471472\subsection{Possible Extensions}473\begin{itemize}474\item Side chain rotamer libraries475\item Molecular dynamics analysis476\item AlphaFold structure prediction477\item Protein-ligand docking478\end{itemize}479480\section{Conclusion}481482This analysis demonstrates protein structural bioinformatics:483\begin{itemize}484\item Ramachandran plot validates backbone geometry485\item \py{f"{helix_fraction:.0f}"}\% $\alpha$-helix, \py{f"{sheet_fraction:.0f}"}\% $\beta$-sheet486\item RMSD and TM-score quantify structural similarity487\item Contact maps reveal fold topology488\item Propensity scales enable structure prediction489\end{itemize}490491\section*{Further Reading}492\begin{itemize}493\item Branden, C. \& Tooze, J. (1999). \textit{Introduction to Protein Structure}. Garland Science.494\item Lesk, A. M. (2010). \textit{Introduction to Protein Science}. Oxford University Press.495\item Zhang, Y. \& Skolnick, J. (2004). Scoring function for automated assessment of protein structure template quality. \textit{Proteins}, 57, 702-710.496\end{itemize}497498\end{document}499500501