Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Ok-landscape
GitHub Repository: Ok-landscape/computational-pipeline
Path: blob/main/latex-templates/templates/bioinformatics/protein_structure.tex
51 views
unlisted
1
\documentclass[a4paper, 11pt]{article}
2
\usepackage[utf8]{inputenc}
3
\usepackage[T1]{fontenc}
4
\usepackage{amsmath, amssymb}
5
\usepackage{graphicx}
6
\usepackage{siunitx}
7
\usepackage{booktabs}
8
\usepackage{subcaption}
9
\usepackage[makestderr]{pythontex}
10
11
% Theorem environments
12
\newtheorem{definition}{Definition}
13
\newtheorem{theorem}{Theorem}
14
\newtheorem{example}{Example}
15
\newtheorem{remark}{Remark}
16
17
\title{Protein Structure Analysis: From Backbone Geometry to Fold Recognition\\
18
\large Ramachandran Analysis, Secondary Structure, and Structural Comparison}
19
\author{Structural Bioinformatics Division\\Computational Science Templates}
20
\date{\today}
21
22
\begin{document}
23
\maketitle
24
25
\begin{abstract}
26
This comprehensive analysis presents methods for analyzing protein three-dimensional structure. We cover backbone dihedral angle analysis through Ramachandran plots, secondary structure prediction using propensity scales, structural comparison using RMSD and TM-score, and contact map analysis for fold topology. The analysis includes the geometric principles underlying protein conformation, statistical analysis of allowed regions in $\phi$-$\psi$ space, and methods for comparing protein structures to identify homologs and predict function.
27
\end{abstract}
28
29
\section{Introduction}
30
31
Protein structure determines function. Understanding the three-dimensional arrangement of amino acids reveals how proteins catalyze reactions, bind ligands, and interact with other molecules. Structural bioinformatics provides computational tools to analyze, compare, and predict protein structures.
32
33
\begin{definition}[Protein Structure Hierarchy]
34
\begin{itemize}
35
\item \textbf{Primary}: Amino acid sequence
36
\item \textbf{Secondary}: Local structure ($\alpha$-helix, $\beta$-sheet, loops)
37
\item \textbf{Tertiary}: Complete 3D fold of a single chain
38
\item \textbf{Quaternary}: Multi-chain assembly
39
\end{itemize}
40
\end{definition}
41
42
\section{Theoretical Framework}
43
44
\subsection{Backbone Dihedral Angles}
45
46
The protein backbone is defined by three repeating atoms: N-C$_\alpha$-C. Conformation is specified by dihedral angles:
47
48
\begin{definition}[Backbone Dihedrals]
49
\begin{align}
50
\phi &: C_{i-1} - N_i - C_\alpha^i - C_i \\
51
\psi &: N_i - C_\alpha^i - C_i - N_{i+1} \\
52
\omega &: C_\alpha^i - C_i - N_{i+1} - C_\alpha^{i+1}
53
\end{align}
54
The peptide bond angle $\omega$ is typically $180^\circ$ (trans) or $0^\circ$ (cis, rare).
55
\end{definition}
56
57
\subsection{Allowed Conformations}
58
59
Steric clashes restrict $\phi$ and $\psi$ to specific regions:
60
61
\begin{remark}[Ramachandran Regions]
62
\begin{itemize}
63
\item $\alpha$-helix: $\phi \approx -60^\circ$, $\psi \approx -45^\circ$
64
\item $\beta$-sheet: $\phi \approx -120^\circ$, $\psi \approx +130^\circ$
65
\item Left-handed helix: $\phi \approx +60^\circ$, $\psi \approx +45^\circ$ (glycine only)
66
\item Polyproline II: $\phi \approx -75^\circ$, $\psi \approx +145^\circ$
67
\end{itemize}
68
\end{remark}
69
70
\subsection{Structural Comparison}
71
72
\begin{theorem}[Root Mean Square Deviation]
73
RMSD measures structural similarity after optimal superposition:
74
\begin{equation}
75
\text{RMSD} = \sqrt{\frac{1}{N}\sum_{i=1}^{N}|\mathbf{r}_i^A - \mathbf{r}_i^B|^2}
76
\end{equation}
77
where $\mathbf{r}_i$ are atomic coordinates after alignment.
78
\end{theorem}
79
80
\begin{definition}[TM-score]
81
A length-independent measure of structural similarity:
82
\begin{equation}
83
\text{TM-score} = \frac{1}{L_N}\sum_{i=1}^{L_{ali}}\frac{1}{1+(d_i/d_0)^2}
84
\end{equation}
85
where $d_0 = 1.24\sqrt[3]{L_N - 15} - 1.8$ \AA{} and $L_N$ is the target length.
86
\end{definition}
87
88
\section{Computational Analysis}
89
90
\begin{pycode}
91
import numpy as np
92
import matplotlib.pyplot as plt
93
from scipy import stats
94
plt.rc('text', usetex=True)
95
plt.rc('font', family='serif')
96
97
np.random.seed(42)
98
99
# Simulate realistic Ramachandran plot data
100
n_residues = 1000
101
102
# Secondary structure populations based on PDB statistics
103
structures = {
104
'alpha': {'phi': (-63, 12), 'psi': (-42, 12), 'n': 400},
105
'beta': {'phi': (-119, 15), 'psi': (135, 18), 'n': 300},
106
'310': {'phi': (-60, 10), 'psi': (-26, 10), 'n': 50},
107
'pi': {'phi': (-57, 8), 'psi': (-70, 8), 'n': 20},
108
'ppII': {'phi': (-75, 10), 'psi': (145, 12), 'n': 80},
109
'left': {'phi': (60, 12), 'psi': (40, 12), 'n': 30},
110
'coil': {'phi': (0, 60), 'psi': (0, 60), 'n': 120}
111
}
112
113
phi_all = []
114
psi_all = []
115
labels = []
116
117
for struct, params in structures.items():
118
phi = np.random.normal(params['phi'][0], params['phi'][1], params['n'])
119
psi = np.random.normal(params['psi'][0], params['psi'][1], params['n'])
120
121
# Wrap to [-180, 180]
122
phi = ((phi + 180) % 360) - 180
123
psi = ((psi + 180) % 360) - 180
124
125
phi_all.extend(phi)
126
psi_all.extend(psi)
127
labels.extend([struct] * params['n'])
128
129
phi_all = np.array(phi_all)
130
psi_all = np.array(psi_all)
131
132
# Chou-Fasman propensity parameters
133
amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
134
'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
135
136
aa_names = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile',
137
'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val']
138
139
helix_prop = [1.42, 0.98, 0.67, 1.01, 0.70, 1.11, 1.51, 0.57,
140
1.00, 1.08, 1.21, 1.16, 1.45, 1.13, 0.57, 0.77,
141
0.83, 1.08, 0.69, 1.06]
142
143
sheet_prop = [0.83, 0.93, 0.89, 0.54, 1.19, 1.10, 0.37, 0.75,
144
0.87, 1.60, 1.30, 0.74, 1.05, 1.38, 0.55, 0.75,
145
1.19, 1.37, 1.47, 1.70]
146
147
turn_prop = [0.66, 0.95, 1.56, 1.46, 1.19, 0.98, 0.74, 1.56,
148
0.95, 0.47, 0.59, 1.01, 0.60, 0.60, 1.52, 1.43,
149
0.96, 0.96, 1.14, 0.50]
150
151
# RMSD distribution simulation
152
def calculate_rmsd(coords1, coords2):
153
"""Calculate RMSD between two coordinate sets"""
154
return np.sqrt(np.mean(np.sum((coords1 - coords2)**2, axis=1)))
155
156
def kabsch_rmsd(P, Q):
157
"""RMSD after optimal superposition (Kabsch algorithm)"""
158
# Center both structures
159
P_centered = P - np.mean(P, axis=0)
160
Q_centered = Q - np.mean(Q, axis=0)
161
162
# Compute covariance matrix
163
H = P_centered.T @ Q_centered
164
165
# SVD
166
U, S, Vt = np.linalg.svd(H)
167
168
# Optimal rotation
169
R = Vt.T @ U.T
170
171
# Handle reflection
172
if np.linalg.det(R) < 0:
173
Vt[-1, :] *= -1
174
R = Vt.T @ U.T
175
176
# Apply rotation
177
Q_rotated = Q_centered @ R
178
179
return np.sqrt(np.mean(np.sum((P_centered - Q_rotated)**2, axis=1)))
180
181
# Generate RMSD distribution
182
n_comparisons = 500
183
rmsds = []
184
tm_scores = []
185
186
for _ in range(n_comparisons):
187
n_atoms = np.random.randint(50, 200)
188
coords1 = np.random.randn(n_atoms, 3) * 20
189
perturbation = np.random.randn(n_atoms, 3) * np.random.uniform(0.5, 5)
190
coords2 = coords1 + perturbation
191
rmsds.append(kabsch_rmsd(coords1, coords2))
192
193
# Approximate TM-score
194
d0 = 1.24 * (n_atoms - 15)**(1/3) - 1.8
195
d_i = np.sqrt(np.sum((coords1 - coords2)**2, axis=1))
196
tm = np.mean(1 / (1 + (d_i/d0)**2))
197
tm_scores.append(tm)
198
199
# Contact map simulation
200
def generate_contact_map(n_residues, secondary_structure='mixed'):
201
"""Generate realistic contact map"""
202
contacts = np.zeros((n_residues, n_residues))
203
204
# Local contacts (sequence neighbors)
205
for i in range(n_residues - 1):
206
contacts[i, i+1] = 1
207
contacts[i+1, i] = 1
208
209
# Helical pattern (i, i+3/4)
210
for i in range(0, n_residues - 4, 5):
211
for j in range(min(20, n_residues - i)):
212
if j in [3, 4]:
213
contacts[i, i+j] = 1
214
contacts[i+j, i] = 1
215
216
# Antiparallel beta sheet pattern
217
for i in range(0, n_residues//2):
218
j = n_residues - 1 - i
219
if j > i + 10:
220
contacts[i, j] = 1
221
contacts[j, i] = 1
222
223
# Random long-range contacts
224
for _ in range(n_residues // 5):
225
i, j = np.random.randint(0, n_residues, 2)
226
if abs(i - j) > 8:
227
contacts[i, j] = 1
228
contacts[j, i] = 1
229
230
return contacts
231
232
n_res_map = 80
233
contact_map = generate_contact_map(n_res_map)
234
235
# Create comprehensive figure
236
fig = plt.figure(figsize=(14, 16))
237
238
# Plot 1: Ramachandran plot with density
239
ax1 = fig.add_subplot(3, 3, 1)
240
241
# 2D histogram for density
242
H, xedges, yedges = np.histogram2d(phi_all, psi_all, bins=60, range=[[-180, 180], [-180, 180]])
243
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
244
ax1.imshow(H.T, extent=extent, origin='lower', cmap='YlOrRd', aspect='auto')
245
246
ax1.set_xlabel('$\\phi$ (degrees)')
247
ax1.set_ylabel('$\\psi$ (degrees)')
248
ax1.set_title('Ramachandran Plot')
249
ax1.set_xlim([-180, 180])
250
ax1.set_ylim([-180, 180])
251
ax1.grid(True, alpha=0.3)
252
253
# Plot 2: Ramachandran with scatter colored by structure
254
ax2 = fig.add_subplot(3, 3, 2)
255
colors = {'alpha': 'red', 'beta': 'blue', '310': 'orange', 'pi': 'purple',
256
'ppII': 'cyan', 'left': 'green', 'coil': 'gray'}
257
258
for struct in structures.keys():
259
mask = [l == struct for l in labels]
260
ax2.scatter(phi_all[mask], psi_all[mask], c=colors[struct], s=5, alpha=0.5, label=struct)
261
262
ax2.set_xlabel('$\\phi$ (degrees)')
263
ax2.set_ylabel('$\\psi$ (degrees)')
264
ax2.set_title('Secondary Structure Classification')
265
ax2.legend(fontsize=6, ncol=2, loc='upper right')
266
ax2.set_xlim([-180, 180])
267
ax2.set_ylim([-180, 180])
268
ax2.grid(True, alpha=0.3)
269
270
# Plot 3: Propensity comparison
271
ax3 = fig.add_subplot(3, 3, 3)
272
x = np.arange(len(amino_acids))
273
width = 0.25
274
275
ax3.bar(x - width, helix_prop, width, label='$\\alpha$-helix', color='red', alpha=0.7)
276
ax3.bar(x, sheet_prop, width, label='$\\beta$-sheet', color='blue', alpha=0.7)
277
ax3.bar(x + width, turn_prop, width, label='Turn', color='green', alpha=0.7)
278
ax3.axhline(y=1.0, color='black', linestyle='--', alpha=0.5)
279
280
ax3.set_xlabel('Amino Acid')
281
ax3.set_ylabel('Propensity')
282
ax3.set_title('Chou-Fasman Propensities')
283
ax3.set_xticks(x)
284
ax3.set_xticklabels(amino_acids, fontsize=6)
285
ax3.legend(fontsize=7)
286
ax3.grid(True, alpha=0.3, axis='y')
287
288
# Plot 4: RMSD distribution
289
ax4 = fig.add_subplot(3, 3, 4)
290
ax4.hist(rmsds, bins=30, alpha=0.7, color='steelblue', edgecolor='black')
291
ax4.axvline(x=np.mean(rmsds), color='red', linestyle='--', linewidth=2, label=f'Mean: {np.mean(rmsds):.1f}')
292
ax4.set_xlabel('RMSD (\\AA)')
293
ax4.set_ylabel('Frequency')
294
ax4.set_title('RMSD Distribution')
295
ax4.legend(fontsize=8)
296
ax4.grid(True, alpha=0.3)
297
298
# Plot 5: TM-score distribution
299
ax5 = fig.add_subplot(3, 3, 5)
300
ax5.hist(tm_scores, bins=30, alpha=0.7, color='green', edgecolor='black')
301
ax5.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='Fold threshold')
302
ax5.axvline(x=0.17, color='orange', linestyle='--', linewidth=2, label='Random')
303
ax5.set_xlabel('TM-score')
304
ax5.set_ylabel('Frequency')
305
ax5.set_title('TM-score Distribution')
306
ax5.legend(fontsize=7)
307
ax5.grid(True, alpha=0.3)
308
309
# Plot 6: Contact map
310
ax6 = fig.add_subplot(3, 3, 6)
311
ax6.imshow(contact_map, cmap='binary', origin='lower', aspect='equal')
312
ax6.set_xlabel('Residue $i$')
313
ax6.set_ylabel('Residue $j$')
314
ax6.set_title('Contact Map')
315
316
# Plot 7: Contact density by distance
317
ax7 = fig.add_subplot(3, 3, 7)
318
distances = []
319
contact_fracs = []
320
for d in range(1, 40):
321
mask = np.abs(np.arange(n_res_map)[:, None] - np.arange(n_res_map)) == d
322
contacts_at_d = contact_map[mask]
323
distances.append(d)
324
contact_fracs.append(np.mean(contacts_at_d))
325
326
ax7.bar(distances, contact_fracs, color='purple', alpha=0.7)
327
ax7.set_xlabel('Sequence Separation')
328
ax7.set_ylabel('Contact Frequency')
329
ax7.set_title('Contact vs. Sequence Distance')
330
ax7.grid(True, alpha=0.3, axis='y')
331
332
# Plot 8: Helix formers vs breakers
333
ax8 = fig.add_subplot(3, 3, 8)
334
sorted_idx = np.argsort(helix_prop)[::-1]
335
colors_bar = ['red' if helix_prop[i] > 1.1 else 'blue' if helix_prop[i] < 0.9 else 'gray'
336
for i in sorted_idx]
337
ax8.barh([amino_acids[i] for i in sorted_idx],
338
[helix_prop[i] for i in sorted_idx], color=colors_bar, alpha=0.7)
339
ax8.axvline(x=1.0, color='black', linestyle='--', alpha=0.7)
340
ax8.set_xlabel('Helix Propensity')
341
ax8.set_title('Helix Formers and Breakers')
342
ax8.grid(True, alpha=0.3, axis='x')
343
344
# Plot 9: Phi-psi outlier detection
345
ax9 = fig.add_subplot(3, 3, 9)
346
# Calculate Z-scores for detection of outliers
347
core_phi = phi_all[(labels == 'alpha') | (np.array(labels) == 'beta')]
348
core_psi = psi_all[(np.array(labels) == 'alpha') | (np.array(labels) == 'beta')]
349
350
# Distance from nearest cluster center
351
helix_center = (-63, -42)
352
sheet_center = (-119, 135)
353
354
dist_helix = np.sqrt((phi_all - helix_center[0])**2 + (psi_all - helix_center[1])**2)
355
dist_sheet = np.sqrt((phi_all - sheet_center[0])**2 + (psi_all - sheet_center[1])**2)
356
min_dist = np.minimum(dist_helix, dist_sheet)
357
358
outliers = min_dist > 50
359
ax9.scatter(phi_all[~outliers], psi_all[~outliers], c='blue', s=3, alpha=0.3, label='Core')
360
ax9.scatter(phi_all[outliers], psi_all[outliers], c='red', s=10, alpha=0.7, label='Outlier')
361
ax9.set_xlabel('$\\phi$ (degrees)')
362
ax9.set_ylabel('$\\psi$ (degrees)')
363
ax9.set_title('Outlier Detection')
364
ax9.legend(fontsize=8)
365
ax9.set_xlim([-180, 180])
366
ax9.set_ylim([-180, 180])
367
ax9.grid(True, alpha=0.3)
368
369
plt.tight_layout()
370
plt.savefig('protein_structure_plot.pdf', bbox_inches='tight', dpi=150)
371
print(r'\begin{center}')
372
print(r'\includegraphics[width=\textwidth]{protein_structure_plot.pdf}')
373
print(r'\end{center}')
374
plt.close()
375
376
# Statistics
377
mean_rmsd = np.mean(rmsds)
378
mean_tm = np.mean(tm_scores)
379
helix_fraction = structures['alpha']['n'] / len(phi_all) * 100
380
sheet_fraction = structures['beta']['n'] / len(phi_all) * 100
381
n_outliers = np.sum(outliers)
382
\end{pycode}
383
384
\section{Results and Analysis}
385
386
\subsection{Ramachandran Statistics}
387
388
\begin{pycode}
389
# Structure statistics table
390
print(r'\begin{table}[h]')
391
print(r'\centering')
392
print(r'\caption{Secondary Structure Distribution}')
393
print(r'\begin{tabular}{lcccc}')
394
print(r'\toprule')
395
print(r'Structure & Count & Fraction (\\%) & $\phi$ & $\psi$ \\')
396
print(r'\midrule')
397
for struct, params in structures.items():
398
frac = params['n'] / len(phi_all) * 100
399
print(f"{struct.capitalize()} & {params['n']} & {frac:.1f} & {params['phi'][0]}$^\\circ$ & {params['psi'][0]}$^\\circ$ \\\\")
400
print(r'\midrule')
401
print(f"Total & {len(phi_all)} & 100 & -- & -- \\\\")
402
print(r'\bottomrule')
403
print(r'\end{tabular}')
404
print(r'\end{table}')
405
\end{pycode}
406
407
\subsection{Structural Comparison}
408
409
\begin{example}[RMSD Analysis]
410
From \py{f"{n_comparisons}"} structural comparisons:
411
\begin{itemize}
412
\item Mean RMSD: \py{f"{mean_rmsd:.2f}"} \AA
413
\item Mean TM-score: \py{f"{mean_tm:.3f}"}
414
\item RMSD $<$ 2 \AA: typically same fold
415
\item TM-score $>$ 0.5: same fold (normalized)
416
\end{itemize}
417
\end{example}
418
419
\subsection{Quality Assessment}
420
421
\begin{pycode}
422
# Quality metrics
423
print(r'\begin{table}[h]')
424
print(r'\centering')
425
print(r'\caption{Structure Quality Indicators}')
426
print(r'\begin{tabular}{lc}')
427
print(r'\toprule')
428
print(r'Metric & Value \\')
429
print(r'\midrule')
430
core_frac = 100 - n_outliers/len(phi_all)*100
431
print(f"Core region residues & {core_frac:.1f}\\% \\\\")
432
print(f"Allowed region residues & {100-n_outliers/len(phi_all)*100:.1f}\\% \\\\")
433
print(f"Outliers (generously allowed) & {n_outliers} \\\\")
434
print(f"Total contacts & {int(np.sum(contact_map)/2)} \\\\")
435
print(r'\bottomrule')
436
print(r'\end{tabular}')
437
print(r'\end{table}')
438
\end{pycode}
439
440
\section{Secondary Structure Prediction}
441
442
\subsection{Chou-Fasman Algorithm}
443
444
\begin{remark}[Prediction Steps]
445
\begin{enumerate}
446
\item Calculate propensity for each residue
447
\item Identify nucleation sites (consecutive high-propensity residues)
448
\item Extend regions until breaker residues
449
\item Resolve overlapping predictions
450
\end{enumerate}
451
Accuracy: $\sim$60-65\% (3-state)
452
\end{remark}
453
454
\subsection{Modern Methods}
455
456
Machine learning methods achieve higher accuracy:
457
\begin{itemize}
458
\item Neural networks: $\sim$75-80\%
459
\item PSIPRED (profile-based): $\sim$80\%
460
\item AlphaFold (end-to-end): $>$90\%
461
\end{itemize}
462
463
\section{Limitations and Extensions}
464
465
\subsection{Model Limitations}
466
\begin{enumerate}
467
\item \textbf{Static view}: No dynamics/flexibility
468
\item \textbf{Local propensity}: Ignores long-range interactions
469
\item \textbf{Simplified contacts}: Binary rather than distance-based
470
\item \textbf{No side chains}: Backbone-only analysis
471
\end{enumerate}
472
473
\subsection{Possible Extensions}
474
\begin{itemize}
475
\item Side chain rotamer libraries
476
\item Molecular dynamics analysis
477
\item AlphaFold structure prediction
478
\item Protein-ligand docking
479
\end{itemize}
480
481
\section{Conclusion}
482
483
This analysis demonstrates protein structural bioinformatics:
484
\begin{itemize}
485
\item Ramachandran plot validates backbone geometry
486
\item \py{f"{helix_fraction:.0f}"}\% $\alpha$-helix, \py{f"{sheet_fraction:.0f}"}\% $\beta$-sheet
487
\item RMSD and TM-score quantify structural similarity
488
\item Contact maps reveal fold topology
489
\item Propensity scales enable structure prediction
490
\end{itemize}
491
492
\section*{Further Reading}
493
\begin{itemize}
494
\item Branden, C. \& Tooze, J. (1999). \textit{Introduction to Protein Structure}. Garland Science.
495
\item Lesk, A. M. (2010). \textit{Introduction to Protein Science}. Oxford University Press.
496
\item Zhang, Y. \& Skolnick, J. (2004). Scoring function for automated assessment of protein structure template quality. \textit{Proteins}, 57, 702-710.
497
\end{itemize}
498
499
\end{document}
500
501