CoCalc -- protein_structure.tex

GitHub Repository: Ok-landscape/computational-pipeline
Path: blob/main/latex-templates/templates/bioinformatics/protein_structure.tex
⁵¹ views
unlisted
1
\documentclass[a4paper, 11pt]{article}
2
\usepackage[utf8]{inputenc}
3
\usepackage[T1]{fontenc}
4
\usepackage{amsmath, amssymb}
5
\usepackage{graphicx}
6
\usepackage{siunitx}
7
\usepackage{booktabs}
8
\usepackage{subcaption}
9
\usepackage[makestderr]{pythontex}
10

11
% Theorem environments
12
\newtheorem{definition}{Definition}
13
\newtheorem{theorem}{Theorem}
14
\newtheorem{example}{Example}
15
\newtheorem{remark}{Remark}
16

17
\title{Protein Structure Analysis: From Backbone Geometry to Fold Recognition\\
18
\large Ramachandran Analysis, Secondary Structure, and Structural Comparison}
19
\author{Structural Bioinformatics Division\\Computational Science Templates}
20
\date{\today}
21

22
\begin{document}
23
\maketitle
24

25
\begin{abstract}
26
This comprehensive analysis presents methods for analyzing protein three-dimensional structure. We cover backbone dihedral angle analysis through Ramachandran plots, secondary structure prediction using propensity scales, structural comparison using RMSD and TM-score, and contact map analysis for fold topology. The analysis includes the geometric principles underlying protein conformation, statistical analysis of allowed regions in $\phi$-$\psi$ space, and methods for comparing protein structures to identify homologs and predict function.
27
\end{abstract}
28

29
\section{Introduction}
30

31
Protein structure determines function. Understanding the three-dimensional arrangement of amino acids reveals how proteins catalyze reactions, bind ligands, and interact with other molecules. Structural bioinformatics provides computational tools to analyze, compare, and predict protein structures.
32

33
\begin{definition}[Protein Structure Hierarchy]
34
\begin{itemize}
35
    \item \textbf{Primary}: Amino acid sequence
36
    \item \textbf{Secondary}: Local structure ($\alpha$-helix, $\beta$-sheet, loops)
37
    \item \textbf{Tertiary}: Complete 3D fold of a single chain
38
    \item \textbf{Quaternary}: Multi-chain assembly
39
\end{itemize}
40
\end{definition}
41

42
\section{Theoretical Framework}
43

44
\subsection{Backbone Dihedral Angles}
45

46
The protein backbone is defined by three repeating atoms: N-C$_\alpha$-C. Conformation is specified by dihedral angles:
47

48
\begin{definition}[Backbone Dihedrals]
49
\begin{align}
50
\phi &: C_{i-1} - N_i - C_\alpha^i - C_i \\
51
\psi &: N_i - C_\alpha^i - C_i - N_{i+1} \\
52
\omega &: C_\alpha^i - C_i - N_{i+1} - C_\alpha^{i+1}
53
\end{align}
54
The peptide bond angle $\omega$ is typically $180^\circ$ (trans) or $0^\circ$ (cis, rare).
55
\end{definition}
56

57
\subsection{Allowed Conformations}
58

59
Steric clashes restrict $\phi$ and $\psi$ to specific regions:
60

61
\begin{remark}[Ramachandran Regions]
62
\begin{itemize}
63
    \item $\alpha$-helix: $\phi \approx -60^\circ$, $\psi \approx -45^\circ$
64
    \item $\beta$-sheet: $\phi \approx -120^\circ$, $\psi \approx +130^\circ$
65
    \item Left-handed helix: $\phi \approx +60^\circ$, $\psi \approx +45^\circ$ (glycine only)
66
    \item Polyproline II: $\phi \approx -75^\circ$, $\psi \approx +145^\circ$
67
\end{itemize}
68
\end{remark}
69

70
\subsection{Structural Comparison}
71

72
\begin{theorem}[Root Mean Square Deviation]
73
RMSD measures structural similarity after optimal superposition:
74
\begin{equation}
75
\text{RMSD} = \sqrt{\frac{1}{N}\sum_{i=1}^{N}|\mathbf{r}_i^A - \mathbf{r}_i^B|^2}
76
\end{equation}
77
where $\mathbf{r}_i$ are atomic coordinates after alignment.
78
\end{theorem}
79

80
\begin{definition}[TM-score]
81
A length-independent measure of structural similarity:
82
\begin{equation}
83
\text{TM-score} = \frac{1}{L_N}\sum_{i=1}^{L_{ali}}\frac{1}{1+(d_i/d_0)^2}
84
\end{equation}
85
where $d_0 = 1.24\sqrt[3]{L_N - 15} - 1.8$ \AA{} and $L_N$ is the target length.
86
\end{definition}
87

88
\section{Computational Analysis}
89

90
\begin{pycode}
91
import numpy as np
92
import matplotlib.pyplot as plt
93
from scipy import stats
94
plt.rc('text', usetex=True)
95
plt.rc('font', family='serif')
96

97
np.random.seed(42)
98

99
# Simulate realistic Ramachandran plot data
100
n_residues = 1000
101

102
# Secondary structure populations based on PDB statistics
103
structures = {
104
    'alpha': {'phi': (-63, 12), 'psi': (-42, 12), 'n': 400},
105
    'beta': {'phi': (-119, 15), 'psi': (135, 18), 'n': 300},
106
    '310': {'phi': (-60, 10), 'psi': (-26, 10), 'n': 50},
107
    'pi': {'phi': (-57, 8), 'psi': (-70, 8), 'n': 20},
108
    'ppII': {'phi': (-75, 10), 'psi': (145, 12), 'n': 80},
109
    'left': {'phi': (60, 12), 'psi': (40, 12), 'n': 30},
110
    'coil': {'phi': (0, 60), 'psi': (0, 60), 'n': 120}
111
}
112

113
phi_all = []
114
psi_all = []
115
labels = []
116

117
for struct, params in structures.items():
118
    phi = np.random.normal(params['phi'][0], params['phi'][1], params['n'])
119
    psi = np.random.normal(params['psi'][0], params['psi'][1], params['n'])
120

121
    # Wrap to [-180, 180]
122
    phi = ((phi + 180) % 360) - 180
123
    psi = ((psi + 180) % 360) - 180
124

125
    phi_all.extend(phi)
126
    psi_all.extend(psi)
127
    labels.extend([struct] * params['n'])
128

129
phi_all = np.array(phi_all)
130
psi_all = np.array(psi_all)
131

132
# Chou-Fasman propensity parameters
133
amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
134
               'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
135

136
aa_names = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile',
137
            'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val']
138

139
helix_prop = [1.42, 0.98, 0.67, 1.01, 0.70, 1.11, 1.51, 0.57,
140
              1.00, 1.08, 1.21, 1.16, 1.45, 1.13, 0.57, 0.77,
141
              0.83, 1.08, 0.69, 1.06]
142

143
sheet_prop = [0.83, 0.93, 0.89, 0.54, 1.19, 1.10, 0.37, 0.75,
144
              0.87, 1.60, 1.30, 0.74, 1.05, 1.38, 0.55, 0.75,
145
              1.19, 1.37, 1.47, 1.70]
146

147
turn_prop = [0.66, 0.95, 1.56, 1.46, 1.19, 0.98, 0.74, 1.56,
148
             0.95, 0.47, 0.59, 1.01, 0.60, 0.60, 1.52, 1.43,
149
             0.96, 0.96, 1.14, 0.50]
150

151
# RMSD distribution simulation
152
def calculate_rmsd(coords1, coords2):
153
    """Calculate RMSD between two coordinate sets"""
154
    return np.sqrt(np.mean(np.sum((coords1 - coords2)**2, axis=1)))
155

156
def kabsch_rmsd(P, Q):
157
    """RMSD after optimal superposition (Kabsch algorithm)"""
158
    # Center both structures
159
    P_centered = P - np.mean(P, axis=0)
160
    Q_centered = Q - np.mean(Q, axis=0)
161

162
    # Compute covariance matrix
163
    H = P_centered.T @ Q_centered
164

165
    # SVD
166
    U, S, Vt = np.linalg.svd(H)
167

168
    # Optimal rotation
169
    R = Vt.T @ U.T
170

171
    # Handle reflection
172
    if np.linalg.det(R) < 0:
173
        Vt[-1, :] *= -1
174
        R = Vt.T @ U.T
175

176
    # Apply rotation
177
    Q_rotated = Q_centered @ R
178

179
    return np.sqrt(np.mean(np.sum((P_centered - Q_rotated)**2, axis=1)))
180

181
# Generate RMSD distribution
182
n_comparisons = 500
183
rmsds = []
184
tm_scores = []
185

186
for _ in range(n_comparisons):
187
    n_atoms = np.random.randint(50, 200)
188
    coords1 = np.random.randn(n_atoms, 3) * 20
189
    perturbation = np.random.randn(n_atoms, 3) * np.random.uniform(0.5, 5)
190
    coords2 = coords1 + perturbation
191
    rmsds.append(kabsch_rmsd(coords1, coords2))
192

193
    # Approximate TM-score
194
    d0 = 1.24 * (n_atoms - 15)**(1/3) - 1.8
195
    d_i = np.sqrt(np.sum((coords1 - coords2)**2, axis=1))
196
    tm = np.mean(1 / (1 + (d_i/d0)**2))
197
    tm_scores.append(tm)
198

199
# Contact map simulation
200
def generate_contact_map(n_residues, secondary_structure='mixed'):
201
    """Generate realistic contact map"""
202
    contacts = np.zeros((n_residues, n_residues))
203

204
    # Local contacts (sequence neighbors)
205
    for i in range(n_residues - 1):
206
        contacts[i, i+1] = 1
207
        contacts[i+1, i] = 1
208

209
    # Helical pattern (i, i+3/4)
210
    for i in range(0, n_residues - 4, 5):
211
        for j in range(min(20, n_residues - i)):
212
            if j in [3, 4]:
213
                contacts[i, i+j] = 1
214
                contacts[i+j, i] = 1
215

216
    # Antiparallel beta sheet pattern
217
    for i in range(0, n_residues//2):
218
        j = n_residues - 1 - i
219
        if j > i + 10:
220
            contacts[i, j] = 1
221
            contacts[j, i] = 1
222

223
    # Random long-range contacts
224
    for _ in range(n_residues // 5):
225
        i, j = np.random.randint(0, n_residues, 2)
226
        if abs(i - j) > 8:
227
            contacts[i, j] = 1
228
            contacts[j, i] = 1
229

230
    return contacts
231

232
n_res_map = 80
233
contact_map = generate_contact_map(n_res_map)
234

235
# Create comprehensive figure
236
fig = plt.figure(figsize=(14, 16))
237

238
# Plot 1: Ramachandran plot with density
239
ax1 = fig.add_subplot(3, 3, 1)
240

241
# 2D histogram for density
242
H, xedges, yedges = np.histogram2d(phi_all, psi_all, bins=60, range=[[-180, 180], [-180, 180]])
243
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
244
ax1.imshow(H.T, extent=extent, origin='lower', cmap='YlOrRd', aspect='auto')
245

246
ax1.set_xlabel('$\\phi$ (degrees)')
247
ax1.set_ylabel('$\\psi$ (degrees)')
248
ax1.set_title('Ramachandran Plot')
249
ax1.set_xlim([-180, 180])
250
ax1.set_ylim([-180, 180])
251
ax1.grid(True, alpha=0.3)
252

253
# Plot 2: Ramachandran with scatter colored by structure
254
ax2 = fig.add_subplot(3, 3, 2)
255
colors = {'alpha': 'red', 'beta': 'blue', '310': 'orange', 'pi': 'purple',
256
          'ppII': 'cyan', 'left': 'green', 'coil': 'gray'}
257

258
for struct in structures.keys():
259
    mask = [l == struct for l in labels]
260
    ax2.scatter(phi_all[mask], psi_all[mask], c=colors[struct], s=5, alpha=0.5, label=struct)
261

262
ax2.set_xlabel('$\\phi$ (degrees)')
263
ax2.set_ylabel('$\\psi$ (degrees)')
264
ax2.set_title('Secondary Structure Classification')
265
ax2.legend(fontsize=6, ncol=2, loc='upper right')
266
ax2.set_xlim([-180, 180])
267
ax2.set_ylim([-180, 180])
268
ax2.grid(True, alpha=0.3)
269

270
# Plot 3: Propensity comparison
271
ax3 = fig.add_subplot(3, 3, 3)
272
x = np.arange(len(amino_acids))
273
width = 0.25
274

275
ax3.bar(x - width, helix_prop, width, label='$\\alpha$-helix', color='red', alpha=0.7)
276
ax3.bar(x, sheet_prop, width, label='$\\beta$-sheet', color='blue', alpha=0.7)
277
ax3.bar(x + width, turn_prop, width, label='Turn', color='green', alpha=0.7)
278
ax3.axhline(y=1.0, color='black', linestyle='--', alpha=0.5)
279

280
ax3.set_xlabel('Amino Acid')
281
ax3.set_ylabel('Propensity')
282
ax3.set_title('Chou-Fasman Propensities')
283
ax3.set_xticks(x)
284
ax3.set_xticklabels(amino_acids, fontsize=6)
285
ax3.legend(fontsize=7)
286
ax3.grid(True, alpha=0.3, axis='y')
287

288
# Plot 4: RMSD distribution
289
ax4 = fig.add_subplot(3, 3, 4)
290
ax4.hist(rmsds, bins=30, alpha=0.7, color='steelblue', edgecolor='black')
291
ax4.axvline(x=np.mean(rmsds), color='red', linestyle='--', linewidth=2, label=f'Mean: {np.mean(rmsds):.1f}')
292
ax4.set_xlabel('RMSD (\\AA)')
293
ax4.set_ylabel('Frequency')
294
ax4.set_title('RMSD Distribution')
295
ax4.legend(fontsize=8)
296
ax4.grid(True, alpha=0.3)
297

298
# Plot 5: TM-score distribution
299
ax5 = fig.add_subplot(3, 3, 5)
300
ax5.hist(tm_scores, bins=30, alpha=0.7, color='green', edgecolor='black')
301
ax5.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='Fold threshold')
302
ax5.axvline(x=0.17, color='orange', linestyle='--', linewidth=2, label='Random')
303
ax5.set_xlabel('TM-score')
304
ax5.set_ylabel('Frequency')
305
ax5.set_title('TM-score Distribution')
306
ax5.legend(fontsize=7)
307
ax5.grid(True, alpha=0.3)
308

309
# Plot 6: Contact map
310
ax6 = fig.add_subplot(3, 3, 6)
311
ax6.imshow(contact_map, cmap='binary', origin='lower', aspect='equal')
312
ax6.set_xlabel('Residue $i$')
313
ax6.set_ylabel('Residue $j$')
314
ax6.set_title('Contact Map')
315

316
# Plot 7: Contact density by distance
317
ax7 = fig.add_subplot(3, 3, 7)
318
distances = []
319
contact_fracs = []
320
for d in range(1, 40):
321
    mask = np.abs(np.arange(n_res_map)[:, None] - np.arange(n_res_map)) == d
322
    contacts_at_d = contact_map[mask]
323
    distances.append(d)
324
    contact_fracs.append(np.mean(contacts_at_d))
325

326
ax7.bar(distances, contact_fracs, color='purple', alpha=0.7)
327
ax7.set_xlabel('Sequence Separation')
328
ax7.set_ylabel('Contact Frequency')
329
ax7.set_title('Contact vs. Sequence Distance')
330
ax7.grid(True, alpha=0.3, axis='y')
331

332
# Plot 8: Helix formers vs breakers
333
ax8 = fig.add_subplot(3, 3, 8)
334
sorted_idx = np.argsort(helix_prop)[::-1]
335
colors_bar = ['red' if helix_prop[i] > 1.1 else 'blue' if helix_prop[i] < 0.9 else 'gray'
336
              for i in sorted_idx]
337
ax8.barh([amino_acids[i] for i in sorted_idx],
338
         [helix_prop[i] for i in sorted_idx], color=colors_bar, alpha=0.7)
339
ax8.axvline(x=1.0, color='black', linestyle='--', alpha=0.7)
340
ax8.set_xlabel('Helix Propensity')
341
ax8.set_title('Helix Formers and Breakers')
342
ax8.grid(True, alpha=0.3, axis='x')
343

344
# Plot 9: Phi-psi outlier detection
345
ax9 = fig.add_subplot(3, 3, 9)
346
# Calculate Z-scores for detection of outliers
347
core_phi = phi_all[(labels == 'alpha') | (np.array(labels) == 'beta')]
348
core_psi = psi_all[(np.array(labels) == 'alpha') | (np.array(labels) == 'beta')]
349

350
# Distance from nearest cluster center
351
helix_center = (-63, -42)
352
sheet_center = (-119, 135)
353

354
dist_helix = np.sqrt((phi_all - helix_center[0])**2 + (psi_all - helix_center[1])**2)
355
dist_sheet = np.sqrt((phi_all - sheet_center[0])**2 + (psi_all - sheet_center[1])**2)
356
min_dist = np.minimum(dist_helix, dist_sheet)
357

358
outliers = min_dist > 50
359
ax9.scatter(phi_all[~outliers], psi_all[~outliers], c='blue', s=3, alpha=0.3, label='Core')
360
ax9.scatter(phi_all[outliers], psi_all[outliers], c='red', s=10, alpha=0.7, label='Outlier')
361
ax9.set_xlabel('$\\phi$ (degrees)')
362
ax9.set_ylabel('$\\psi$ (degrees)')
363
ax9.set_title('Outlier Detection')
364
ax9.legend(fontsize=8)
365
ax9.set_xlim([-180, 180])
366
ax9.set_ylim([-180, 180])
367
ax9.grid(True, alpha=0.3)
368

369
plt.tight_layout()
370
plt.savefig('protein_structure_plot.pdf', bbox_inches='tight', dpi=150)
371
print(r'\begin{center}')
372
print(r'\includegraphics[width=\textwidth]{protein_structure_plot.pdf}')
373
print(r'\end{center}')
374
plt.close()
375

376
# Statistics
377
mean_rmsd = np.mean(rmsds)
378
mean_tm = np.mean(tm_scores)
379
helix_fraction = structures['alpha']['n'] / len(phi_all) * 100
380
sheet_fraction = structures['beta']['n'] / len(phi_all) * 100
381
n_outliers = np.sum(outliers)
382
\end{pycode}
383

384
\section{Results and Analysis}
385

386
\subsection{Ramachandran Statistics}
387

388
\begin{pycode}
389
# Structure statistics table
390
print(r'\begin{table}[h]')
391
print(r'\centering')
392
print(r'\caption{Secondary Structure Distribution}')
393
print(r'\begin{tabular}{lcccc}')
394
print(r'\toprule')
395
print(r'Structure & Count & Fraction (\\%) & $\phi$ & $\psi$ \\')
396
print(r'\midrule')
397
for struct, params in structures.items():
398
    frac = params['n'] / len(phi_all) * 100
399
    print(f"{struct.capitalize()} & {params['n']} & {frac:.1f} & {params['phi'][0]}$^\\circ$ & {params['psi'][0]}$^\\circ$ \\\\")
400
print(r'\midrule')
401
print(f"Total & {len(phi_all)} & 100 & -- & -- \\\\")
402
print(r'\bottomrule')
403
print(r'\end{tabular}')
404
print(r'\end{table}')
405
\end{pycode}
406

407
\subsection{Structural Comparison}
408

409
\begin{example}[RMSD Analysis]
410
From \py{f"{n_comparisons}"} structural comparisons:
411
\begin{itemize}
412
    \item Mean RMSD: \py{f"{mean_rmsd:.2f}"} \AA
413
    \item Mean TM-score: \py{f"{mean_tm:.3f}"}
414
    \item RMSD $<$ 2 \AA: typically same fold
415
    \item TM-score $>$ 0.5: same fold (normalized)
416
\end{itemize}
417
\end{example}
418

419
\subsection{Quality Assessment}
420

421
\begin{pycode}
422
# Quality metrics
423
print(r'\begin{table}[h]')
424
print(r'\centering')
425
print(r'\caption{Structure Quality Indicators}')
426
print(r'\begin{tabular}{lc}')
427
print(r'\toprule')
428
print(r'Metric & Value \\')
429
print(r'\midrule')
430
core_frac = 100 - n_outliers/len(phi_all)*100
431
print(f"Core region residues & {core_frac:.1f}\\% \\\\")
432
print(f"Allowed region residues & {100-n_outliers/len(phi_all)*100:.1f}\\% \\\\")
433
print(f"Outliers (generously allowed) & {n_outliers} \\\\")
434
print(f"Total contacts & {int(np.sum(contact_map)/2)} \\\\")
435
print(r'\bottomrule')
436
print(r'\end{tabular}')
437
print(r'\end{table}')
438
\end{pycode}
439

440
\section{Secondary Structure Prediction}
441

442
\subsection{Chou-Fasman Algorithm}
443

444
\begin{remark}[Prediction Steps]
445
\begin{enumerate}
446
    \item Calculate propensity for each residue
447
    \item Identify nucleation sites (consecutive high-propensity residues)
448
    \item Extend regions until breaker residues
449
    \item Resolve overlapping predictions
450
\end{enumerate}
451
Accuracy: $\sim$60-65\% (3-state)
452
\end{remark}
453

454
\subsection{Modern Methods}
455

456
Machine learning methods achieve higher accuracy:
457
\begin{itemize}
458
    \item Neural networks: $\sim$75-80\%
459
    \item PSIPRED (profile-based): $\sim$80\%
460
    \item AlphaFold (end-to-end): $>$90\%
461
\end{itemize}
462

463
\section{Limitations and Extensions}
464

465
\subsection{Model Limitations}
466
\begin{enumerate}
467
    \item \textbf{Static view}: No dynamics/flexibility
468
    \item \textbf{Local propensity}: Ignores long-range interactions
469
    \item \textbf{Simplified contacts}: Binary rather than distance-based
470
    \item \textbf{No side chains}: Backbone-only analysis
471
\end{enumerate}
472

473
\subsection{Possible Extensions}
474
\begin{itemize}
475
    \item Side chain rotamer libraries
476
    \item Molecular dynamics analysis
477
    \item AlphaFold structure prediction
478
    \item Protein-ligand docking
479
\end{itemize}
480

481
\section{Conclusion}
482

483
This analysis demonstrates protein structural bioinformatics:
484
\begin{itemize}
485
    \item Ramachandran plot validates backbone geometry
486
    \item \py{f"{helix_fraction:.0f}"}\% $\alpha$-helix, \py{f"{sheet_fraction:.0f}"}\% $\beta$-sheet
487
    \item RMSD and TM-score quantify structural similarity
488
    \item Contact maps reveal fold topology
489
    \item Propensity scales enable structure prediction
490
\end{itemize}
491

492
\section*{Further Reading}
493
\begin{itemize}
494
    \item Branden, C. \& Tooze, J. (1999). \textit{Introduction to Protein Structure}. Garland Science.
495
    \item Lesk, A. M. (2010). \textit{Introduction to Protein Science}. Oxford University Press.
496
    \item Zhang, Y. \& Skolnick, J. (2004). Scoring function for automated assessment of protein structure template quality. \textit{Proteins}, 57, 702-710.
497
\end{itemize}
498

499
\end{document}
500

501
Product

Resources

Company