Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
4 views
ubuntu2404
1
\documentclass[11pt,letterpaper]{article}
2
3
% CoCalc Bioinformatics Genomic Analysis Template
4
% Optimized for genomic data analysis with BioPython integration
5
% Features: Sequence analysis, phylogenetics, BLAST, genomic visualization
6
7
%=============================================================================
8
% PACKAGE IMPORTS - Bioinformatics-specific packages
9
%=============================================================================
10
\usepackage[utf8]{inputenc}
11
\usepackage[T1]{fontenc}
12
\usepackage{lmodern}
13
\usepackage[english]{babel}
14
15
% Page layout optimized for genomic data presentation
16
\usepackage[margin=0.8in]{geometry}
17
\usepackage{setspace}
18
\usepackage{parskip}
19
20
% Mathematics and symbols for sequence analysis
21
\usepackage{amsmath,amsfonts,amssymb,amsthm}
22
\usepackage{mathtools}
23
\usepackage{siunitx}
24
25
% Graphics for phylogenetic trees and genomic plots
26
\usepackage{graphicx}
27
\usepackage{float}
28
\usepackage{subcaption}
29
\usepackage{wrapfig}
30
\usepackage{tikz}
31
\usetikzlibrary{trees,positioning}
32
33
% Tables for sequence alignments and genomic annotations
34
\usepackage{booktabs}
35
\usepackage{array}
36
\usepackage{multirow}
37
\usepackage{longtable}
38
\usepackage{tabularx}
39
\usepackage{adjustbox} % For fitting tables and content
40
41
% Code integration for BioPython
42
\usepackage{pythontex}
43
\usepackage{listings}
44
\usepackage{xcolor}
45
46
% Bioinformatics-specific formatting
47
\usepackage{textcomp} % For special characters in sequences
48
\usepackage{courier} % Monospace font for sequences
49
50
% Citations optimized for biological journals
51
\usepackage{csquotes} % Required by biblatex
52
\usepackage[backend=bibtex,style=nature,sorting=none]{biblatex}
53
\addbibresource{references.bib}
54
55
% Cross-referencing and hyperlinks
56
\usepackage[colorlinks=true,citecolor=blue,linkcolor=blue,urlcolor=blue]{hyperref}
57
\usepackage{cleveref}
58
59
%=============================================================================
60
% PYTHONTEX CONFIGURATION - BioPython Environment
61
%=============================================================================
62
\begin{pycode}
63
# Import core bioinformatics and data analysis libraries
64
import numpy as np
65
import matplotlib.pyplot as plt
66
import matplotlib
67
matplotlib.use('Agg')
68
import seaborn as sns
69
import pandas as pd
70
from scipy import stats
71
from scipy.cluster.hierarchy import dendrogram, linkage
72
from scipy.spatial.distance import squareform
73
74
# BioPython imports for sequence analysis
75
# Optional BioPython imports. When BioPython (or one of the names requested
# below) cannot be imported, the template falls back to simulated data and
# records that fact in `biopython_available`.
try:
    from Bio import SeqIO, Align, Phylo
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqUtils import molecular_weight
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    from Bio.Blast import NCBIXML
    from Bio import Entrez
    from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
    from Bio.Align import MultipleSeqAlignment

    # Bio.SeqUtils.GC was deprecated in favour of gc_fraction and is absent
    # from current Biopython releases. Importing it unconditionally made the
    # whole guard fail with ImportError and wrongly report BioPython as
    # unavailable, so provide a percentage-returning shim when GC is gone.
    try:
        from Bio.SeqUtils import GC
    except ImportError:
        from Bio.SeqUtils import gc_fraction

        def GC(seq):
            """Return the GC content of ``seq`` as a percentage (legacy GC API)."""
            return 100 * gc_fraction(seq)

    biopython_available = True
except ImportError:
    print("BioPython not available - using simulated data")
    biopython_available = False
89
90
# Set visualization parameters for genomic data
91
# Base style first, then the seaborn palette on top of it
plt.style.use('seaborn-v0_8-whitegrid')
np.random.seed(42)  # fixed seed so the simulated data are reproducible
sns.set_palette("Set2")

# Genomics-optimized figure defaults, applied in one call
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'figure.dpi': 150,
    'savefig.bbox': 'tight',
    'savefig.pad_inches': 0.1,
    'font.family': 'sans-serif',
})

# Keep printed DataFrames narrow to avoid overfull boxes in the document
for option_name, option_value in (
    ('display.max_columns', 8),
    ('display.width', 80),
    ('display.precision', 3),
):
    pd.set_option(option_name, option_value)
106
107
# Define nucleotide and amino acid color schemes
108
# Hex colour assigned to each DNA base, keyed by its one-letter code
nucleotide_colors = dict(zip(
    'ATGC',
    ('#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'),
))

# Hex colour assigned to each of the 20 standard amino acids, keyed by
# one-letter code; the letters and colours below are paired positionally.
amino_acid_colors = dict(zip(
    'ARNDCQEGHILKMFPSTWYV',
    (
        '#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4',
        '#FECA57', '#48CAE4', '#F38BA8', '#A8DADC',
        '#FFB3BA', '#BFEFFF', '#FFDFBA', '#FFFFBA',
        '#BAE1FF', '#FFBAE1', '#E1BAFF', '#D4E2FC',
        '#FCE4D4', '#E4FCD4', '#D4FCE4', '#FCD4E4',
    ),
))
116
\end{pycode}
117
118
%=============================================================================
119
% CUSTOM COMMANDS - Bioinformatics notation
120
%=============================================================================
121
% Semantic markup for biological entities. The starred \newcommand* form is
% used because every argument is a short phrase: a forgotten closing brace is
% then reported at the next paragraph break instead of swallowing the document.
\newcommand*{\gene}[1]{\textit{#1}}        % gene names: italic
\newcommand*{\protein}[1]{\textsc{#1}}     % protein names: small caps
\newcommand*{\species}[1]{\textit{#1}}     % species names: italic
\newcommand*{\sequence}[1]{\texttt{#1}}    % raw sequences: monospace
\newcommand*{\accession}[1]{\texttt{#1}}   % database accessions: monospace
\newcommand*{\nucleotide}[1]{\textbf{#1}}  % single nucleotides: bold
\newcommand*{\aminoacid}[1]{\textbf{#1}}   % single amino acids: bold

% Statistical notation for genomics. These bodies rely on \text (amsmath), so
% they appear to be written for use inside math mode.
\newcommand*{\pvalue}{p\text{-value}}
\newcommand*{\evalue}{E\text{-value}}
\newcommand*{\identity}{\text{Identity}}
\newcommand*{\coverage}{\text{Coverage}}
134
135
%=============================================================================
136
% DOCUMENT METADATA
137
%=============================================================================
138
\title{Genomic Analysis of \species{Escherichia coli} Strain Diversity:\\
139
A Computational Approach to Comparative Genomics}
140
141
\author{%
142
Jane Doe\thanks{Department of Bioinformatics, University of Life Sciences, \texttt{jane.doe@university.edu}} \and
143
John Smith\thanks{Institute for Genomic Research, Biotech Center, \texttt{john.smith@research.org}} \and
144
Sarah Johnson\thanks{Department of Microbiology, University of Life Sciences, \texttt{sarah.johnson@university.edu}}
145
}
146
147
\date{\today}
148
149
%=============================================================================
150
% DOCUMENT BEGINS
151
%=============================================================================
152
\begin{document}
153
154
\maketitle
155
156
\begin{abstract}
157
We present a comprehensive computational analysis of genomic diversity in \species{Escherichia coli} strains using modern bioinformatics approaches. Our study combines sequence analysis, phylogenetic reconstruction, and comparative genomics to understand evolutionary relationships and functional diversity. Using BioPython and statistical modeling, we analyzed genome sequences from eight \species{E. coli} strains, identified conserved and variable genomic regions, and reconstructed phylogenetic relationships. Key findings include identification of strain-specific gene clusters, quantification of genomic diversity patterns, and characterization of functional gene families. This template demonstrates reproducible genomic analysis workflows optimized for CoCalc's collaborative environment with live code execution and automated figure generation.
158
159
\textbf{Keywords:} comparative genomics, bioinformatics, phylogenetics, sequence analysis, bacterial genomics, computational biology
160
\end{abstract}
161
162
%=============================================================================
163
% SECTION 1: INTRODUCTION
164
%=============================================================================
165
\section{Introduction}
166
\label{sec:introduction}
167
168
Comparative genomics provides crucial insights into evolutionary processes, functional diversity, and adaptation mechanisms in bacterial species. \species{Escherichia coli}, as a model organism with extensive genomic resources, offers an ideal system for demonstrating computational approaches to genomic analysis~\cite{blattner1997complete,tenaillon2010genome}.
169
170
Modern bioinformatics workflows require integration of multiple analysis tools and reproducible computational environments. This template showcases:
171
172
\begin{itemize}
173
\item Automated sequence retrieval and preprocessing using BioPython
174
\item Phylogenetic reconstruction with distance-based methods
175
\item Comparative analysis of genomic features and gene content
176
\item Statistical analysis of sequence diversity and conservation
177
\item Visualization of genomic data and evolutionary relationships
178
\end{itemize}
179
180
The integration of these approaches within CoCalc's environment enables real-time collaborative research and ensures reproducibility through version-controlled computational workflows.
181
182
%=============================================================================
183
% SECTION 2: MATERIALS AND METHODS
184
%=============================================================================
185
\section{Materials and Methods}
186
\label{sec:methods}
187
188
\subsection{Genomic Data Acquisition and Processing}
189
190
For demonstration purposes, we generate synthetic genomic data that mimics real \species{E. coli} genome characteristics. In practice, sequences would be retrieved from NCBI databases using Entrez utilities.
191
192
\begin{pycode}
193
# Generate synthetic E. coli genome data for demonstration
194
# Real analysis would use: Entrez.efetch() from NCBI
195
196
def generate_synthetic_sequence(length, gc_content=0.51, rng=None):
    """Generate a synthetic DNA sequence with a specified GC content.

    Each position is drawn independently: G and C each have probability
    ``gc_content / 2``, A and T each have probability ``(1 - gc_content) / 2``.

    Parameters
    ----------
    length : int
        Number of nucleotides to generate.
    gc_content : float, optional
        Target fraction of G + C, between 0 and 1 (default 0.51).
    rng : object with a ``choice`` method, optional
        Random source, e.g. ``numpy.random.default_rng(seed)``. When omitted,
        the global ``np.random`` state is used.

    Returns
    -------
    str
        A string of ``length`` characters drawn from ``'ATGC'``.

    Notes
    -----
    The function deliberately does NOT reseed the generator. Reseeding with a
    constant on every call made successive calls return (nearly) identical
    sequences, so all simulated strains collapsed onto one another. Seed once
    before the first call (or pass a seeded ``rng``) for reproducibility.
    """
    # Split the GC and AT mass evenly between the two bases of each pair
    gc_prob = gc_content / 2
    at_prob = (1 - gc_content) / 2

    nucleotides = ['A', 'T', 'G', 'C']
    probabilities = [at_prob, at_prob, gc_prob, gc_prob]

    sampler = np.random if rng is None else rng
    sequence = sampler.choice(nucleotides, size=length, p=probabilities)
    return ''.join(sequence)
209
210
# Generate synthetic E. coli strain sequences
211
# Labels of the simulated E. coli strains (no underscores, so they can be
# printed into the LaTeX document unescaped)
strain_names = (
    'EcoliK12 EcoliO157H7 EcoliCFT073 EcoliUTI89 '
    'EcoliEDL933 EcoliMG1655 EcoliDH10B EcoliBL21'
).split()

# Simulate one genomic region per strain (e.g., 16S rRNA gene sequences)
sequence_length = 1500  # Typical 16S rRNA length


def _sample_strain_gc(center=0.50, spread=0.01, lower=0.48, upper=0.54):
    """Draw one GC fraction around ``center`` and clamp it to [lower, upper]."""
    drawn = center + np.random.normal(0, spread)
    return max(lower, min(upper, drawn))


# For each strain: draw its GC content first, then generate its sequence
genomic_sequences = {
    name: generate_synthetic_sequence(sequence_length, _sample_strain_gc())
    for name in strain_names
}

print(f"Generated synthetic genomic sequences for {len(strain_names)} E. coli strains")
print(f"Sequence length: {sequence_length} bp")
230
231
# Calculate basic sequence statistics
232
def _composition(seq):
    """Return length, GC/AT fractions and per-base counts for one sequence."""
    counts = {base: seq.count(base) for base in 'ATGC'}
    size = len(seq)
    return {
        'Length': size,
        'GCcontent': (counts['G'] + counts['C']) / size,
        'ATcontent': (counts['A'] + counts['T']) / size,
        'Acount': counts['A'],
        'Tcount': counts['T'],
        'Gcount': counts['G'],
        'Ccount': counts['C'],
    }


# One composition record per strain, keyed by strain name
sequence_stats = {
    name: _composition(seq) for name, seq in genomic_sequences.items()
}

# Strains become rows, statistics become columns
stats_df = pd.DataFrame.from_dict(sequence_stats, orient='index')
print("\nSequence composition statistics:")
# Compact layout; column names avoid characters that are special in LaTeX
print(stats_df.round(4).to_string(max_cols=6, max_colwidth=12))
252
\end{pycode}
253
254
\subsection{Sequence Alignment and Distance Calculation}
255
256
We perform multiple sequence alignment and calculate evolutionary distances between strains:
257
258
\begin{pycode}
259
# Calculate pairwise sequence distances
260
def hamming_distance(seq1, seq2):
    """Count the positions at which two equal-length sequences differ.

    Raises ValueError when the sequences have different lengths.
    """
    if len(seq1) == len(seq2):
        mismatches = 0
        for left, right in zip(seq1, seq2):
            if left != right:
                mismatches += 1
        return mismatches

    raise ValueError("Sequences must be of equal length")
266
267
# Create distance matrix
268
strains = list(genomic_sequences.keys())
n_strains = len(strains)
distance_matrix = np.zeros((n_strains, n_strains))

# The proportion of differing sites is symmetric, so each unordered pair is
# compared once and mirrored; the diagonal (self-distance) stays at zero.
for i in range(n_strains):
    for j in range(i + 1, n_strains):
        seq_i = genomic_sequences[strains[i]]
        seq_j = genomic_sequences[strains[j]]
        # Convert to evolutionary distance (proportion of differences)
        evolutionary_distance = hamming_distance(seq_i, seq_j) / len(seq_i)
        distance_matrix[i, j] = evolutionary_distance
        distance_matrix[j, i] = evolutionary_distance

# Convert to DataFrame for better presentation
distance_df = pd.DataFrame(distance_matrix, index=strains, columns=strains)

print("Pairwise evolutionary distances (proportion of differences):")
# Print in a compact format to avoid overfull boxes
print(distance_df.round(4).to_string(max_cols=8, max_colwidth=10))

# Mean distance from each strain to the OTHER strains. Averaging over the
# whole row would include the zero self-distance and understate the mean,
# so divide the row sum by (n - 1); max(..., 1) guards the single-strain case.
other_strains = max(n_strains - 1, 1)
mean_distances = distance_df.sum(axis=1) / other_strains
print("\nMean evolutionary distances:")
for strain, dist in mean_distances.items():
    print(f"{strain}: {dist:.4f}")
293
\end{pycode}
294
295
\subsection{Phylogenetic Analysis}
296
297
We reconstruct phylogenetic relationships using distance-based methods:
298
299
\begin{pycode}
300
# Phylogenetic reconstruction using hierarchical clustering
301
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform

# linkage() expects the condensed (upper-triangle) form of the square matrix
condensed_distances = squareform(distance_matrix)

# Average linkage on the pairwise distances (UPGMA)
linkage_matrix = linkage(condensed_distances, method='average')

print("Phylogenetic reconstruction completed using UPGMA method")
print(f"Linkage matrix shape: {linkage_matrix.shape}")


def _describe_node(node_id):
    """Return the leaf index for original strains, else a 'ClusterN' label."""
    as_int = int(node_id)
    return as_int if node_id < len(strains) else f"Cluster{as_int}"


# Each linkage row is (node a, node b, merge height, size of the new cluster)
cluster_info = [
    {
        'step': step,
        'cluster1': _describe_node(left),
        'cluster2': _describe_node(right),
        'distance': height,
        'size': int(members),
    }
    for step, (left, right, height, members) in enumerate(linkage_matrix, start=1)
]

cluster_df = pd.DataFrame(cluster_info)
print("\nClustering steps:")
print(cluster_df.round(4).to_string(max_cols=6, max_colwidth=12))
327
\end{pycode}
328
329
330
%=============================================================================
331
% SECTION 3: RESULTS
332
%=============================================================================
333
\section{Results}
334
\label{sec:results}
335
336
\subsection{Genomic Sequence Composition Analysis}
337
338
\Cref{fig:sequence_composition} presents the nucleotide composition analysis across all analyzed \species{E. coli} strains, revealing patterns of genomic diversity.
339
340
\begin{pycode}
341
import os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# The figure is written into ./figures, so make sure the directory exists
os.makedirs('figures', exist_ok=True)

# Strain labels without the shared 'Ecoli' prefix, reused by every panel
short_labels = [s.replace('Ecoli', '') for s in strains]

# 2x2 panel summarising sequence composition
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('E. coli Strain Genomic Sequence Composition Analysis',
             fontsize=16, fontweight='bold')

# Top left: GC content distribution with the mean marked
ax1 = axes[0, 0]
gc_values = stats_df['GCcontent'].values
ax1.hist(gc_values, bins=8, alpha=0.7, color='skyblue', edgecolor='black')
ax1.axvline(gc_values.mean(), color='red', linestyle='--', linewidth=2,
            label=f'Mean: {gc_values.mean():.3f}')
ax1.set_xlabel('GC Content')
ax1.set_ylabel('Number of Strains')
ax1.set_title('GC Content Distribution')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Top right: nucleotide count heatmap (bases as rows, strains as columns)
ax2 = axes[0, 1]
nucleotide_data = stats_df[['Acount', 'Tcount', 'Gcount', 'Ccount']].T
sns.heatmap(nucleotide_data, annot=True, fmt='d', cmap='YlOrRd', ax=ax2,
            xticklabels=short_labels,
            yticklabels=['A', 'T', 'G', 'C'])
ax2.set_title('Nucleotide Counts by Strain')
ax2.set_xlabel('E. coli Strains')

# Bottom left: GC vs AT content scatter plot, one labelled point per strain
ax3 = axes[1, 0]
ax3.scatter(stats_df['GCcontent'], stats_df['ATcontent'],
            s=80, alpha=0.7, c=range(len(strains)), cmap='viridis')
ax3.set_xlabel('GC Content')
ax3.set_ylabel('AT Content')
ax3.set_title('GC vs AT Content Relationship')
for strain, label in zip(strains, short_labels):
    ax3.annotate(label,
                 (stats_df.loc[strain, 'GCcontent'],
                  stats_df.loc[strain, 'ATcontent']),
                 xytext=(5, 5), textcoords='offset points', fontsize=8)
ax3.grid(True, alpha=0.3)

# Bottom right: summed pairwise distance per strain
ax4 = axes[1, 1]
diversity_values = [distance_df.loc[strain].sum() for strain in strains]
ax4.bar(range(len(strains)), diversity_values,
        color=plt.cm.Set3(np.linspace(0, 1, len(strains))))
ax4.set_xlabel('E. coli Strains')
ax4.set_ylabel('Total Evolutionary Distance')
ax4.set_title('Genomic Diversity Index')
ax4.set_xticks(range(len(strains)))
ax4.set_xticklabels(short_labels, rotation=45)
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('figures/sequence_composition.pdf', dpi=300, bbox_inches='tight')
plt.close()

# Output printed from a pycode block is typeset as LaTeX, so the underscore in
# the file name is escaped rather than dropped; the message now names the
# file that was actually written.
print(r"Figure saved to figures/sequence\_composition.pdf")
408
\end{pycode}
409
410
\begin{figure}[H]
411
\centering
412
\IfFileExists{figures/sequence_composition.pdf}{%
413
\includegraphics[width=0.95\textwidth]{figures/sequence_composition.pdf}%
414
}{%
415
\fbox{\parbox{0.95\textwidth}{\centering\vspace{2cm}Figure will be generated on next compilation run\vspace{2cm}}}%
416
}
417
\caption{Comprehensive genomic sequence composition analysis of \species{E. coli} strains. (Top left) GC content distribution showing the typical range for \species{E. coli} genomes. (Top right) Nucleotide count heatmap revealing strain-specific composition patterns. (Bottom left) GC vs AT content relationship; the two fractions sum to one by definition, so the points fall on a straight line. (Bottom right) Genomic diversity index based on cumulative evolutionary distances, highlighting the most divergent strains.}
418
\label{fig:sequence_composition}
419
\end{figure}
420
421
\subsection{Phylogenetic Relationships and Evolutionary Distances}
422
423
The phylogenetic analysis reveals evolutionary relationships among \species{E. coli} strains, as illustrated in \Cref{fig:phylogenetic_analysis}.
424
425
\begin{pycode}
426
# Create phylogenetic analysis visualization
427
# Strain labels without the shared 'Ecoli' prefix, reused by every panel
short_labels = [s.replace('Ecoli', '') for s in strains]

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Phylogenetic Analysis of E. coli Strain Relationships',
             fontsize=16, fontweight='bold')

# Top left: dendrogram (phylogenetic tree)
ax1 = axes[0, 0]
dendrogram_result = dendrogram(linkage_matrix,
                               labels=short_labels,
                               ax=ax1, leaf_rotation=90, leaf_font_size=10)
ax1.set_title('UPGMA Phylogenetic Tree')
ax1.set_xlabel('E. coli Strains')
ax1.set_ylabel('Evolutionary Distance')

# Top right: distance matrix heatmap. The matrix is symmetric, so the upper
# triangle (including the diagonal) is masked and only the lower part is drawn.
ax2 = axes[0, 1]
mask = np.triu(np.ones_like(distance_matrix, dtype=bool))
# The row labels previously stripped 'E_coli_', which never occurs in the
# strain names, so rows kept the full name while columns were shortened.
sns.heatmap(distance_df, mask=mask, annot=True, fmt='.3f',
            cmap='RdYlBu_r', ax=ax2,
            xticklabels=short_labels,
            yticklabels=short_labels)
ax2.set_title('Pairwise Evolutionary Distance Matrix')

# Bottom left: the same clustering drawn horizontally
ax3 = axes[1, 0]
dendrogram(linkage_matrix,
           labels=short_labels,
           ax=ax3, orientation='left', leaf_font_size=10)
ax3.set_title('Horizontal Dendrogram View')
ax3.set_xlabel('Evolutionary Distance')

# Bottom right: distribution of pairwise distances
ax4 = axes[1, 1]
# k=1 selects the strict upper triangle: each pair once, diagonal excluded
all_distances = distance_matrix[np.triu_indices_from(distance_matrix, k=1)]

ax4.hist(all_distances, bins=15, alpha=0.7, color='lightcoral',
         edgecolor='black', density=True)
ax4.axvline(all_distances.mean(), color='red', linestyle='--', linewidth=2,
            label=f'Mean: {all_distances.mean():.4f}')
ax4.axvline(np.median(all_distances), color='blue', linestyle='--', linewidth=2,
            label=f'Median: {np.median(all_distances):.4f}')
ax4.set_xlabel('Evolutionary Distance')
ax4.set_ylabel('Density')
ax4.set_title('Distribution of Pairwise Distances')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('figures/phylogenetic_analysis.pdf', dpi=300, bbox_inches='tight')
plt.close()

# Print summary statistics
print("\nPhylogenetic Analysis Summary:")
print(f"Number of strains analyzed: {len(strains)}")
print(f"Mean pairwise distance: {all_distances.mean():.4f}")
print(f"Standard deviation: {all_distances.std():.4f}")
print(f"Minimum distance: {all_distances.min():.4f}")
print(f"Maximum distance: {all_distances.max():.4f}")
485
\end{pycode}
486
487
\begin{figure}[H]
488
\centering
489
% The PDF is produced by PythonTeX, so it does not exist on the first LaTeX
% pass; show a placeholder instead of failing with "File not found".
\IfFileExists{figures/phylogenetic_analysis.pdf}{%
\includegraphics[width=0.95\textwidth]{figures/phylogenetic_analysis.pdf}%
}{%
\fbox{\parbox{0.95\textwidth}{\centering\vspace{2cm}Figure will be generated on next compilation run\vspace{2cm}}}%
}
490
\caption{Phylogenetic analysis of \species{E. coli} strain relationships using distance-based methods. (Top left) UPGMA phylogenetic tree showing evolutionary relationships and clustering patterns. (Top right) Lower-triangle heatmap of the symmetric pairwise evolutionary distance matrix. (Bottom left) Horizontal dendrogram view for detailed examination of clustering hierarchy. (Bottom right) Distribution of pairwise evolutionary distances with statistical measures.}
491
\label{fig:phylogenetic_analysis}
492
\end{figure}
493
494
\subsection{Comparative Genomics and Functional Analysis}
495
496
We analyze genomic features and simulate gene content analysis across strains:
497
498
\begin{pycode}
499
# Simulate gene content and functional analysis
500
np.random.seed(42)  # reproducible presence/absence draws

# Functional categories simulated for every strain
gene_families = [
    'CoreMetabolism', 'DNArepair', 'CellWall', 'Transport', 'Regulation',
    'Pathogenicity', 'AntibioticResistance', 'MobileElements', 'StressResponse',
    'SecretionSystems', 'Chemotaxis', 'FlagellarBiosynthesis'
]

# Rows are strains, columns are gene families; 1 = present, 0 = absent
gene_matrix = np.random.binomial(1, 0.7, size=(len(strains), len(gene_families)))

# Core families are forced to be present in every strain
core_genes = ['CoreMetabolism', 'DNArepair', 'CellWall']
core_columns = [
    column for column, family in enumerate(gene_families) if family in core_genes
]
gene_matrix[:, core_columns] = 1

gene_content_df = pd.DataFrame(gene_matrix,
                               index=strains,
                               columns=gene_families)

print("Gene family presence/absence matrix:")
print(gene_content_df.to_string(max_cols=8, max_colwidth=15))

# Fraction of strains carrying each gene family
conservation_scores = gene_content_df.sum(axis=0) / len(strains)
print("\nGene family conservation scores:")
for family_name, fraction in conservation_scores.items():
    print(f"{family_name}: {fraction:.2f}")

# Number of gene families present in each strain
strain_diversity = gene_content_df.sum(axis=1)
print("\nStrain gene family diversity:")
for strain_name, family_count in strain_diversity.items():
    print(f"{strain_name}: {family_count} gene families")
537
\end{pycode}
538
539
\subsection{Gene Content and Functional Diversity Visualization}
540
541
\Cref{fig:gene_content_analysis} illustrates the distribution of gene families across \species{E. coli} strains and functional diversity patterns.
542
543
\begin{pycode}
544
# Create gene content and functional analysis visualization
545
# Strain labels without the shared 'Ecoli' prefix, reused by every panel
short_labels = [s.replace('Ecoli', '') for s in strains]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Gene Content and Functional Diversity Analysis',
             fontsize=16, fontweight='bold')

# Top left: gene presence/absence heatmap (families as rows, strains as columns)
ax1 = axes[0, 0]
sns.heatmap(gene_content_df.T, cmap='RdYlGn', cbar_kws={'label': 'Gene Presence'},
            xticklabels=short_labels,
            yticklabels=gene_families, ax=ax1)
ax1.set_title('Gene Family Presence/Absence Matrix')
ax1.set_xlabel('E. coli Strains')
ax1.set_ylabel('Gene Families')

# Top right: conservation score per gene family, coloured by the score itself
ax2 = axes[0, 1]
ax2.bar(range(len(gene_families)), conservation_scores.values,
        color=plt.cm.viridis(conservation_scores.values))
ax2.set_xlabel('Gene Families')
ax2.set_ylabel('Conservation Score')
ax2.set_title('Gene Family Conservation Across Strains')
ax2.set_xticks(range(len(gene_families)))
ax2.set_xticklabels(gene_families, rotation=45, ha='right')
ax2.grid(True, alpha=0.3)

# Reference line marking the high-conservation threshold
ax2.axhline(y=0.8, color='red', linestyle='--', alpha=0.7,
            label='High conservation threshold')
ax2.legend()

# Bottom left: number of gene families per strain
ax3 = axes[1, 0]
diversity_colors = plt.cm.Set3(np.linspace(0, 1, len(strains)))
ax3.bar(range(len(strains)), strain_diversity.values, color=diversity_colors)
ax3.set_xlabel('E. coli Strains')
ax3.set_ylabel('Number of Gene Families')
ax3.set_title('Gene Family Diversity by Strain')
ax3.set_xticks(range(len(strains)))
# These tick labels previously stripped 'E_coli_', which never occurs in the
# strain names, so this panel showed full names unlike the other panels.
ax3.set_xticklabels(short_labels, rotation=45)
ax3.grid(True, alpha=0.3)

# Bottom right: clustering of strains by gene content
ax4 = axes[1, 1]
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import dendrogram, linkage

# Jaccard distance between the binary presence/absence rows, then average linkage
gene_distances = pdist(gene_matrix, metric='jaccard')
gene_linkage = linkage(gene_distances, method='average')

dendrogram(gene_linkage,
           labels=short_labels,
           ax=ax4, leaf_rotation=90, leaf_font_size=10)
ax4.set_title('Gene Content-Based Clustering')
ax4.set_xlabel('E. coli Strains')
ax4.set_ylabel('Jaccard Distance')

plt.tight_layout()
plt.savefig('figures/gene_content_analysis.pdf', dpi=300, bbox_inches='tight')
plt.close()
604
\end{pycode}
605
606
\begin{figure}[H]
607
\centering
608
% The PDF is produced by PythonTeX, so it does not exist on the first LaTeX
% pass; show a placeholder instead of failing with "File not found".
\IfFileExists{figures/gene_content_analysis.pdf}{%
\includegraphics[width=0.95\textwidth]{figures/gene_content_analysis.pdf}%
}{%
\fbox{\parbox{0.95\textwidth}{\centering\vspace{2cm}Figure will be generated on next compilation run\vspace{2cm}}}%
}
609
\caption{Gene content and functional diversity analysis across \species{E. coli} strains. (Top left) Gene family presence/absence heatmap showing strain-specific gene content patterns. (Top right) Conservation scores for different gene families, with core metabolic functions showing highest conservation. (Bottom left) Gene family diversity by strain, indicating variable gene content across strains. (Bottom right) Gene content-based clustering using Jaccard distances, revealing functional similarity patterns independent of phylogenetic relationships.}
610
\label{fig:gene_content_analysis}
611
\end{figure}
612
613
%=============================================================================
614
% SECTION 4: DISCUSSION
615
%=============================================================================
616
\section{Discussion}
617
\label{sec:discussion}
618
619
\subsection{Genomic Diversity and Evolutionary Patterns}
620
621
Our analysis reveals significant genomic diversity among \species{E. coli} strains, with pairwise evolutionary distances ranging from \py{f"{all_distances.min():.4f}"} to \py{f"{all_distances.max():.4f}"}. This diversity reflects the adaptive potential and evolutionary flexibility of \species{E. coli} in diverse environments~\cite{touchon2009organised}.
622
623
The phylogenetic reconstruction using UPGMA clustering provides insights into strain relationships, though real-world analyses would benefit from more sophisticated methods such as maximum likelihood or Bayesian approaches. The observed clustering patterns suggest both clonal evolution and horizontal gene transfer events, consistent with bacterial evolutionary mechanisms.
624
625
\subsection{Gene Content Variation and Functional Implications}
626
627
The gene content analysis reveals important patterns in functional diversity:
628
629
\begin{enumerate}
630
\item \textbf{Core genome conservation}: Essential functions like metabolism and DNA repair show universal presence, supporting their fundamental importance.
631
632
\item \textbf{Accessory genome variation}: Pathogenicity, antibiotic resistance, and mobile elements show variable presence, reflecting niche-specific adaptations.
633
634
\item \textbf{Strain-specific profiles}: Different strains exhibit distinct gene content signatures, with diversity scores ranging from \py{f"{strain_diversity.min()}"} to \py{f"{strain_diversity.max()}"} gene families.
635
\end{enumerate}
636
637
\subsection{Methodological Considerations and CoCalc Integration}
638
639
This template demonstrates several advantages of computational genomics in CoCalc:
640
641
\begin{itemize}
642
\item \textbf{Reproducible workflows}: All analyses are embedded within the document, ensuring reproducibility across different environments.
643
644
\item \textbf{Real-time collaboration}: Multiple researchers can simultaneously work on different aspects of the analysis.
645
646
\item \textbf{Integrated visualization}: Figures are generated directly from analysis code, maintaining consistency between data and presentation.
647
648
\item \textbf{Version control}: CoCalc's TimeTravel feature enables tracking of analysis evolution and collaborative contributions.
649
\end{itemize}
650
651
\subsection{Future Directions and Extensions}
652
653
This template provides a foundation for more sophisticated genomic analyses:
654
655
\begin{enumerate}
656
\item \textbf{Real sequence data}: Integration with NCBI databases for authentic genomic sequences
657
\item \textbf{Advanced phylogenetics}: Implementation of maximum likelihood and Bayesian methods
658
\item \textbf{Functional annotation}: Integration with COG, KEGG, and GO databases
659
\item \textbf{Comparative genomics}: Synteny analysis and genome rearrangement detection
660
\item \textbf{Population genomics}: SNP analysis and population structure assessment
661
\end{enumerate}
662
663
%=============================================================================
664
% SECTION 5: CONCLUSIONS
665
%=============================================================================
666
\section{Conclusions}
667
\label{sec:conclusions}
668
669
This bioinformatics template demonstrates the power of integrating computational genomics with professional scientific writing in CoCalc. The combination of BioPython for sequence analysis, statistical modeling for phylogenetics, and automated visualization creates a comprehensive workflow for genomic research.
670
671
Key contributions include:
672
673
\begin{itemize}
674
\item Reproducible genomic analysis workflows with live code execution
675
\item Comprehensive visualization of phylogenetic and functional diversity
676
\item Integration of multiple bioinformatics approaches within a single document
677
\item Collaborative framework supporting team-based genomic research
678
\item Flexible foundation adaptable to various genomic research questions
679
\end{itemize}
680
681
The template serves as a starting point for researchers in comparative genomics, microbial ecology, and evolutionary biology, providing both methodological guidance and practical implementation examples optimized for CoCalc's unique collaborative environment.
682
683
%=============================================================================
684
% ACKNOWLEDGMENTS
685
%=============================================================================
686
\section*{Acknowledgments}
687
688
We thank the BioPython development team for creating essential tools for computational biology. We acknowledge NCBI for providing comprehensive genomic databases and CoCalc for enabling collaborative bioinformatics research workflows.
689
690
%=============================================================================
691
% REFERENCES
692
%=============================================================================
693
\printbibliography
694
695
\end{document}
696