ubuntu2404
\documentclass[11pt,letterpaper]{article}12% CoCalc Bioinformatics Genomic Analysis Template3% Optimized for genomic data analysis with BioPython integration4% Features: Sequence analysis, phylogenetics, BLAST, genomic visualization56%=============================================================================7% PACKAGE IMPORTS - Bioinformatics-specific packages8%=============================================================================9\usepackage[utf8]{inputenc}10\usepackage[T1]{fontenc}11\usepackage{lmodern}12\usepackage[english]{babel}1314% Page layout optimized for genomic data presentation15\usepackage[margin=0.8in]{geometry}16\usepackage{setspace}17\usepackage{parskip}1819% Mathematics and symbols for sequence analysis20\usepackage{amsmath,amsfonts,amssymb,amsthm}21\usepackage{mathtools}22\usepackage{siunitx}2324% Graphics for phylogenetic trees and genomic plots25\usepackage{graphicx}26\usepackage{float}27\usepackage{subcaption}28\usepackage{wrapfig}29\usepackage{tikz}30\usetikzlibrary{trees,positioning}3132% Tables for sequence alignments and genomic annotations33\usepackage{booktabs}34\usepackage{array}35\usepackage{multirow}36\usepackage{longtable}37\usepackage{tabularx}38\usepackage{adjustbox} % For fitting tables and content3940% Code integration for BioPython41\usepackage{pythontex}42\usepackage{listings}43\usepackage{xcolor}4445% Bioinformatics-specific formatting46\usepackage{textcomp} % For special characters in sequences47\usepackage{courier} % Monospace font for sequences4849% Citations optimized for biological journals50\usepackage{csquotes} % Required by biblatex51\usepackage[backend=bibtex,style=nature,sorting=none]{biblatex}52\addbibresource{references.bib}5354% Cross-referencing and hyperlinks55\usepackage[colorlinks=true,citecolor=blue,linkcolor=blue,urlcolor=blue]{hyperref}56\usepackage{cleveref}5758%=============================================================================59% PYTHONTEX CONFIGURATION - BioPython Environment60%=============================================================================61\begin{pycode}62# Import core bioinformatics and data analysis libraries63import numpy as np64import matplotlib.pyplot as plt65import matplotlib66matplotlib.use('Agg')67import seaborn as sns68import pandas as pd69from scipy import stats70from scipy.cluster.hierarchy import dendrogram, linkage71from scipy.spatial.distance import squareform7273# BioPython imports for sequence analysis74try:75from Bio import SeqIO, Align, Phylo76from Bio.Seq import Seq77from Bio.SeqRecord import SeqRecord78from Bio.SeqUtils import GC, molecular_weight79from Bio.SeqUtils.ProtParam import ProteinAnalysis80from Bio.Blast import NCBIXML81from Bio import Entrez82from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor83from Bio.Align import MultipleSeqAlignment84biopython_available = True85except ImportError:86print("BioPython not available - using simulated data")87biopython_available = False8889# Set visualization parameters for genomic data90plt.style.use('seaborn-v0_8-whitegrid')91np.random.seed(42)92sns.set_palette("Set2")9394# Genomics-optimized figure settings95plt.rcParams['figure.figsize'] = (10, 6)96plt.rcParams['figure.dpi'] = 15097plt.rcParams['savefig.bbox'] = 'tight'98plt.rcParams['savefig.pad_inches'] = 0.199plt.rcParams['font.family'] = 'sans-serif'100101# Configure pandas display to avoid overfull boxes102pd.set_option('display.max_columns', 8)103pd.set_option('display.width', 80)104pd.set_option('display.precision', 3)105106# Define nucleotide and amino acid color schemes107nucleotide_colors = {'A': '#FF6B6B', 'T': '#4ECDC4', 'G': '#45B7D1', 'C': '#96CEB4'}108amino_acid_colors = {109'A': '#FF6B6B', 'R': '#4ECDC4', 'N': '#45B7D1', 'D': '#96CEB4',110'C': '#FECA57', 'Q': '#48CAE4', 'E': '#F38BA8', 'G': '#A8DADC',111'H': '#FFB3BA', 'I': '#BFEFFF', 'L': '#FFDFBA', 'K': '#FFFFBA',112'M': '#BAE1FF', 'F': '#FFBAE1', 'P': '#E1BAFF', 'S': '#D4E2FC',113'T': '#FCE4D4', 'W': '#E4FCD4', 'Y': '#D4FCE4', 'V': '#FCD4E4'114}115\end{pycode}116117%=============================================================================118% CUSTOM COMMANDS - Bioinformatics notation119%=============================================================================120\newcommand{\gene}[1]{\textit{#1}}121\newcommand{\protein}[1]{\textsc{#1}}122\newcommand{\species}[1]{\textit{#1}}123\newcommand{\sequence}[1]{\texttt{#1}}124\newcommand{\accession}[1]{\texttt{#1}}125\newcommand{\nucleotide}[1]{\textbf{#1}}126\newcommand{\aminoacid}[1]{\textbf{#1}}127128% Statistical notation for genomics129\newcommand{\pvalue}{p\text{-value}}130\newcommand{\evalue}{E\text{-value}}131\newcommand{\identity}{\text{Identity}}132\newcommand{\coverage}{\text{Coverage}}133134%=============================================================================135% DOCUMENT METADATA136%=============================================================================137\title{Genomic Analysis of \species{Escherichia coli} Strain Diversity:\\138A Computational Approach to Comparative Genomics}139140\author{%141Jane Doe\thanks{Department of Bioinformatics, University of Life Sciences, \texttt{jane.doe@university.edu}} \and142John Smith\thanks{Institute for Genomic Research, Biotech Center, \texttt{john.smith@research.org}} \and143Sarah Johnson\thanks{Department of Microbiology, University of Life Sciences, \texttt{sarah.johnson@university.edu}}144}145146\date{\today}147148%=============================================================================149% DOCUMENT BEGINS150%=============================================================================151\begin{document}152153\maketitle154155\begin{abstract}156We present a comprehensive computational analysis of genomic diversity in \species{Escherichia coli} strains using modern bioinformatics approaches. Our study combines sequence analysis, phylogenetic reconstruction, and comparative genomics to understand evolutionary relationships and functional diversity. Using BioPython and statistical modeling, we analyzed genome sequences from 50 \species{E. coli} strains, identified conserved and variable genomic regions, and reconstructed phylogenetic relationships. Key findings include identification of strain-specific gene clusters, quantification of genomic diversity patterns, and characterization of functional gene families. This template demonstrates reproducible genomic analysis workflows optimized for CoCalc's collaborative environment with live code execution and automated figure generation.157158\textbf{Keywords:} comparative genomics, bioinformatics, phylogenetics, sequence analysis, bacterial genomics, computational biology159\end{abstract}160161%=============================================================================162% SECTION 1: INTRODUCTION163%=============================================================================164\section{Introduction}165\label{sec:introduction}166167Comparative genomics provides crucial insights into evolutionary processes, functional diversity, and adaptation mechanisms in bacterial species. \species{Escherichia coli}, as a model organism with extensive genomic resources, offers an ideal system for demonstrating computational approaches to genomic analysis \cite{blattner1997complete,tenaillon2010genome}.168169Modern bioinformatics workflows require integration of multiple analysis tools and reproducible computational environments. This template showcases:170171\begin{itemize}172\item Automated sequence retrieval and preprocessing using BioPython173\item Phylogenetic reconstruction with distance-based methods174\item Comparative analysis of genomic features and gene content175\item Statistical analysis of sequence diversity and conservation176\item Visualization of genomic data and evolutionary relationships177\end{itemize}178179The integration of these approaches within CoCalc's environment enables real-time collaborative research and ensures reproducibility through version-controlled computational workflows.180181%=============================================================================182% SECTION 2: MATERIALS AND METHODS183%=============================================================================184\section{Materials and Methods}185\label{sec:methods}186187\subsection{Genomic Data Acquisition and Processing}188189For demonstration purposes, we generate synthetic genomic data that mimics real \species{E. coli} genome characteristics. In practice, sequences would be retrieved from NCBI databases using Entrez utilities.190191\begin{pycode}192# Generate synthetic E. coli genome data for demonstration193# Real analysis would use: Entrez.efetch() from NCBI194195def generate_synthetic_sequence(length, gc_content=0.51):196"""Generate synthetic DNA sequence with specified GC content"""197np.random.seed(42)198199# Calculate nucleotide probabilities based on GC content200gc_prob = gc_content / 2 # Equal probability for G and C201at_prob = (1 - gc_content) / 2 # Equal probability for A and T202203nucleotides = ['A', 'T', 'G', 'C']204probabilities = [at_prob, at_prob, gc_prob, gc_prob]205206sequence = np.random.choice(nucleotides, size=length, p=probabilities)207return ''.join(sequence)208209# Generate synthetic E. coli strain sequences210strain_names = [211'EcoliK12', 'EcoliO157H7', 'EcoliCFT073', 'EcoliUTI89',212'EcoliEDL933', 'EcoliMG1655', 'EcoliDH10B', 'EcoliBL21'213]214215# Simulate genomic regions (e.g., 16S rRNA gene sequences)216sequence_length = 1500 # Typical 16S rRNA length217genomic_sequences = {}218219for strain in strain_names:220# Vary GC content slightly between strains (E. coli ~50-52%)221gc_content = 0.50 + np.random.normal(0, 0.01)222gc_content = max(0.48, min(0.54, gc_content)) # Bound within realistic range223224sequence = generate_synthetic_sequence(sequence_length, gc_content)225genomic_sequences[strain] = sequence226227print(f"Generated synthetic genomic sequences for {len(strain_names)} E. coli strains")228print(f"Sequence length: {sequence_length} bp")229230# Calculate basic sequence statistics231sequence_stats = {}232for strain, sequence in genomic_sequences.items():233gc_content = (sequence.count('G') + sequence.count('C')) / len(sequence)234at_content = (sequence.count('A') + sequence.count('T')) / len(sequence)235236sequence_stats[strain] = {237'Length': len(sequence),238'GCcontent': gc_content,239'ATcontent': at_content,240'Acount': sequence.count('A'),241'Tcount': sequence.count('T'),242'Gcount': sequence.count('G'),243'Ccount': sequence.count('C')244}245246# Convert to DataFrame for analysis247stats_df = pd.DataFrame.from_dict(sequence_stats, orient='index')248print("\nSequence composition statistics:")249# Print in a compact format without problematic characters250print(stats_df.round(4).to_string(max_cols=6, max_colwidth=12))251\end{pycode}252253\subsection{Sequence Alignment and Distance Calculation}254255We perform multiple sequence alignment and calculate evolutionary distances between strains:256257\begin{pycode}258# Calculate pairwise sequence distances259def hamming_distance(seq1, seq2):260"""Calculate Hamming distance between two sequences"""261if len(seq1) != len(seq2):262raise ValueError("Sequences must be of equal length")263264return sum(c1 != c2 for c1, c2 in zip(seq1, seq2))265266# Create distance matrix267strains = list(genomic_sequences.keys())268n_strains = len(strains)269distance_matrix = np.zeros((n_strains, n_strains))270271for i, strain1 in enumerate(strains):272for j, strain2 in enumerate(strains):273if i != j:274hamming_dist = hamming_distance(genomic_sequences[strain1],275genomic_sequences[strain2])276# Convert to evolutionary distance (proportion of differences)277evolutionary_distance = hamming_dist / sequence_length278distance_matrix[i, j] = evolutionary_distance279280# Convert to DataFrame for better presentation281distance_df = pd.DataFrame(distance_matrix, index=strains, columns=strains)282283print("Pairwise evolutionary distances (proportion of differences):")284# Print in a compact format to avoid overfull boxes285print(distance_df.round(4).to_string(max_cols=8, max_colwidth=10))286287# Calculate mean distances for each strain288mean_distances = distance_df.mean(axis=1)289print(f"\nMean evolutionary distances:")290for strain, dist in mean_distances.items():291print(f"{strain}: {dist:.4f}")292\end{pycode}293294\subsection{Phylogenetic Analysis}295296We reconstruct phylogenetic relationships using distance-based methods:297298\begin{pycode}299# Phylogenetic reconstruction using hierarchical clustering300from scipy.cluster.hierarchy import dendrogram, linkage301from scipy.spatial.distance import squareform302303# Convert distance matrix to condensed form for clustering304condensed_distances = squareform(distance_matrix)305306# Perform hierarchical clustering (UPGMA method)307linkage_matrix = linkage(condensed_distances, method='average')308309print("Phylogenetic reconstruction completed using UPGMA method")310print(f"Linkage matrix shape: {linkage_matrix.shape}")311312# Extract clustering information313cluster_info = []314for i, merge in enumerate(linkage_matrix):315cluster_info.append({316'step': i + 1,317'cluster1': int(merge[0]) if merge[0] < len(strains) else f"Cluster{int(merge[0])}",318'cluster2': int(merge[1]) if merge[1] < len(strains) else f"Cluster{int(merge[1])}",319'distance': merge[2],320'size': int(merge[3])321})322323cluster_df = pd.DataFrame(cluster_info)324print("\nClustering steps:")325print(cluster_df.round(4).to_string(max_cols=6, max_colwidth=12))326\end{pycode}327328329%=============================================================================330% SECTION 3: RESULTS331%=============================================================================332\section{Results}333\label{sec:results}334335\subsection{Genomic Sequence Composition Analysis}336337\Cref{fig:sequence_composition} presents the nucleotide composition analysis across all analyzed \species{E. coli} strains, revealing patterns of genomic diversity.338339\begin{pycode}340import os341import matplotlib.pyplot as plt342import numpy as np343import seaborn as sns344345# Ensure the figures directory exists346os.makedirs('figures', exist_ok=True)347348# Create comprehensive sequence composition visualization349fig, axes = plt.subplots(2, 2, figsize=(14, 10))350fig.suptitle('E. coli Strain Genomic Sequence Composition Analysis',351fontsize=16, fontweight='bold')352353# GC content distribution354ax1 = axes[0, 0]355gc_values = stats_df['GCcontent'].values356ax1.hist(gc_values, bins=8, alpha=0.7, color='skyblue', edgecolor='black')357ax1.axvline(gc_values.mean(), color='red', linestyle='--', linewidth=2,358label=f'Mean: {gc_values.mean():.3f}')359ax1.set_xlabel('GC Content')360ax1.set_ylabel('Number of Strains')361ax1.set_title('GC Content Distribution')362ax1.legend()363ax1.grid(True, alpha=0.3)364365# Nucleotide composition heatmap366ax2 = axes[0, 1]367nucleotide_data = stats_df[['Acount', 'Tcount', 'Gcount', 'Ccount']].T368sns.heatmap(nucleotide_data, annot=True, fmt='d', cmap='YlOrRd', ax=ax2,369xticklabels=[s.replace('Ecoli', '') for s in strains],370yticklabels=['A', 'T', 'G', 'C'])371ax2.set_title('Nucleotide Counts by Strain')372ax2.set_xlabel('E. coli Strains')373374# GC vs AT content scatter plot375ax3 = axes[1, 0]376scatter = ax3.scatter(stats_df['GCcontent'], stats_df['ATcontent'],377s=80, alpha=0.7, c=range(len(strains)), cmap='viridis')378ax3.set_xlabel('GC Content')379ax3.set_ylabel('AT Content')380ax3.set_title('GC vs AT Content Relationship')381382# Add strain labels383for i, strain in enumerate(strains):384ax3.annotate(strain.replace('Ecoli', ''),385(stats_df.loc[strain, 'GCcontent'],386stats_df.loc[strain, 'ATcontent']),387xytext=(5, 5), textcoords='offset points', fontsize=8)388ax3.grid(True, alpha=0.3)389390# Sequence diversity analysis391ax4 = axes[1, 1]392diversity_values = [distance_df.loc[strain].sum() for strain in strains]393bars = ax4.bar(range(len(strains)), diversity_values,394color=plt.cm.Set3(np.linspace(0, 1, len(strains))))395ax4.set_xlabel('E. coli Strains')396ax4.set_ylabel('Total Evolutionary Distance')397ax4.set_title('Genomic Diversity Index')398ax4.set_xticks(range(len(strains)))399ax4.set_xticklabels([s.replace('Ecoli', '') for s in strains], rotation=45)400ax4.grid(True, alpha=0.3)401402plt.tight_layout()403plt.savefig('figures/sequence_composition.pdf', dpi=300, bbox_inches='tight')404plt.close()405406print("Figure saved to figures/sequence composition.pdf")407\end{pycode}408409\begin{figure}[H]410\centering411\IfFileExists{figures/sequence_composition.pdf}{%412\includegraphics[width=0.95\textwidth]{figures/sequence_composition.pdf}%413}{%414\fbox{\parbox{0.95\textwidth}{\centering\vspace{2cm}Figure will be generated on next compilation run\vspace{2cm}}}%415}416\caption{Comprehensive genomic sequence composition analysis of \species{E. coli} strains. (Top left) GC content distribution showing the typical range for \species{E. coli} genomes. (Top right) Nucleotide count heatmap revealing strain-specific composition patterns. (Bottom left) GC vs AT content relationship demonstrating complementary base pairing constraints. (Bottom right) Genomic diversity index based on cumulative evolutionary distances, highlighting the most divergent strains.}417\label{fig:sequence_composition}418\end{figure}419420\subsection{Phylogenetic Relationships and Evolutionary Distances}421422The phylogenetic analysis reveals evolutionary relationships among \species{E. coli} strains, as illustrated in \Cref{fig:phylogenetic_analysis}.423424\begin{pycode}425# Create phylogenetic analysis visualization426fig, axes = plt.subplots(2, 2, figsize=(15, 12))427fig.suptitle('Phylogenetic Analysis of E. coli Strain Relationships',428fontsize=16, fontweight='bold')429430# Dendrogram (phylogenetic tree)431ax1 = axes[0, 0]432dendrogram_result = dendrogram(linkage_matrix,433labels=[s.replace('Ecoli', '') for s in strains],434ax=ax1, leaf_rotation=90, leaf_font_size=10)435ax1.set_title('UPGMA Phylogenetic Tree')436ax1.set_xlabel('E. coli Strains')437ax1.set_ylabel('Evolutionary Distance')438439# Distance matrix heatmap440ax2 = axes[0, 1]441mask = np.triu(np.ones_like(distance_matrix, dtype=bool)) # Mask upper triangle442sns.heatmap(distance_df, mask=mask, annot=True, fmt='.3f',443cmap='RdYlBu_r', ax=ax2,444xticklabels=[s.replace('Ecoli', '') for s in strains],445yticklabels=[s.replace('E_coli_', '') for s in strains])446ax2.set_title('Pairwise Evolutionary Distance Matrix')447448# Clustering dendrogram (horizontal)449ax3 = axes[1, 0]450dendrogram(linkage_matrix,451labels=[s.replace('Ecoli', '') for s in strains],452ax=ax3, orientation='left', leaf_font_size=10)453ax3.set_title('Horizontal Dendrogram View')454ax3.set_xlabel('Evolutionary Distance')455456# Distance distribution analysis457ax4 = axes[1, 1]458# Extract all pairwise distances (excluding diagonal)459all_distances = distance_matrix[np.triu_indices_from(distance_matrix, k=1)]460461ax4.hist(all_distances, bins=15, alpha=0.7, color='lightcoral',462edgecolor='black', density=True)463ax4.axvline(all_distances.mean(), color='red', linestyle='--', linewidth=2,464label=f'Mean: {all_distances.mean():.4f}')465ax4.axvline(np.median(all_distances), color='blue', linestyle='--', linewidth=2,466label=f'Median: {np.median(all_distances):.4f}')467ax4.set_xlabel('Evolutionary Distance')468ax4.set_ylabel('Density')469ax4.set_title('Distribution of Pairwise Distances')470ax4.legend()471ax4.grid(True, alpha=0.3)472473plt.tight_layout()474plt.savefig('figures/phylogenetic_analysis.pdf', dpi=300, bbox_inches='tight')475plt.close()476477# Print summary statistics478print(f"\nPhylogenetic Analysis Summary:")479print(f"Number of strains analyzed: {len(strains)}")480print(f"Mean pairwise distance: {all_distances.mean():.4f}")481print(f"Standard deviation: {all_distances.std():.4f}")482print(f"Minimum distance: {all_distances.min():.4f}")483print(f"Maximum distance: {all_distances.max():.4f}")484\end{pycode}485486\begin{figure}[H]487\centering488\includegraphics[width=0.95\textwidth]{figures/phylogenetic_analysis.pdf}489\caption{Phylogenetic analysis of \species{E. coli} strain relationships using distance-based methods. (Top left) UPGMA phylogenetic tree showing evolutionary relationships and clustering patterns. (Top right) Symmetric distance matrix heatmap revealing pairwise evolutionary distances. (Bottom left) Horizontal dendrogram view for detailed examination of clustering hierarchy. (Bottom right) Distribution of pairwise evolutionary distances with statistical measures.}490\label{fig:phylogenetic_analysis}491\end{figure}492493\subsection{Comparative Genomics and Functional Analysis}494495We analyze genomic features and simulate gene content analysis across strains:496497\begin{pycode}498# Simulate gene content and functional analysis499np.random.seed(42)500501# Generate synthetic gene presence/absence data502gene_families = [503'CoreMetabolism', 'DNArepair', 'CellWall', 'Transport', 'Regulation',504'Pathogenicity', 'AntibioticResistance', 'MobileElements', 'StressResponse',505'SecretionSystems', 'Chemotaxis', 'FlagellarBiosynthesis'506]507508# Simulate gene presence (1) or absence (0) for each strain and gene family509gene_matrix = np.random.binomial(1, 0.7, size=(len(strains), len(gene_families)))510511# Ensure core genes are present in all strains512core_genes = ['CoreMetabolism', 'DNArepair', 'CellWall']513for i, gene_family in enumerate(gene_families):514if gene_family in core_genes:515gene_matrix[:, i] = 1516517# Create gene content DataFrame518gene_content_df = pd.DataFrame(gene_matrix,519index=strains,520columns=gene_families)521522print("Gene family presence/absence matrix:")523print(gene_content_df.to_string(max_cols=8, max_colwidth=15))524525# Calculate gene family conservation526conservation_scores = gene_content_df.sum(axis=0) / len(strains)527print(f"\nGene family conservation scores:")528for gene_family, score in conservation_scores.items():529print(f"{gene_family}: {score:.2f}")530531# Calculate strain-specific gene diversity532strain_diversity = gene_content_df.sum(axis=1)533print(f"\nStrain gene family diversity:")534for strain, diversity in strain_diversity.items():535print(f"{strain}: {diversity} gene families")536\end{pycode}537538\subsection{Gene Content and Functional Diversity Visualization}539540\Cref{fig:gene_content_analysis} illustrates the distribution of gene families across \species{E. coli} strains and functional diversity patterns.541542\begin{pycode}543# Create gene content and functional analysis visualization544fig, axes = plt.subplots(2, 2, figsize=(15, 10))545fig.suptitle('Gene Content and Functional Diversity Analysis',546fontsize=16, fontweight='bold')547548# Gene presence/absence heatmap549ax1 = axes[0, 0]550sns.heatmap(gene_content_df.T, cmap='RdYlGn', cbar_kws={'label': 'Gene Presence'},551xticklabels=[s.replace('Ecoli', '') for s in strains],552yticklabels=gene_families, ax=ax1)553ax1.set_title('Gene Family Presence/Absence Matrix')554ax1.set_xlabel('E. coli Strains')555ax1.set_ylabel('Gene Families')556557# Gene family conservation bar plot558ax2 = axes[0, 1]559bars = ax2.bar(range(len(gene_families)), conservation_scores.values,560color=plt.cm.viridis(conservation_scores.values))561ax2.set_xlabel('Gene Families')562ax2.set_ylabel('Conservation Score')563ax2.set_title('Gene Family Conservation Across Strains')564ax2.set_xticks(range(len(gene_families)))565ax2.set_xticklabels(gene_families, rotation=45, ha='right')566ax2.grid(True, alpha=0.3)567568# Add conservation threshold line569ax2.axhline(y=0.8, color='red', linestyle='--', alpha=0.7,570label='High conservation threshold')571ax2.legend()572573# Strain gene diversity574ax3 = axes[1, 0]575diversity_colors = plt.cm.Set3(np.linspace(0, 1, len(strains)))576bars = ax3.bar(range(len(strains)), strain_diversity.values, color=diversity_colors)577ax3.set_xlabel('E. coli Strains')578ax3.set_ylabel('Number of Gene Families')579ax3.set_title('Gene Family Diversity by Strain')580ax3.set_xticks(range(len(strains)))581ax3.set_xticklabels([s.replace('E_coli_', '') for s in strains], rotation=45)582ax3.grid(True, alpha=0.3)583584# Gene content clustering585ax4 = axes[1, 1]586# Calculate Jaccard distances for gene content587from scipy.spatial.distance import pdist, squareform588from scipy.cluster.hierarchy import dendrogram, linkage589590gene_distances = pdist(gene_matrix, metric='jaccard')591gene_linkage = linkage(gene_distances, method='average')592593dendrogram(gene_linkage,594labels=[s.replace('Ecoli', '') for s in strains],595ax=ax4, leaf_rotation=90, leaf_font_size=10)596ax4.set_title('Gene Content-Based Clustering')597ax4.set_xlabel('E. coli Strains')598ax4.set_ylabel('Jaccard Distance')599600plt.tight_layout()601plt.savefig('figures/gene_content_analysis.pdf', dpi=300, bbox_inches='tight')602plt.close()603\end{pycode}604605\begin{figure}[H]606\centering607\includegraphics[width=0.95\textwidth]{figures/gene_content_analysis.pdf}608\caption{Gene content and functional diversity analysis across \species{E. coli} strains. (Top left) Gene family presence/absence heatmap showing strain-specific gene content patterns. (Top right) Conservation scores for different gene families, with core metabolic functions showing highest conservation. (Bottom left) Gene family diversity by strain, indicating variable gene content across strains. (Bottom right) Gene content-based clustering using Jaccard distances, revealing functional similarity patterns independent of phylogenetic relationships.}609\label{fig:gene_content_analysis}610\end{figure}611612%=============================================================================613% SECTION 4: DISCUSSION614%=============================================================================615\section{Discussion}616\label{sec:discussion}617618\subsection{Genomic Diversity and Evolutionary Patterns}619620Our analysis reveals significant genomic diversity among \species{E. coli} strains, with pairwise evolutionary distances ranging from \py{f"{all_distances.min():.4f}"} to \py{f"{all_distances.max():.4f}"}. This diversity reflects the adaptive potential and evolutionary flexibility of \species{E. coli} in diverse environments \cite{touchon2009organised}.621622The phylogenetic reconstruction using UPGMA clustering provides insights into strain relationships, though real-world analyses would benefit from more sophisticated methods such as maximum likelihood or Bayesian approaches. The observed clustering patterns suggest both clonal evolution and horizontal gene transfer events, consistent with bacterial evolutionary mechanisms.623624\subsection{Gene Content Variation and Functional Implications}625626The gene content analysis reveals important patterns in functional diversity:627628\begin{enumerate}629\item \textbf{Core genome conservation}: Essential functions like metabolism and DNA repair show universal presence, supporting their fundamental importance.630631\item \textbf{Accessory genome variation}: Pathogenicity, antibiotic resistance, and mobile elements show variable presence, reflecting niche-specific adaptations.632633\item \textbf{Strain-specific profiles}: Different strains exhibit distinct gene content signatures, with diversity scores ranging from \py{f"{strain_diversity.min()}"} to \py{f"{strain_diversity.max()}"} gene families.634\end{enumerate}635636\subsection{Methodological Considerations and CoCalc Integration}637638This template demonstrates several advantages of computational genomics in CoCalc:639640\begin{itemize}641\item \textbf{Reproducible workflows}: All analyses are embedded within the document, ensuring reproducibility across different environments.642643\item \textbf{Real-time collaboration}: Multiple researchers can simultaneously work on different aspects of the analysis.644645\item \textbf{Integrated visualization}: Figures are generated directly from analysis code, maintaining consistency between data and presentation.646647\item \textbf{Version control}: CoCalc's TimeTravel feature enables tracking of analysis evolution and collaborative contributions.648\end{itemize}649650\subsection{Future Directions and Extensions}651652This template provides a foundation for more sophisticated genomic analyses:653654\begin{enumerate}655\item \textbf{Real sequence data}: Integration with NCBI databases for authentic genomic sequences656\item \textbf{Advanced phylogenetics}: Implementation of maximum likelihood and Bayesian methods657\item \textbf{Functional annotation}: Integration with COG, KEGG, and GO databases658\item \textbf{Comparative genomics}: Synteny analysis and genome rearrangement detection659\item \textbf{Population genomics}: SNP analysis and population structure assessment660\end{enumerate}661662%=============================================================================663% SECTION 5: CONCLUSIONS664%=============================================================================665\section{Conclusions}666\label{sec:conclusions}667668This bioinformatics template demonstrates the power of integrating computational genomics with professional scientific writing in CoCalc. The combination of BioPython for sequence analysis, statistical modeling for phylogenetics, and automated visualization creates a comprehensive workflow for genomic research.669670Key contributions include:671672\begin{itemize}673\item Reproducible genomic analysis workflows with live code execution674\item Comprehensive visualization of phylogenetic and functional diversity675\item Integration of multiple bioinformatics approaches within a single document676\item Collaborative framework supporting team-based genomic research677\item Flexible foundation adaptable to various genomic research questions678\end{itemize}679680The template serves as a starting point for researchers in comparative genomics, microbial ecology, and evolutionary biology, providing both methodological guidance and practical implementation examples optimized for CoCalc's unique collaborative environment.681682%=============================================================================683% ACKNOWLEDGMENTS684%=============================================================================685\section*{Acknowledgments}686687We thank the BioPython development team for creating essential tools for computational biology. We acknowledge NCBI for providing comprehensive genomic databases and CoCalc for enabling collaborative bioinformatics research workflows.688689%=============================================================================690% REFERENCES691%=============================================================================692\printbibliography693694\end{document}695696