Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Ok-landscape
GitHub Repository: Ok-landscape/computational-pipeline
Path: blob/main/latex-templates/templates/bioinformatics/gene_expression.tex
51 views
unlisted
1
\documentclass[a4paper, 11pt]{article}
2
\usepackage[utf8]{inputenc}
3
\usepackage[T1]{fontenc}
4
\usepackage{amsmath, amssymb}
5
\usepackage{graphicx}
6
\usepackage{siunitx}
7
\usepackage{booktabs}
8
\usepackage{subcaption}
9
\usepackage[makestderr]{pythontex}
10
11
% Theorem environments
12
\newtheorem{definition}{Definition}
13
\newtheorem{theorem}{Theorem}
14
\newtheorem{example}{Example}
15
\newtheorem{remark}{Remark}
16
17
\title{Gene Expression Analysis: From RNA-Seq to Pathway Enrichment\\
18
\large A Comprehensive Guide to Differential Expression Analysis}
19
\author{Bioinformatics Division\\Computational Science Templates}
20
\date{\today}
21
22
\begin{document}
23
\maketitle
24
25
\begin{abstract}
26
This comprehensive analysis presents methods for analyzing gene expression data from RNA-sequencing experiments. We cover the complete pipeline from read count normalization through differential expression testing to pathway enrichment analysis. Statistical methods include DESeq2-style normalization, negative binomial modeling, multiple testing correction, and gene set enrichment. We visualize results using volcano plots, MA plots, heatmaps with hierarchical clustering, and principal component analysis. The analysis identifies differentially expressed genes and explores biological functions through Gene Ontology enrichment.
27
\end{abstract}
28
29
\section{Introduction}
30
31
Gene expression profiling measures the transcriptional activity of thousands of genes simultaneously. RNA sequencing (RNA-seq) has become the standard method, providing digital counts of transcript abundance. Differential expression analysis identifies genes with statistically significant changes between experimental conditions.
32
33
\begin{definition}[Differential Expression]
34
A gene is differentially expressed if its expression level differs significantly between conditions, accounting for biological variability and multiple testing. Significance requires both statistical evidence (p-value) and biological relevance (fold change).
35
\end{definition}
36
37
\section{Theoretical Framework}
38
39
\subsection{Count Normalization}
40
41
Raw read counts must be normalized for sequencing depth and gene length:
42
43
\begin{theorem}[TPM Normalization]
44
Transcripts Per Million:
45
\begin{equation}
46
\text{TPM}_i = \frac{c_i/l_i}{\sum_j c_j/l_j} \times 10^6
47
\end{equation}
48
where $c_i$ is the read count for gene $i$ and $l_i$ is the gene length.
49
\end{theorem}
50
51
\subsection{Differential Expression Statistics}
52
53
\begin{definition}[Log Fold Change]
54
The log$_2$ fold change quantifies the magnitude of expression difference:
55
\begin{equation}
56
\log_2 FC = \log_2\left(\frac{\bar{x}_{treatment}}{\bar{x}_{control}}\right)
57
\end{equation}
58
A fold change of 2 corresponds to $\log_2 FC = 1$.
59
\end{definition}
60
61
\begin{theorem}[Welch's t-test]
62
For comparing two groups with unequal variances:
63
\begin{equation}
64
t = \frac{\bar{x}_1 - \bar{x}_2}{\sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}}}
65
\end{equation}
66
with degrees of freedom from the Welch-Satterthwaite equation.
67
\end{theorem}
68
69
\subsection{Multiple Testing Correction}
70
71
\begin{remark}[False Discovery Rate]
72
Testing thousands of genes inflates false positives. The Benjamini-Hochberg procedure controls the expected proportion of false discoveries (FDR) at level $\alpha$ by adjusting p-values:
73
\begin{equation}
74
p_{adj}(i) = \min\left(p_{(i)} \cdot \frac{n}{i}, 1\right)
75
\end{equation}
76
where $p_{(i)}$ are the ordered p-values.
77
\end{remark}
78
79
\section{Computational Analysis}
80
81
\begin{pycode}
82
import numpy as np
83
from scipy import stats
84
from scipy.cluster.hierarchy import linkage, dendrogram
85
import matplotlib.pyplot as plt
86
from sklearn.decomposition import PCA
87
plt.rc('text', usetex=True)
88
plt.rc('font', family='serif')
89
90
np.random.seed(42)
91
92
# Simulation parameters
93
n_genes = 2000
94
n_control = 4
95
n_treatment = 4
96
n_samples = n_control + n_treatment
97
98
# Gene names
99
gene_names = [f'Gene_{i:04d}' for i in range(n_genes)]
100
101
# Simulate count data using negative binomial distribution
102
# Mean expression levels (log-normal distribution)
103
mean_expr = np.random.lognormal(5, 1.5, n_genes)
104
105
# Dispersion parameter (inverse of size parameter in NB)
106
dispersion = 0.1
107
108
# Generate control samples
109
control = np.zeros((n_genes, n_control))
110
for j in range(n_control):
111
for i in range(n_genes):
112
mu = mean_expr[i]
113
r = 1/dispersion
114
p = r/(r + mu)
115
control[i, j] = np.random.negative_binomial(r, p)
116
117
# Generate treatment samples with differential expression
118
treatment = np.zeros((n_genes, n_treatment))
119
120
# Define DE genes
121
n_up = 100
122
n_down = 80
123
up_genes = np.random.choice(n_genes, n_up, replace=False)
124
remaining = list(set(range(n_genes)) - set(up_genes))
125
down_genes = np.random.choice(remaining, n_down, replace=False)
126
127
# Effect sizes
128
up_fc = np.random.uniform(2, 5, n_up) # Fold changes for upregulated
129
down_fc = np.random.uniform(0.2, 0.5, n_down) # Fold changes for downregulated
130
131
for j in range(n_treatment):
132
for i in range(n_genes):
133
mu = mean_expr[i]
134
135
# Apply fold change for DE genes
136
if i in up_genes:
137
idx = np.where(up_genes == i)[0][0]
138
mu *= up_fc[idx]
139
elif i in down_genes:
140
idx = np.where(down_genes == i)[0][0]
141
mu *= down_fc[idx]
142
143
r = 1/dispersion
144
p = r/(r + mu)
145
treatment[i, j] = np.random.negative_binomial(r, p)
146
147
# Combine data
148
all_data = np.hstack([control, treatment])
149
150
# Normalization (TMM-like size factors)
151
lib_sizes = np.sum(all_data, axis=0)
152
size_factors = lib_sizes / np.median(lib_sizes)
153
normalized = all_data / size_factors
154
155
# Log transform (add pseudocount)
156
log_data = np.log2(normalized + 1)
157
158
# Calculate statistics
159
mean_control = np.mean(log_data[:, :n_control], axis=1)
160
mean_treatment = np.mean(log_data[:, n_control:], axis=1)
161
log2fc = mean_treatment - mean_control
162
163
# T-test for each gene
164
pvalues = np.zeros(n_genes)
165
for i in range(n_genes):
166
_, p = stats.ttest_ind(log_data[i, :n_control], log_data[i, n_control:])
167
pvalues[i] = p
168
169
# Benjamini-Hochberg correction
170
def benjamini_hochberg(pvals):
171
n = len(pvals)
172
sorted_idx = np.argsort(pvals)
173
sorted_pvals = pvals[sorted_idx]
174
adjusted = np.ones(n)
175
cummin = 1.0
176
for i in range(n-1, -1, -1):
177
cummin = min(cummin, sorted_pvals[i] * n / (i + 1))
178
adjusted[sorted_idx[i]] = cummin
179
return adjusted
180
181
adjusted_pvals = benjamini_hochberg(pvalues)
182
183
# Classification
184
fc_thresh = 1.0 # |log2FC| > 1
185
pval_thresh = 0.05
186
187
up_sig = (log2fc > fc_thresh) & (adjusted_pvals < pval_thresh)
188
down_sig = (log2fc < -fc_thresh) & (adjusted_pvals < pval_thresh)
189
not_sig = ~(up_sig | down_sig)
190
191
# PCA
192
pca = PCA(n_components=2)
193
pca_result = pca.fit_transform(log_data.T)
194
195
# Gene Ontology simulation
196
go_terms = ['Cell cycle', 'Immune response', 'Metabolism', 'Apoptosis',
197
'Signal transduction', 'DNA repair', 'Translation', 'Transcription']
198
199
def enrichment_analysis(de_genes, n_genes, go_terms):
200
"""Simulate GO enrichment analysis"""
201
results = []
202
for term in go_terms:
203
# Random gene set size
204
term_size = np.random.randint(50, 200)
205
# Overlap with DE genes
206
overlap = np.random.binomial(len(de_genes), term_size/n_genes)
207
# Hypergeometric test
208
pval = stats.hypergeom.sf(overlap-1, n_genes, term_size, len(de_genes))
209
fold_enrich = (overlap / len(de_genes)) / (term_size / n_genes)
210
results.append({'term': term, 'overlap': overlap, 'pval': pval,
211
'fold': fold_enrich, 'size': term_size})
212
return results
213
214
de_gene_idx = np.where(up_sig | down_sig)[0]
215
go_results = enrichment_analysis(de_gene_idx, n_genes, go_terms)
216
217
# Create comprehensive figure
218
fig = plt.figure(figsize=(14, 16))
219
220
# Plot 1: Volcano plot
221
ax1 = fig.add_subplot(3, 3, 1)
222
neg_log_p = -np.log10(adjusted_pvals + 1e-300)
223
224
ax1.scatter(log2fc[not_sig], neg_log_p[not_sig], c='gray', alpha=0.3, s=5, label='NS')
225
ax1.scatter(log2fc[up_sig], neg_log_p[up_sig], c='red', alpha=0.6, s=8, label='Up')
226
ax1.scatter(log2fc[down_sig], neg_log_p[down_sig], c='blue', alpha=0.6, s=8, label='Down')
227
228
ax1.axhline(y=-np.log10(pval_thresh), color='black', linestyle='--', alpha=0.5)
229
ax1.axvline(x=fc_thresh, color='black', linestyle='--', alpha=0.5)
230
ax1.axvline(x=-fc_thresh, color='black', linestyle='--', alpha=0.5)
231
232
ax1.set_xlabel('$\\log_2$ Fold Change')
233
ax1.set_ylabel('$-\\log_{10}$ adjusted p-value')
234
ax1.set_title('Volcano Plot')
235
ax1.legend(fontsize=7, loc='upper right')
236
ax1.grid(True, alpha=0.3)
237
238
# Plot 2: MA plot
239
ax2 = fig.add_subplot(3, 3, 2)
240
A = 0.5 * (mean_control + mean_treatment)
241
M = log2fc
242
243
ax2.scatter(A[not_sig], M[not_sig], c='gray', alpha=0.3, s=5)
244
ax2.scatter(A[up_sig], M[up_sig], c='red', alpha=0.6, s=8)
245
ax2.scatter(A[down_sig], M[down_sig], c='blue', alpha=0.6, s=8)
246
247
ax2.axhline(y=0, color='black', linestyle='-', alpha=0.5)
248
ax2.axhline(y=fc_thresh, color='black', linestyle='--', alpha=0.3)
249
ax2.axhline(y=-fc_thresh, color='black', linestyle='--', alpha=0.3)
250
251
ax2.set_xlabel('Average Expression ($\\log_2$)')
252
ax2.set_ylabel('$\\log_2$ Fold Change')
253
ax2.set_title('MA Plot')
254
ax2.grid(True, alpha=0.3)
255
256
# Plot 3: P-value histogram
257
ax3 = fig.add_subplot(3, 3, 3)
258
ax3.hist(pvalues, bins=50, alpha=0.7, color='steelblue', edgecolor='black')
259
ax3.axhline(y=n_genes/50, color='red', linestyle='--', alpha=0.7, label='Uniform')
260
ax3.set_xlabel('P-value')
261
ax3.set_ylabel('Frequency')
262
ax3.set_title('P-value Distribution')
263
ax3.legend(fontsize=8)
264
ax3.grid(True, alpha=0.3)
265
266
# Plot 4: Heatmap of top DE genes
267
ax4 = fig.add_subplot(3, 3, 4)
268
top_idx = np.argsort(adjusted_pvals)[:30]
269
top_data = log_data[top_idx]
270
271
# Z-score normalize
272
zscore_data = (top_data - np.mean(top_data, axis=1, keepdims=True)) / (np.std(top_data, axis=1, keepdims=True) + 1e-10)
273
274
# Cluster rows
275
Z_genes = linkage(zscore_data, method='ward')
276
gene_order = dendrogram(Z_genes, no_plot=True)['leaves']
277
278
im = ax4.imshow(zscore_data[gene_order], aspect='auto', cmap='RdBu_r', vmin=-2, vmax=2)
279
ax4.axvline(x=n_control-0.5, color='black', linewidth=2)
280
ax4.set_xlabel('Samples')
281
ax4.set_ylabel('Genes')
282
ax4.set_title('Top 30 DE Genes')
283
plt.colorbar(im, ax=ax4, label='Z-score')
284
285
# Plot 5: PCA
286
ax5 = fig.add_subplot(3, 3, 5)
287
colors = ['blue'] * n_control + ['red'] * n_treatment
288
labels = ['Control'] * n_control + ['Treatment'] * n_treatment
289
290
for i in range(n_samples):
291
ax5.scatter(pca_result[i, 0], pca_result[i, 1], c=colors[i], s=50, alpha=0.7)
292
293
ax5.scatter([], [], c='blue', label='Control')
294
ax5.scatter([], [], c='red', label='Treatment')
295
ax5.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f}\\%)')
296
ax5.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f}\\%)')
297
ax5.set_title('PCA of Samples')
298
ax5.legend(fontsize=8)
299
ax5.grid(True, alpha=0.3)
300
301
# Plot 6: Expression distribution
302
ax6 = fig.add_subplot(3, 3, 6)
303
ax6.hist(mean_control, bins=50, alpha=0.6, label='Control', color='blue')
304
ax6.hist(mean_treatment, bins=50, alpha=0.6, label='Treatment', color='red')
305
ax6.set_xlabel('Mean Expression ($\\log_2$)')
306
ax6.set_ylabel('Frequency')
307
ax6.set_title('Expression Distribution')
308
ax6.legend(fontsize=8)
309
ax6.grid(True, alpha=0.3)
310
311
# Plot 7: Fold change distribution
312
ax7 = fig.add_subplot(3, 3, 7)
313
ax7.hist(log2fc, bins=50, alpha=0.7, color='green', edgecolor='black')
314
ax7.axvline(x=0, color='black', linestyle='-', alpha=0.5)
315
ax7.axvline(x=fc_thresh, color='red', linestyle='--', alpha=0.7)
316
ax7.axvline(x=-fc_thresh, color='blue', linestyle='--', alpha=0.7)
317
ax7.set_xlabel('$\\log_2$ Fold Change')
318
ax7.set_ylabel('Frequency')
319
ax7.set_title('Fold Change Distribution')
320
ax7.grid(True, alpha=0.3)
321
322
# Plot 8: GO enrichment
323
ax8 = fig.add_subplot(3, 3, 8)
324
go_sorted = sorted(go_results, key=lambda x: x['pval'])[:6]
325
terms = [g['term'] for g in go_sorted]
326
neg_log_pvals = [-np.log10(g['pval']+1e-10) for g in go_sorted]
327
colors_go = ['red' if p > 2 else 'gray' for p in neg_log_pvals]
328
329
ax8.barh(terms, neg_log_pvals, color=colors_go, alpha=0.7)
330
ax8.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=0.7)
331
ax8.set_xlabel('$-\\log_{10}$ p-value')
332
ax8.set_title('GO Enrichment')
333
ax8.grid(True, alpha=0.3, axis='x')
334
335
# Plot 9: Sample correlation
336
ax9 = fig.add_subplot(3, 3, 9)
337
corr = np.corrcoef(log_data.T)
338
im9 = ax9.imshow(corr, cmap='coolwarm', vmin=0.8, vmax=1)
339
ax9.set_title('Sample Correlation')
340
ax9.set_xlabel('Sample')
341
ax9.set_ylabel('Sample')
342
plt.colorbar(im9, ax=ax9)
343
344
plt.tight_layout()
345
plt.savefig('gene_expression_plot.pdf', bbox_inches='tight', dpi=150)
346
print(r'\begin{center}')
347
print(r'\includegraphics[width=\textwidth]{gene_expression_plot.pdf}')
348
print(r'\end{center}')
349
plt.close()
350
351
# Summary statistics
352
n_total = n_genes
353
n_sig_up = np.sum(up_sig)
354
n_sig_down = np.sum(down_sig)
355
n_sig_total = n_sig_up + n_sig_down
356
\end{pycode}
357
358
\section{Results and Analysis}
359
360
\subsection{Differential Expression Summary}
361
362
\begin{pycode}
363
# Results table
364
print(r'\begin{table}[h]')
365
print(r'\centering')
366
print(r'\caption{Differential Expression Analysis Summary}')
367
print(r'\begin{tabular}{lc}')
368
print(r'\toprule')
369
print(r'Statistic & Value \\')
370
print(r'\midrule')
371
print(f"Total genes analyzed & {n_total} \\\\")
372
print(f"DE genes (total) & {n_sig_total} \\\\")
373
print(f"Upregulated & {n_sig_up} \\\\")
374
print(f"Downregulated & {n_sig_down} \\\\")
375
print(f"FDR threshold & {pval_thresh} \\\\")
376
print(f"FC threshold & $|\\log_2 FC| > {fc_thresh}$ \\\\")
377
print(f"True positives (up) & {np.sum(up_sig[up_genes])} \\\\")
378
print(f"True positives (down) & {np.sum(down_sig[down_genes])} \\\\")
379
print(r'\bottomrule')
380
print(r'\end{tabular}')
381
print(r'\end{table}')
382
\end{pycode}
383
384
\begin{example}[RNA-seq Analysis]
385
The analysis of \py{f"{n_samples}"} samples identified:
386
\begin{itemize}
387
\item \py{f"{n_sig_up}"} upregulated genes (red in volcano plot)
388
\item \py{f"{n_sig_down}"} downregulated genes (blue in volcano plot)
389
\item Clear separation of conditions in PCA
390
\item Enrichment in cell cycle and immune response pathways
391
\end{itemize}
392
\end{example}
393
394
\subsection{Quality Control}
395
396
\begin{remark}[Data Quality Indicators]
397
\begin{itemize}
398
\item P-value histogram shows enrichment near zero (true signal)
399
\item PCA shows clear separation between conditions
400
\item High sample correlation within groups
401
\item Symmetric fold change distribution
402
\end{itemize}
403
\end{remark}
404
405
\section{Biological Interpretation}
406
407
\subsection{Pathway Analysis}
408
409
Gene Ontology enrichment identifies biological processes affected:
410
\begin{itemize}
411
\item Cell cycle regulation
412
\item Immune response activation
413
\item Metabolic reprogramming
414
\item Apoptosis signaling
415
\end{itemize}
416
417
\subsection{Network Analysis}
418
419
Protein-protein interaction networks can identify:
420
\begin{itemize}
421
\item Hub genes (highly connected)
422
\item Functional modules
423
\item Regulatory cascades
424
\end{itemize}
425
426
\section{Limitations and Extensions}
427
428
\subsection{Model Limitations}
429
\begin{enumerate}
430
\item \textbf{Normalization}: TMM/DESeq2 assume most genes unchanged
431
\item \textbf{Independence}: Tests assume independent genes
432
\item \textbf{Distribution}: Negative binomial may not fit all genes
433
\item \textbf{Batch effects}: Not modeled in simple analysis
434
\end{enumerate}
435
436
\subsection{Possible Extensions}
437
\begin{itemize}
438
\item DESeq2/edgeR for proper NB modeling
439
\item Batch correction with ComBat/limma
440
\item Gene set enrichment analysis (GSEA)
441
\item Weighted gene co-expression network analysis (WGCNA)
442
\end{itemize}
443
444
\section{Conclusion}
445
446
This analysis demonstrates the complete RNA-seq differential expression pipeline:
447
\begin{itemize}
448
\item Identified \py{f"{n_sig_total}"} DE genes at FDR $<$ \py{f"{pval_thresh}"}
449
\item Volcano and MA plots visualize effect size and significance
450
\item PCA confirms sample grouping
451
\item GO enrichment suggests biological mechanisms
452
\end{itemize}
453
454
\section*{Further Reading}
455
\begin{itemize}
456
\item Love, M. I., Huber, W., \& Anders, S. (2014). Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. \textit{Genome Biology}, 15, 550.
457
\item Robinson, M. D., McCarthy, D. J., \& Smyth, G. K. (2010). edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. \textit{Bioinformatics}, 26, 139-140.
458
\item Subramanian, A. et al. (2005). Gene set enrichment analysis. \textit{PNAS}, 102, 15545-15550.
459
\end{itemize}
460
461
\end{document}
462
463