Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
probml
GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/book1/06/seq_logo_demo.ipynb
1192 views
Kernel: Python 3
# DNA Sequence Demonstration # Author: Drishtii@, mu-arkhipov@ # Based on # https://github.com/probml/pmtk3/blob/master/demos/seqlogoDemo.m # See https://github.com/probml/pml-book/issues/581 ##!pip install -qq logomaker import jax.numpy as jnp import os try: import pandas as pd except ModuleNotFoundError: %pip install -qq pandas import pandas as pd try: import probml_utils as pml except ModuleNotFoundError: %pip install -qq git+https://github.com/probml/probml-utils.git import probml_utils as pml try: import logomaker except ModuleNotFoundError: %pip install -qq logomaker import logomaker import matplotlib.pyplot as plt os.environ["FIG_DIR"] = "/Users/kpmurphy/github/bookv2/figures" seqs = [ "atagccggtacggca", "ttagctgcaaccgca", "tcagccactagagca", "ataaccgcgaccgca", "ttagccgctaaggta", "taagcctcgtacgta", "ttagccgttacggcc", "atatccggtacagta", "atagcaggtaccgaa", "acatccgtgacggaa", ] seq_len = len(seqs[0]) letter_to_cat = {"a": 0, "c": 1, "g": 2, "t": 3} count_mat = jnp.zeros([seq_len, len(letter_to_cat)]) for seq in seqs: for n_pos, letter in enumerate(seq): count_mat = count_mat.at[n_pos, letter_to_cat[letter]].add(1) prob_mat = count_mat / jnp.sum(count_mat, 1, keepdims=True) # prob_mat = prob_mat.at[0].set([0.25, 0.25, 0.25, 0.25]) # for debugging
pos_height_mat = prob_mat df = pd.DataFrame(pos_height_mat, columns=["A", "C", "G", "T"]) df.index = jnp.arange(1, len(df) + 1) logos = logomaker.Logo(df) logos.ax.set_xticks(jnp.arange(1, 16)) logos.ax.set_yticks(jnp.linspace(0, 1, 3)) logos.ax.set_ylabel("Probability") logos.ax.set_xlabel("Sequence Position") pml.savefig("seqlogo_unscaled") plt.show()
saving image to /Users/kpmurphy/github/bookv2/figures/seqlogo_unscaled.pdf Figure size: [10. 2.5]
Image in a Jupyter notebook
entropy = jnp.zeros(seq_len) for n_pos in range(seq_len): entropy = entropy.at[n_pos].set(-sum(p * jnp.log2(p) for p in prob_mat[n_pos, :] if p > 0)) # entropy[n_pos] = 2 # https://en.wikipedia.org/wiki/Sequence_logo#Consensus_logo n = len(seqs) e = 0 # (1/jnp.log(2)) * (4-1)/(2*n) # small sample correction pos_height_mat = prob_mat * (2 - entropy.reshape(-1, 1) + e) # position df = pd.DataFrame(pos_height_mat, columns=["A", "C", "G", "T"]) df.index = jnp.arange(1, len(df) + 1) logos = logomaker.Logo(df) logos.ax.set_xticks(jnp.arange(1, 16)) logos.ax.set_yticks(jnp.linspace(0, 2, 3)) logos.ax.set_ylabel("Bits") logos.ax.set_xlabel("Sequence Position") pml.savefig("seqlogo_scaled") plt.show()
saving image to /Users/kpmurphy/github/bookv2/figures/seqlogo_scaled.pdf Figure size: [10. 2.5]
Image in a Jupyter notebook