Path: blob/master/sage/coding/source_coding/huffman.py
4038 views
r"""1Huffman Encoding23This module implements functionalities relating to Huffman encoding and4decoding.56AUTHOR:78- Nathann Cohen (2010-05): initial version.91011Classes and functions12=====================13"""1415###########################################################################16# Copyright (c) 2010 Nathann Cohen <[email protected]>17#18# This program is free software; you can redistribute it and/or modify19# it under the terms of the GNU General Public License as published by20# the Free Software Foundation; either version 2 of the License, or21# (at your option) any later version.22#23# This program is distributed in the hope that it will be useful,24# but WITHOUT ANY WARRANTY; without even the implied warranty of25# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the26# GNU General Public License for more details.27#28# http://www.gnu.org/licenses/29###########################################################################3031from sage.structure.sage_object import SageObject3233###########################################################################34#35# Helper functions36#37###########################################################################3839def frequency_table(string):40r"""41Return the frequency table corresponding to the given string.4243INPUT:4445- ``string`` -- a string of symbols over some alphabet.4647OUTPUT:4849- A table of frequency of each unique symbol in ``string``. If ``string``50is an empty string, return an empty table.5152EXAMPLES:5354The frequency table of a non-empty string::5556sage: from sage.coding.source_coding.huffman import frequency_table57sage: str = "Stop counting my characters!"58sage: T = sorted(frequency_table(str).items())59sage: for symbol, code in T:60... print symbol, code61...62363! 164S 165a 266c 367e 168g 169h 170i 171m 172n 273o 274p 175r 276s 177t 378u 179y 18081The frequency of an empty string::8283sage: frequency_table("")84{}85"""86d = {}87for s in string:88d[s] = d.get(s, 0) + 189return d9091class Huffman(SageObject):92r"""93This class implements the basic functionalities of Huffman codes.9495It can build a Huffman code from a given string, or from the information96of a dictionary associating to each key (the elements of the alphabet) a97weight (most of the time, a probability value or a number of occurrences).9899INPUT:100101- ``source`` -- can be either102103- A string from which the Huffman encoding should be created.104105- A dictionary that associates to each symbol of an alphabet a numeric106value. If we consider the frequency of each alphabetic symbol, then107``source`` is considered as the frequency table of the alphabet with108each numeric (non-negative integer) value being the number of109occurrences of a symbol. The numeric values can also represent weights110of the symbols. In that case, the numeric values are not necessarily111integers, but can be real numbers.112113In order to construct a Huffman code for an alphabet, we use exactly one of114the following methods:115116#. Let ``source`` be a string of symbols over an alphabet and feed117``source`` to the constructor of this class. Based on the input string, a118frequency table is constructed that contains the frequency of each unique119symbol in ``source``. The alphabet in question is then all the unique120symbols in ``source``. A significant implication of this is that any121subsequent string that we want to encode must contain only symbols that122can be found in ``source``.123124#. Let ``source`` be the frequency table of an alphabet. We can feed this125table to the constructor of this class. The table ``source`` can be a126table of frequencies or a table of weights.127128Examples::129130sage: from sage.coding.source_coding.huffman import Huffman, frequency_table131sage: h1 = Huffman("There once was a french fry")132sage: for letter, code in h1.encoding_table().iteritems():133... print "'"+ letter + "' : " + code134'a' : 0111135' ' : 00136'c' : 1010137'e' : 100138'f' : 1011139'h' : 1100140'o' : 11100141'n' : 1101142's' : 11101143'r' : 010144'T' : 11110145'w' : 11111146'y' : 0110147148We can obtain the same result by "training" the Huffman code with the149following table of frequency::150151sage: ft = frequency_table("There once was a french fry"); ft152{'a': 2, ' ': 5, 'c': 2, 'e': 4, 'f': 2, 'h': 2, 'o': 1, 'n': 2, 's': 1, 'r': 3, 'T': 1, 'w': 1, 'y': 1}153sage: h2 = Huffman(ft)154155Once ``h1`` has been trained, and hence possesses an encoding table,156it is possible to obtain the Huffman encoding of any string157(possibly the same) using this code::158159sage: encoded = h1.encode("There once was a french fry"); encoded160'11110110010001010000111001101101010000111110111111010001110010110101001101101011000010110100110'161162We can decode the above encoded string in the following way::163164sage: h1.decode(encoded)165'There once was a french fry'166167Obviously, if we try to decode a string using a Huffman instance which168has been trained on a different sample (and hence has a different encoding169table), we are likely to get some random-looking string::170171sage: h3 = Huffman("There once were two french fries")172sage: h3.decode(encoded)173' wehnefetrhft ne ewrowrirTc'174175This does not look like our original string.176177Instead of using frequency, we can assign weights to each alphabetic178symbol::179180sage: from sage.coding.source_coding.huffman import Huffman181sage: T = {"a":45, "b":13, "c":12, "d":16, "e":9, "f":5}182sage: H = Huffman(T)183sage: L = ["deaf", "bead", "fab", "bee"]184sage: E = []185sage: for e in L:186... E.append(H.encode(e))187... print E[-1]188...189111110101100190101110101111911100010119210111011101193sage: D = []194sage: for e in E:195... D.append(H.decode(e))196... print D[-1]197...198deaf199bead200fab201bee202sage: D == L203True204"""205206def __init__(self, source):207r"""208Constructor for Huffman.209210See the docstring of this class for full documentation.211212EXAMPLES::213214sage: from sage.coding.source_coding.huffman import Huffman215sage: str = "Sage is my most favorite general purpose computer algebra system"216sage: h = Huffman(str)217218TESTS:219220Feeding anything else than a string or a dictionary::221222sage: Huffman(Graph())223Traceback (most recent call last):224...225ValueError: Input must be either a string or a dictionary.226"""227228# alphabetic symbol to Huffman encoding translation table229self._character_to_code = []230# Huffman binary tree231self._tree = None232# index of each alphabetic symbol233self._index = None234235if isinstance(source,basestring):236self._build_code(frequency_table(source))237elif isinstance(source, dict):238self._build_code(source)239else:240raise ValueError("Input must be either a string or a dictionary.")241242def _build_code_from_tree(self, tree, d, prefix):243r"""244Builds the Huffman code corresponding to a given tree and prefix.245246INPUT:247248- ``tree`` -- integer, or list of size `2`249250- ``d`` -- the dictionary to fill251252- ``prefix`` (string) -- binary string which is the prefix253of any element of the tree254255EXAMPLES::256257sage: from sage.coding.source_coding.huffman import Huffman258sage: str = "Sage is my most favorite general purpose computer algebra system"259sage: h = Huffman(str)260sage: d = {}261sage: h._build_code_from_tree(h._tree, d, prefix="")262"""263# This is really a recursive construction of a Huffman code. By264# feeding this class a sufficiently large alphabet, it is possible to265# exceed the maximum recursion depth and hence result in a RuntimeError.266try:267self._build_code_from_tree(tree[0],268d,269prefix="".join([prefix, "0"]))270self._build_code_from_tree(tree[1],271d,272prefix="".join([prefix, "1"]))273except TypeError:274d[tree] = prefix275276def _build_code(self, dic):277r"""278Constructs a Huffman code corresponding to an alphabet with the given279weight table.280281INPUT:282283- ``dic`` -- a dictionary that associates to each symbol of an alphabet284a numeric value. If we consider the frequency of each alphabetic285symbol, then ``dic`` is considered as the frequency table of the286alphabet with each numeric (non-negative integer) value being the287number of occurrences of a symbol. The numeric values can also288represent weights of the symbols. In that case, the numeric values289are not necessarily integers, but can be real numbers. In general,290we refer to ``dic`` as a weight table.291292EXAMPLE::293294sage: from sage.coding.source_coding.huffman import Huffman, frequency_table295sage: str = "Sage is my most favorite general purpose computer algebra system"296sage: h = Huffman(str)297sage: d = {}298sage: h._build_code(frequency_table(str))299"""300from heapq import heappush, heappop301heap = []302# Each alphabetic symbol is now represented by an element with303# weight w and index i.304for i, (s, w) in enumerate(dic.items()):305heappush(heap, (w, i))306for i in range(1, len(dic)):307weight_a, node_a = heappop(heap)308weight_b, node_b = heappop(heap)309heappush(heap, (weight_a + weight_b, [node_a, node_b]))310# dictionary of symbol to Huffman encoding311d = {}312self._tree = heap[0][1]313# Build the binary tree of a Huffman code, where the root of the tree314# is associated with the empty string.315self._build_code_from_tree(self._tree, d, prefix="")316self._index = dict((i, s) for i, (s, w) in enumerate(dic.items()))317self._character_to_code = dict(318(s, d[i]) for i, (s, w) in enumerate(dic.items()))319320def encode(self, string):321r"""322Encode the given string based on the current encoding table.323324INPUT:325326- ``string`` -- a string of symbols over an alphabet.327328OUTPUT:329330- A Huffman encoding of ``string``.331332EXAMPLES:333334This is how a string is encoded and then decoded::335336sage: from sage.coding.source_coding.huffman import Huffman337sage: str = "Sage is my most favorite general purpose computer algebra system"338sage: h = Huffman(str)339sage: encoded = h.encode(str); encoded340'00000110100010101011000011101010011100101010011011011100111101110010110100001011011111000001110101010001010110011010111111011001110100101000111110010011011100101011100000110001100101000101110101111101110110011000101011000111111101101111010010111001110100011'341sage: h.decode(encoded)342'Sage is my most favorite general purpose computer algebra system'343"""344if self._character_to_code:345return "".join(map(lambda x: self._character_to_code[x], string))346347def decode(self, string):348r"""349Decode the given string using the current encoding table.350351INPUT:352353- ``string`` -- a string of Huffman encodings.354355OUTPUT:356357- The Huffman decoding of ``string``.358359EXAMPLES:360361This is how a string is encoded and then decoded::362363sage: from sage.coding.source_coding.huffman import Huffman364sage: str = "Sage is my most favorite general purpose computer algebra system"365sage: h = Huffman(str)366sage: encoded = h.encode(str); encoded367'00000110100010101011000011101010011100101010011011011100111101110010110100001011011111000001110101010001010110011010111111011001110100101000111110010011011100101011100000110001100101000101110101111101110110011000101011000111111101101111010010111001110100011'368sage: h.decode(encoded)369'Sage is my most favorite general purpose computer algebra system'370371TESTS:372373Of course, the string one tries to decode has to be a binary one. If374not, an exception is raised::375376sage: h.decode('I clearly am not a binary string')377Traceback (most recent call last):378...379ValueError: Input must be a binary string.380"""381# This traverses the whole Huffman binary tree in order to work out382# the symbol represented by a stream of binaries. This method of383# decoding is really slow. A faster method is needed.384# TODO: faster decoding implementation385chars = []386tree = self._tree387index = self._index388for i in string:389if i == "0":390tree = tree[0]391elif i == "1":392tree = tree[1]393else:394raise ValueError("Input must be a binary string.")395if not isinstance(tree, list):396chars.append(index[tree])397tree = self._tree398return "".join(chars)399400def encoding_table(self):401r"""402Returns the current encoding table.403404INPUT:405406- None.407408OUTPUT:409410- A dictionary associating an alphabetic symbol to a Huffman encoding.411412EXAMPLES::413414sage: from sage.coding.source_coding.huffman import Huffman415sage: str = "Sage is my most favorite general purpose computer algebra system"416sage: h = Huffman(str)417sage: T = sorted(h.encoding_table().items())418sage: for symbol, code in T:419... print symbol, code420...421101422S 00000423a 1101424b 110001425c 110000426e 010427f 110010428g 0001429i 10000430l 10011431m 0011432n 110011433o 0110434p 0010435r 1111436s 1110437t 0111438u 10001439v 00001440y 10010441"""442return self._character_to_code.copy()443444def tree(self):445r"""446Returns the Huffman tree corresponding to the current encoding.447448INPUT:449450- None.451452OUTPUT:453454- The binary tree representing a Huffman code.455456EXAMPLES::457458sage: from sage.coding.source_coding.huffman import Huffman459sage: str = "Sage is my most favorite general purpose computer algebra system"460sage: h = Huffman(str)461sage: T = h.tree(); T462Digraph on 39 vertices463sage: T.show(figsize=[20,20])464<BLANKLINE>465"""466from sage.graphs.digraph import DiGraph467g = DiGraph()468g.add_edges(self._generate_edges(self._tree))469return g470471def _generate_edges(self, tree, parent="", bit=""):472"""473Generate the edges of the given Huffman tree.474475INPUT:476477- ``tree`` -- a Huffman binary tree.478479- ``parent`` -- (default: empty string) a parent vertex with exactly480two children.481482- ``bit`` -- (default: empty string) the bit signifying either the483left or right branch. The bit "0" denotes the left branch and "1"484denotes the right branch.485486OUTPUT:487488- An edge list of the Huffman binary tree.489490EXAMPLES::491492sage: from sage.coding.source_coding.huffman import Huffman493sage: H = Huffman("Sage")494sage: T = H.tree()495sage: T.edges(labels=None)496[('0', 'S: 01'), ('0', 'a: 00'), ('1', 'e: 10'), ('1', 'g: 11'), ('root', '0'), ('root', '1')]497"""498if parent == "":499u = "root"500else:501u = parent502s = "".join([parent, bit])503try:504left = self._generate_edges(tree[0], parent=s, bit="0")505right = self._generate_edges(tree[1], parent=s, bit="1")506L = [(u, s)] if s != "" else []507return left + right + L508except TypeError:509return [(u, "".join([self.decode(s), ": ", s]))]510511512