GitHub Repository: SSQ/Coursera-Stanford-Greedy-Algorithms-Minimum-Spanning-Trees-and-Dynamic-Programming
Path: blob/master/Programming Assignment 3/HuffmanCoding.ipynb
⁴²³ views

Kernel: Python [conda env:DAND]

Huffman Coding

1 point

1.In this programming problem and the next you'll code up the greedy algorithm from the lectures on Huffman coding.

Download the text file below.

huffman.txt This file describes an instance of the problem. It has the following format:

[number_of_symbols]

[weight of symbol #1]

[weight of symbol #2]

...

For example, the third line of the file is "6852892," indicating that the weight of the second symbol of the alphabet is 6852892. (We're using weights instead of frequencies, like in the "A More Complex Example" video.)

Your task in this problem is to run the Huffman coding algorithm from lecture on this data set. What is the maximum length of a codeword in the resulting Huffman code?

ADVICE: If you're not getting the correct answer, try debugging your algorithm using some small test cases. And then post them to the discussion forum!

In [1]:

import pandas as pd
import numpy as np
import time

In [2]:

# get the file path
file_path = 'huffman.txt'
# file_path = 'test 1.txt'
# convert text file to np.array type
huffman_data = np.loadtxt(file_path)
int_huffman_data = huffman_data.astype(int)

# test np.loadtxt()
print('int_huffman_data[:3]: ')
print(int_huffman_data[:3])

Out[2]:

int_huffman_data[:3]: 
[7540662 6852892 3235725]

In [3]:

# test sort the weights inscreasing order
sorted_data = np.sort(int_huffman_data)
sorted_index = np.argsort(int_huffman_data) 

# test np.sort() and np.argsort()
for i in range(len(sorted_data)):
    if sorted_data[i] != [int_huffman_data[x] for x in sorted_index][i]:
        print("sorted failed")
        break
print("sorted successfully")

Out[3]:

sorted successfully

In [4]:

# test np.append() and np.delete

print(sorted_data[:10])
print(sorted_data[-10:])

sorted_data = np.append(sorted_data, sorted_data[0] + sorted_data[1])

sorted_data = np.delete(sorted_data, [0, 1])

print(sorted_data[:10])
print(sorted_data[-10:])

Out[4]:

[ 1873 12710 37164 40882 57802 61282 64537 70116 83429 90939]
[9879636 9879739 9889780 9911792 9916957 9933557 9959436 9968891 9978358
 9979223]
[ 37164  40882  57802  61282  64537  70116  83429  90939  97421 107416]
[9879739 9889780 9911792 9916957 9933557 9959436 9968891 9978358 9979223
   14583]

In [5]:

# test np.delete
print(sorted_index[:10])
print(sorted_index[-10:])

sorted_index = np.delete(sorted_index, [0, 1])

print(sorted_index[:10])
print(sorted_index[-10:])

Out[5]:

[471 798 752 448 554 867 356 957 598 382]
[701 773 430 896 876 105  33 780 514 342]
[752 448 554 867 356 957 598 382 210 538]
[701 773 430 896 876 105  33 780 514 342]

In [51]:

# test opertaions

test_data={}
for i in range(len(int_huffman_data)):
    test_data[int_huffman_data[i]] = [i]
#print(test_data)
print('int_huffman_data[0]: ')
print(int_huffman_data[0])

# print(sorted(test_data))
print('sorted(test_data)[0] + sorted(test_data)[1]: ')
print(sorted(test_data)[0] + sorted(test_data)[1])

sorted_test = sorted(test_data)

test_data[sorted_test[0]+sorted_test[1]] = [test_data[sorted_test[0]], test_data[sorted_test[1]]]

print('test_data[sorted_test[0]]: ')
print(test_data[sorted_test[0]])

print('test_data[sorted_test[1]]: ')
print(test_data[sorted_test[1]])

print('sorted(test_data)[0] + sorted(test_data)[1]: ')
print(sorted(test_data)[0] + sorted(test_data)[1])

print('test_data[sorted_test[0]+sorted_test[1]]: ')
print(test_data[sorted_test[0]+sorted_test[1]])

print('len(test_data[sorted_test[0]+sorted_test[1]]): ')
print(len(test_data[sorted_test[0]+sorted_test[1]]))

del test_data[sorted_test[0]]
del test_data[sorted_test[1]]

print('check exist: ')
print(sorted_test[0] in test_data.keys())

print('check exist: ')
print(sorted_test[1] in test_data.keys())

test_dict = {100000: [0,[2,1]],99: 99}
print("Value : %s" %  test_dict.values())
print("Value : %s" %  test_dict.values()[0])
print("Key : %s" %  test_dict.keys())
print("Key : %s" %  test_dict.keys()[0])

test_dict = {100000: [[0,3],[2,1]]}
print("Value : %s" %  test_dict.values()[0])

Out[51]:

int_huffman_data[0]: 
7540662
sorted(test_data)[0] + sorted(test_data)[1]: 
14583
test_data[sorted_test[0]]: 
[471]
test_data[sorted_test[1]]: 
[798]
sorted(test_data)[0] + sorted(test_data)[1]: 
14583
test_data[sorted_test[0]+sorted_test[1]]: 
[[471], [798]]
len(test_data[sorted_test[0]+sorted_test[1]]): 
2
check exist: 
False
check exist: 
False
Value : [[0, [2, 1]], 99]
Value : [0, [2, 1]]
Key : [100000, 99]
Key : 100000
Value : [[0, 3], [2, 1]]

In [53]:

# weightposition = {weight0: position0, weight1: position1, weight2: position2, ...}
WeightPosition={}
for i in range(len(int_huffman_data)):
    # adding int() to avoid "RuntimeWarning: overflow encountered in long_scalars"
    WeightPosition[int(int_huffman_data[i])] = [int(i)]

# input:
#     dictionary. i.e. WeightPosition = {weight0: position0, weight1: position1, weight2: position2, ...}.
# output:
#     array. i.e. WeightPosition.values()[0]
# function:
#     merge two smallest weights to a binary tree
def merger(WeightPosition):
    while len(WeightPosition) > 1:
        sorted_weight = sorted(WeightPosition) # return an array of sorted (weight)keys in dictionay WeightPosition
        
        # weightposition = { (weight0 + weight1) : [position0, position1], weight0: position0, weight1: position1, weight2: position2, ...}
        WeightPosition[sorted_weight[0]+sorted_weight[1]] = [WeightPosition[sorted_weight[0]], WeightPosition[sorted_weight[1]]]

        # weightposition = { (weight0 + weight1) : [position0, position1], weight2: position2, ...}
        del WeightPosition[sorted_weight[0]]
        del WeightPosition[sorted_weight[1]]
        
    return WeightPosition.values()[0] # only 1 value

mergered = merger(WeightPosition)
print("mergered length: ")
print(len(mergered))

Out[53]:

mergered length: 
2

In [64]:

# test 
test_dict2 = {[[2],[3]]: [0], [[3],[5]]: [1]}
for i, j in enumerate(test_dict2):
    test_dict2[i[0]] = j.extend([0])
    test_dict2[i[1]] = j.extend([1])
print(test_dict2)

Out[64]:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-64-d3c59e406f60> in <module>()
      1 # test
----> 2 test_dict2 = {[[2],[3]]: [0], [[3],[5]]: [1]}
      3 for i, j in enumerate(test_dict2):
      4     test_dict2[i[0]] = j.extend([0])
      5     test_dict2[i[1]] = j.extend([1])
TypeError: unhashable type: 'list'

In [66]:

# test
def test_func(a):
    return a
test_array = [1,2,3]
print(max(test_func(x) for x in test_array))

Out[66]:

3

In [67]:

# calculate the max_deep and min_deep of the tree
def min_depth(mergered):
    if len(mergered) == 1:
        return 0
    return 1 + min(min_depth(x) for x in mergered)

def max_depth(mergered):
    if len(mergered) == 1:
        return 0
    return 1 + max(max_depth(x) for x in mergered)

In [68]:

# get the answer
code_max_length = max_depth(mergered)
print('code_max_length: ')
print(code_max_length)

code_min_length = min_depth(mergered)
print('code_min_length: ')
print(code_min_length)

Out[68]:

code_max_length: 
19
code_min_length: 
9

Question 1 answer

1 point 2.Continuing the previous problem, what is the minimum length of a codeword in your Huffman code?

Huffman Coding

Question 1 answer

Question 2 answer

Product

Resources

Company