Saturday, March 15, 2008

Huffman Compression

I was bored one day, and a friend of mine told me about a CSE 143 homework assignment involving Huffman compression. It interested me enough to where I decided to implement this algorithm. However, the way I store the binary data at the end concerned me, so I didn't continue on towards decompression.

huffman.py

from tree import *
from sys import argv
import os
import sys
import math

file = None



class LHash(dict):
def popmin2(self):
if not len(self):
return []
elif len(self) < 2:
min = self.keys()[0]
ret = [(min, self[min].pop(0))]
if not len(self[min]):
del self[min]
return ret
ret = [None, None]
min = self.keys()[0]
ret[0] = (min, self[min].pop(0))
if len(self[min]):
ret[1] = (min, self[min].pop(0))
if not len(self[min]):
del self[min]
else:
del self[min]
min = self.keys()[0]
ret[1] = (min, self[min].pop(0))
if not len(self[min]):
del self[min]
return ret
class DHash(dict):
def levels(self):
ret = LHash()
for i in self.keys():
x = self[i]
if x in ret:
ret[x] += [i]
else:
ret[x] = [i]
return ret

class BList(tuple):
def out(self):
outstring = ''
tnum = 0
split = 7
l = len(self) / split
#print '%d / 8 = %d' % (len(self), l)
for i in range(0, l):
tnum = 0
ch = self[split * i:split * i + split]
for i in range(0, split):
tnum += ch[i] << split - i - 1
outstring += chr(tnum)
last = self[split * (l - 1):len(self)]
if len(last):
while len(last) < split:
last += (0,)
tnum = 0
print last
for i in range(0, split):
tnum += last[i] << split - i - 1
outstring += chr(tnum)
return outstring

self = argv.pop(0)

for i in argv:
if os.path.exists(i):
file = i
break
if file:
print 'Huffman\'ing', file
fp = open(file, 'r')
text = fp.read()
fp.close()
chars = DHash()
for i in text:
#print '%s -> %d' % (i, ord(i))
t = ord(i)
if t in chars:
chars[t] += 1
else:
chars[t] = 1
levels = chars.keys()
levels.sort(reverse = True)
max = levels[0]
chars[max + 1] = 1
levels = chars.levels()
cat = [1]
newt = 1
while len(levels) > 1 and len(cat):
cat = levels.popmin2()
newt = cat[0][0] + cat[1][0]
ttree = HTree(newt)
toadd = ()
if isinstance(cat[0][1], HTree):
toadd += (cat[0][1].rootnode(),)
else:
toadd += (cat[0][1],)
if isinstance(cat[1][1], HTree):
toadd += (cat[1][1].rootnode(),)
else:
toadd += (cat[1][1],)
ttree.addPair(add = toadd)
if newt in levels:
levels[newt] += [ttree]
else:
levels[newt] = [ttree]
htree = levels[newt]
print ttree
thash = {}
ttree.iterGet(thash)
print thash
#for i in thash:
#print '%s: %s' % (i, thash[i])
sys.exit(0)
outbinary = ()
for i in text:
outbinary += thash[ord(i)]
outbinary = BList(outbinary)
print outbinary.out()

tree.py

class HTree(object):
class Node(object):
def __init__(self, data):
self.data = data
self.children = [None, None]
def __repr__(self):
ret = '[' + repr(self.data) + ': '
if self.children[0] == None and self.children[1] == None:
ret += 'None, None]'
else:
ret += repr(self.children[0]) + ', ' + repr(self.children[1]) + ']'
return ret
class NodeNotFoundException(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return 'No node at position ' + repr(self.value)
class InvalidPositionException(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return 'Given position not 0 or 1: ' + repr(self.value)
def __init__(self, value = None):
self.root = self.Node(value)
self.size = 1
def rootnode(self):
return self.root
def addPair(self, add, position = None):
cursor = self.root
tmp = None
if isinstance(position, list) or isinstance(position, tuple):
i = 0
#print 'i: 0, len: %d' % (len(position),)
while i < len(position) and cursor:
if not (position[i] == 0 or position[i] == 1):
raise InvalidPositionException, position[i]
#print 'cursor: ' + repr(cursor)
cursor = cursor.children[position[i]]
#print 'cursor: ' + repr(cursor)
i += 1
if not cursor:
raise NodeNotFoundException, str(position)
tmp = []
if isinstance(add[0], self.Node):
tmp += [add[0]]
else:
tmp += [self.Node(add[0])]
if isinstance(add[1], self.Node):
tmp += [add[1]]
else:
tmp += [self.Node(add[1])]
#print repr(tmp)
if cursor and tmp:
cursor.children = tmp
self.size += 1
def iterGet(self, hash, root = None, pos = ()):
if not root:
root = self.root
#print '\t%s' % str(root)
if root.children[0]:
self.iterGet(hash, root.children[0], pos + (0,))
if not root.children[0] and not root.children[1]:
if isinstance(root.data, str):
raise Exception
hash[root.data] = pos
if root.children[1]:
self.iterGet(hash, root.children[1], pos + (1,))
def __repr__(self):
return 'TREE: ' + repr(self.root)

No comments: