版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
python哈夫曼压缩与解压算法
压缩
#encoding: utf-8
from bitarray import bitarray
import random
import json
class Node(object):
    """Binary-tree node used to assemble the Huffman tree.

    All attributes default at class level, so a freshly created node is a
    childless leaf with weight 0 and an empty symbol.
    """

    # Child and parent links; None means "no such neighbour".
    left = right = parent = None
    # Weight of the node (how often its symbol(s) occur).
    times = 0
    # Symbol stored in the node.
    char = ''

    def __init__(self):
        super().__init__()
def countTimes(str):
    """Count how often each symbol occurs, ordered by ascending count.

    Args:
        str: The text to analyse. Any iterable of hashable symbols is
            accepted. The parameter name shadows the builtin but is kept
            so existing keyword callers continue to work.

    Returns:
        A list of ``(symbol, count)`` tuples sorted by ascending count.
        Symbols with equal counts stay in order of first appearance
        (dicts preserve insertion order and ``sorted`` is stable).
    """
    times = {}
    for symbol in str:
        times[symbol] = times.get(symbol, 0) + 1
    return sorted(times.items(), key=lambda item: item[1])
def getTree(data):
    """Build a Huffman tree from ``(symbol, weight)`` pairs.

    Args:
        data: Iterable of pairs whose first item is the symbol and whose
            second item is its weight, e.g. the list from ``countTimes``.

    Returns:
        The root ``Node`` of the tree, or ``None`` when ``data`` is empty.
        Every internal node stores the summed weight and the concatenated
        symbols of its two children. When ``data`` holds exactly one
        symbol, the lone leaf is hung as the left child of a root node so
        that it still receives a one-bit code instead of no tree at all.
    """
    # Turn every (symbol, weight) pair into a leaf node.
    nodes = []
    for item in data:
        leaf = Node()
        leaf.char = item[0]
        leaf.times = item[1]
        nodes.append(leaf)

    if len(nodes) == 1:
        # A single symbol cannot be merged with anything; give it a parent
        # so a traversal assigns it the code "0" rather than nothing.
        only = nodes[0]
        root = Node()
        root.times = only.times
        root.char = only.char
        root.left = only
        return root

    head = None  # holding the root is equivalent to holding the whole tree
    while len(nodes) >= 2:
        # Re-sort so the two lightest nodes are at the front. The sort is
        # stable and merged nodes are appended at the end, so ties keep
        # resolving exactly as before.
        nodes.sort(key=lambda node: node.times)
        first, second = nodes[0], nodes[1]
        merged = Node()
        merged.times = first.times + second.times
        merged.char = first.char + second.char
        merged.left = first
        merged.right = second
        head = merged
        del nodes[0:2]        # drop the two nodes that were just merged
        nodes.append(merged)  # their parent competes in the next round
    return head
def encode(head, code="", table=None):
    """Fill a ``symbol -> Huffman code`` mapping from a tree (leaves only).

    Args:
        head: Root node of the (sub)tree. Nodes need ``left``, ``right``
            and ``char`` attributes. ``None`` is treated as an empty tree.
        code: Bit-string prefix already accumulated on the way to ``head``.
        table: Dict that receives the codes. When omitted, the
            module-level ``result`` dict is updated, as before.

    Returns:
        The dict that was filled (``table`` or the module-level
        ``result``). Leaves are inserted left to right, taking "0" for a
        left edge and "1" for a right edge.
    """
    if table is None:
        # Backward-compatible default: write into the module-level dict.
        table = result
    if head is None:
        return table

    # Explicit stack instead of recursion, so a very deep (skewed) tree
    # cannot hit the interpreter's recursion limit.
    stack = [(head, code)]
    while stack:
        node, prefix = stack.pop()
        if node.left is None and node.right is None:
            # A leaf that is itself the root would get an empty code, which
            # encodes to zero bits and cannot be decoded; use "0" instead.
            table[node.char] = prefix or "0"
            continue
        # Push right first so the left subtree is processed first.
        if node.right is not None:
            stack.append((node.right, prefix + "1"))
        if node.left is not None:
            stack.append((node.left, prefix + "0"))
    return table
def printTree(head):
    """Print every node of the tree in post-order.

    The left subtree is printed first, then the right subtree, then the
    node itself, one ``char``/``times`` line per node.
    """
    for child in (head.left, head.right):
        if child is not None:
            printTree(child)
    print("char:%s\ttimes:%d" % (head.char, head.times))
def dict2bits(dictObject, endian="little"):
    """Serialise a dict as JSON and pack the bytes into a bitarray.

    The result can be written to a binary file as the header that
    precedes the compressed payload.

    Args:
        dictObject: JSON-serialisable dict, e.g. the symbol -> code table.
        endian: Bit-endianness passed to ``bitarray``.

    Returns:
        A ``bitarray`` holding the UTF-8 bytes of ``json.dumps(dictObject)``.
    """
    # Encode the whole document once instead of one character at a time.
    payload = json.dumps(dictObject).encode("utf-8")
    bits = bitarray(endian=endian)
    bits.frombytes(payload)
    return bits
def zipTobits(s, encodeList, endian="little"):
    """Compress a string into a byte-aligned bitarray using a code table.

    Args:
        s: Text to compress; every symbol must be a key of ``encodeList``
            (a missing symbol raises ``KeyError``).
        encodeList: Mapping of symbol -> code string made of "0"/"1".
        endian: Bit-endianness passed to ``bitarray``.

    Returns:
        A ``bitarray`` whose length is a multiple of 8: the concatenated
        codes of ``s`` followed by padding.

    Padding:
        Plain zero padding can itself spell a codeword, so a decoder would
        emit spurious trailing symbols. When some codeword is longer than
        the number of pad bits, the pad is the leading bits of that
        codeword instead: in a prefix-free code a proper prefix of a
        codeword never contains a complete codeword, so a decoder that
        drops an unfinished trailing codeword emits nothing for it. When
        no codeword is long enough the pad falls back to zeros and the
        trailing-symbol ambiguity remains.
    """
    bits = bitarray(endian=endian)
    for symbol in s:
        for flag in encodeList[symbol]:
            bits.append(flag != "0")

    pad_len = -len(bits) % 8
    if pad_len:
        longest = max(encodeList.values(), key=len, default="")
        if len(longest) > pad_len:
            padding = longest[:pad_len]
        else:
            padding = "0" * pad_len
        for flag in padding:
            bits.append(flag != "0")
    return bits
def saveBits(bits, encodeList):
    """Write the compressed data to the binary file ``ziped.hfm``.

    The symbol -> code table is written first (as produced by
    ``dict2bits``), immediately followed by the compressed payload.
    """
    with open("ziped.hfm", "wb") as out:
        for chunk in (dict2bits(encodeList), bits):
            out.write(chunk)
def saveStr(str):
    """Store the original, uncompressed text in ``unziped.hfm``."""
    out = open("unziped.hfm", "w")
    with out:
        out.write(str)
def getSeedStr(times):
    """Return ``times`` random lowercase ASCII letters as one string."""
    low, high = ord('a'), ord('z')
    return "".join(chr(random.randint(low, high)) for _ in range(0, times))
# a = "abcabcabcabcabcabcddddddddd"
# for x in range(1,10):
# a+=a
# Sample input: 100 random lowercase letters.
theStr = getSeedStr(100)
# Count the symbols of the text that is actually compressed below. The
# previous argument `a` only exists in the commented-out sample above and
# raised NameError.
sortedTimes = countTimes(theStr)
head = getTree(sortedTimes)
result = {}  # symbol -> code table, filled in place by encode()
encode(head)
bits = zipTobits(theStr, result)
saveBits(bits, result)
saveStr(theStr)
解压
#encoding: utf-8
from bitarray import bitarray
import json
def readFile(filepath):
    """Read a compressed file and split it into header and payload.

    The file is expected to start with the JSON object holding the
    symbol -> code table, immediately followed by the packed bits.

    Args:
        filepath: Path of the compressed file.

    Returns:
        A tuple ``(encodeListStr, zipedBits)``: the JSON text of the code
        table and a little-endian ``bitarray`` with the remaining bytes.
        When the file does not start with a valid JSON object, "Data
        Error" is printed and ``None`` is returned.
    """
    with open(filepath, "rb") as f:
        data = f.read()

    if not data.startswith(b"{"):
        print("Data Error")
        return

    try:
        # latin-1 maps every byte to exactly one character, so the end
        # offset reported by the JSON decoder is also a byte offset. Letting
        # the decoder find the end of the object (instead of scanning for
        # the first "}") keeps a "}" symbol inside the table from
        # truncating the header, and a missing "}" ends in an error rather
        # than an endless read loop at end of file.
        _, end = json.JSONDecoder().raw_decode(data.decode("latin-1"))
        encodeListStr = data[:end].decode("utf-8")
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError.
        print("Data Error")
        return

    zipedBits = bitarray(endian="little")
    zipedBits.frombytes(data[end:])
    return encodeListStr, zipedBits
def changeToBitArray(encodeList):
    """Convert a table of code strings into a table of bitarrays.

    ``{'a': '00'}`` becomes ``{'a': bitarray('00')}`` so the codes can be
    compared directly with bits read from the compressed data. A "0"
    character becomes a cleared bit; any other character a set bit.
    """
    converted = {}
    for symbol, code in encodeList.items():
        pattern = bitarray(endian="little")
        for flag in code:
            pattern.append(flag != "0")
        converted[symbol] = pattern
    return converted
def contain(bits, encodeBitArray):
    """Look up which symbol, if any, has exactly the code ``bits``.

    Returns:
        ``(True, symbol)`` for the first table entry whose code equals
        ``bits``, otherwise ``(False, '')``.
    """
    for symbol, pattern in encodeBitArray.items():
        if pattern == bits:
            return True, symbol
    return False, ''
def saveStr(str):
    """Store the decompressed text in ``decode.hfm``."""
    out = open("decode.hfm", "w")
    with out:
        out.write(str)
def decode(encodeList, bits):
    """Decompress a bit sequence with a symbol -> code table.

    Args:
        encodeList: Mapping of symbol -> code string; a "0" character
            stands for a cleared bit, any other character for a set bit.
        bits: The compressed bits, e.g. a ``bitarray``. Any iterable of
            truthy/falsy values works.

    Returns:
        The decoded string. Bits left over at the end that do not form a
        complete code are dropped. Padding bits that happen to spell a
        complete code are still decoded as symbols, because the format
        does not record how many bits are real.
    """
    # Reverse table keyed by normalised code string: each lookup is one
    # hash probe instead of a scan over every code. setdefault keeps the
    # first symbol when two symbols share a code, like the scan did.
    symbols = {}
    for symbol, code in encodeList.items():
        key = "".join("0" if flag == "0" else "1" for flag in code)
        symbols.setdefault(key, symbol)

    decoded = []
    pending = ""
    # Iterate once over the input: `bits.length()` is not provided by
    # recent bitarray releases, and re-slicing `bits[1:]` for every bit
    # copied the whole remainder each time.
    for bit in bits:
        pending += "1" if bit else "0"
        if pending in symbols:
            decoded.append(symbols[pending])
            pending = ""
    return "".join(decoded)
# readFile prints "Data Error" and returns None for a malformed file, so
# only unpack and decode when it actually produced a result.
loaded = readFile("ziped.hfm")
if loaded is not None:
    encodeListStr, zipedBits = loaded
    encodeList = json.loads(encodeListStr)
    decodeStr = decode(encodeList, zipedBits)
    saveStr(decodeStr)
主要问题
- 不满一个字节时的填充问题。有一种场景是:不够一个字节时填充了 `00`,但 `00` 恰好是某个字符的哈夫曼编码,此时不知如何进行处理。
- 哈夫曼压缩算法好像只能对字符进行压缩,这里限制在 `[a-z]`。其实也可以不限制,只不过要多做一些处理,比如编码字符范围包括 `{`、`}` 时,要进行转义操作之类的。
- 本人不善于算法,所以解压时进行匹配时只能每次都去匹配一下,不知道有没有其他算法可以改善这个情况?希望有大佬能指点指点。
至于第一个问题如果有人能告诉我如何解决那就更好了