哈夫曼编码
- 哈夫曼编码定义:哈夫曼编码是一种数据压缩技术,由大卫·哈夫曼开发,用于有效压缩经常出现的字符,从而减少数据的大小,同时不丢失任何信息。
- 哈夫曼编码过程:这种编码通过构建一棵基于字符频率的树来为每个字符生成独特的编码。它采用前缀码,确保无歧义解码,通过将字符按频率排序并逐步构建树来完成编码。
- 哈夫曼编码应用:哈夫曼编码广泛应用于传统压缩格式(如GZIP、BZIP2、PKZIP等),文本和传真传输中,提供了有效的压缩方法,以减小数据传输大小。
霍夫曼编码是一种压缩数据的技术,可以在不丢失任何细节的情况下减少数据的大小。它最初是由大卫·霍夫曼开发的。
霍夫曼编码通常用于压缩那些经常出现字符的数据。
霍夫曼编码如何工作?
假设下面的字符串需要通过网络发送。
每个字符占用 8 位。上面的字符串总共有 15 个字符。因此,发送此字符串需要 8 * 15 = 120
位。
使用霍夫曼编码技术,我们可以将字符串压缩到更小的大小。
霍夫曼编码首先使用字符的频率创建一棵树,然后为每个字符生成代码。
一旦数据被编码,就必须解码。解码使用相同的树进行。
霍夫曼编码通过前缀码的概念防止解码过程中的任何歧义,即与一个字符相关联的代码不应出现在任何其他代码的前缀中。上面创建的树有助于维护这一属性。
霍夫曼编码通过以下步骤完成。
-
计算字符串中每个字符的频率。
-
按频率递增顺序对字符进行排序。这些存储在优先队列
Q
中。 -
将每个唯一字符作为叶节点。
-
创建一个空节点
z
。将最小频率分配给z
的左子节点,并将第二最小频率分配给z
的右子节点。将z
的值设置为上述两个最小频率的总和。 -
从
Q
中移除这两个最小频率,并将总和添加到频率列表中(上图中的 * 表示内部节点)。 -
将节点
z
插入树中。 -
重复步骤 3 到 5,直到所有字符处理完毕。
-
对于每个非叶节点,将左边缘赋值为 0,右边缘赋值为 1。
要通过网络发送上述字符串,我们必须发送树和上述压缩代码。总大小由下表给出。
字符 | 频率 | 代码 | 大小 |
---|---|---|---|
A | 5 | 11 | 5*2 = 10 |
B | 1 | 100 | 1*3 = 3 |
C | 6 | 0 | 6*1 = 6 |
D | 3 | 101 | 3*3 = 9 |
4 * 8 = 32 位 | 15 位 | 28 位 |
未编码时,字符串的总大小为 120 位。编码后大小减少到 32 + 15 + 28 = 75
。
解码代码
为了解码代码,我们可以取得代码并通过树来找到字符。
假设要解码 101,我们可以从根开始遍历,如下图所示。
霍夫曼编码算法
创建一个包含每个独特字符的优先队列 Q。
按它们的频率升序排序。
对于所有独特的字符:
创建一个新节点
从 Q 中提取最小值,并将其赋给 newNode 的左子节点
从 Q 中提取最小值,并将其赋给 newNode 的右子节点
计算这两个最小值的总和,并将其赋给 newNode 的值
将这个 newNode 插入树中
返回根节点
Python、Java 和 C/C++ 示例
# Python 中的霍夫曼编码
string = 'BCAADDDCCACACAC'
# 创建树节点
class NodeTree(object):
def __init__(self, left=None, right=None):
self.left = left
self.right = right
def children(self):
return (self.left, self.right)
def nodes(self):
return (self.left, self.right)
def __str__(self):
return '%s_%s' % (self.left, self.right)
# 实现霍夫曼编码的主函数
def huffman_code_tree(node, left=True, binString=''):
if type(node) is str:
return {node: binString}
(l, r) = node.children()
d = dict()
d.update(huffman_code_tree(l, True, binString + '0'))
d.update(huffman_code_tree(r, False, binString + '1'))
return d
# 计算频率
freq = {}
for c in string:
if c in freq:
freq[c] += 1
else:
freq[c] = 1
freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
nodes = freq
while len(nodes) > 1:
(key1, c1) = nodes[-1]
(key2, c2) = nodes[-2]
nodes = nodes[:-2]
node = NodeTree(key1, key2)
nodes.append((node, c1 + c2))
nodes = sorted(nodes, key=lambda x: x[1], reverse=True)
huffmanCode = huffman_code_tree(nodes[0][0])
print(' 字符 | 霍夫曼编码 ')
print('----------------------')
for (char, frequency) in freq:
print(' %-4r |%12s' % (char, huffmanCode[char]))
// C++中的哈夫曼编码
import java.util.PriorityQueue;
import java.util.Comparator;
class HuffmanNode {
int item;
char c;
HuffmanNode left;
HuffmanNode right;
}
// For comparing the nodes
class ImplementComparator implements Comparator<HuffmanNode> {
public int compare(HuffmanNode x, HuffmanNode y) {
return x.item - y.item;
}
}
// IMplementing the huffman algorithm
public class Huffman {
public static void printCode(HuffmanNode root, String s) {
if (root.left == null && root.right == null && Character.isLetter(root.c)) {
System.out.println(root.c + " | " + s);
return;
}
printCode(root.left, s + "0");
printCode(root.right, s + "1");
}
public static void main(String[] args) {
int n = 4;
char[] charArray = { 'A', 'B', 'C', 'D' };
int[] charfreq = { 5, 1, 6, 3 };
PriorityQueue<HuffmanNode> q = new PriorityQueue<HuffmanNode>(n, new ImplementComparator());
for (int i = 0; i < n; i++) {
HuffmanNode hn = new HuffmanNode();
hn.c = charArray[i];
hn.item = charfreq[i];
hn.left = null;
hn.right = null;
q.add(hn);
}
HuffmanNode root = null;
while (q.size() > 1) {
HuffmanNode x = q.peek();
q.poll();
HuffmanNode y = q.peek();
q.poll();
HuffmanNode f = new HuffmanNode();
f.item = x.item + y.item;
f.c = '-';
f.left = x;
f.right = y;
root = f;
q.add(f);
}
System.out.println(" Char | Huffman code ");
System.out.println("--------------------");
printCode(root, "");
}
}
// Huffman Coding in C
#include <stdio.h>
#include <stdlib.h>
#define MAX_TREE_HT 50
struct MinHNode {
char item;
unsigned freq;
struct MinHNode *left, *right;
};
struct MinHeap {
unsigned size;
unsigned capacity;
struct MinHNode **array;
};
// Create nodes
struct MinHNode *newNode(char item, unsigned freq) {
struct MinHNode *temp = (struct MinHNode *)malloc(sizeof(struct MinHNode));
temp->left = temp->right = NULL;
temp->item = item;
temp->freq = freq;
return temp;
}
// Create min heap
struct MinHeap *createMinH(unsigned capacity) {
struct MinHeap *minHeap = (struct MinHeap *)malloc(sizeof(struct MinHeap));
minHeap->size = 0;
minHeap->capacity = capacity;
minHeap->array = (struct MinHNode **)malloc(minHeap->capacity * sizeof(struct MinHNode *));
return minHeap;
}
// Function to swap
void swapMinHNode(struct MinHNode **a, struct MinHNode **b) {
struct MinHNode *t = *a;
*a = *b;
*b = t;
}
// Heapify
void minHeapify(struct MinHeap *minHeap, int idx) {
int smallest = idx;
int left = 2 * idx + 1;
int right = 2 * idx + 2;
if (left < minHeap->size && minHeap->array[left]->freq < minHeap->array[smallest]->freq)
smallest = left;
if (right < minHeap->size && minHeap->array[right]->freq < minHeap->array[smallest]->freq)
smallest = right;
if (smallest != idx) {
swapMinHNode(&minHeap->array[smallest], &minHeap->array[idx]);
minHeapify(minHeap, smallest);
}
}
// Check if size if 1
int checkSizeOne(struct MinHeap *minHeap) {
return (minHeap->size == 1);
}
// Extract min
struct MinHNode *extractMin(struct MinHeap *minHeap) {
struct MinHNode *temp = minHeap->array[0];
minHeap->array[0] = minHeap->array[minHeap->size - 1];
--minHeap->size;
minHeapify(minHeap, 0);
return temp;
}
// Insertion function
void insertMinHeap(struct MinHeap *minHeap, struct MinHNode *minHeapNode) {
++minHeap->size;
int i = minHeap->size - 1;
while (i && minHeapNode->freq < minHeap->array[(i - 1) / 2]->freq) {
minHeap->array[i] = minHeap->array[(i - 1) / 2];
i = (i - 1) / 2;
}
minHeap->array[i] = minHeapNode;
}
void buildMinHeap(struct MinHeap *minHeap) {
int n = minHeap->size - 1;
int i;
for (i = (n - 1) / 2; i >= 0; --i)
minHeapify(minHeap, i);
}
int isLeaf(struct MinHNode *root) {
return !(root->left) && !(root->right);
}
struct MinHeap *createAndBuildMinHeap(char item[], int freq[], int size) {
struct MinHeap *minHeap = createMinH(size);
for (int i = 0; i < size; ++i)
minHeap->array[i] = newNode(item[i], freq[i]);
minHeap->size = size;
buildMinHeap(minHeap);
return minHeap;
}
struct MinHNode *buildHuffmanTree(char item[], int freq[], int size) {
struct MinHNode *left, *right, *top;
struct MinHeap *minHeap = createAndBuildMinHeap(item, freq, size);
while (!checkSizeOne(minHeap)) {
left = extractMin(minHeap);
right = extractMin(minHeap);
top = newNode('$', left->freq + right->freq);
top->left = left;
top->right = right;
insertMinHeap(minHeap, top);
}
return extractMin(minHeap);
}
void printHCodes(struct MinHNode *root, int arr[], int top) {
if (root->left) {
arr[top] = 0;
printHCodes(root->left, arr, top + 1);
}
if (root->right) {
arr[top] = 1;
printHCodes(root->right, arr, top + 1);
}
if (isLeaf(root)) {
printf(" %c | ", root->item);
printArray(arr, top);
}
}
// Wrapper function
void HuffmanCodes(char item[], int freq[], int size) {
struct MinHNode *root = buildHuffmanTree(item, freq, size);
int arr[MAX_TREE_HT], top = 0;
printHCodes(root, arr, top);
}
// Print the array
void printArray(int arr[], int n) {
int i;
for (i = 0; i < n; ++i)
printf("%d", arr[i]);
printf("\n");
}
int main() {
char arr[] = {'A', 'B', 'C', 'D'};
int freq[] = {5, 1, 6, 3};
int size = sizeof(arr) / sizeof(arr[0]);
printf(" Char | Huffman code ");
printf("\n--------------------\n");
HuffmanCodes(arr, freq, size);
}
// Huffman Coding in C++
#include <iostream>
using namespace std;
#define MAX_TREE_HT 50
// 哈夫曼树节点结构
struct MinHNode {
unsigned freq; // 频率
char item; // 字符
struct MinHNode *left, *right; // 左子节点和右子节点
};
struct MinH {
unsigned size; // 大小
unsigned capacity; // 容量
struct MinHNode **array; // 数组
};
// 创建哈夫曼树节点
struct MinHNode *newNode(char item, unsigned freq) {
struct MinHNode *temp = (struct MinHNode *)malloc(sizeof(struct MinHNode));
temp->left = temp->right = NULL;
temp->item = item;
temp->freq = freq;
return temp;
}
// 创建最小堆
struct MinH *createMinH(unsigned capacity) {
struct MinH *minHeap = (struct MinH *)malloc(sizeof(struct MinH));
minHeap->size = 0;
minHeap->capacity = capacity;
minHeap->array = (struct MinHNode **)malloc(minHeap->capacity * sizeof(struct MinHNode *));
return minHeap;
}
// 打印数组
void printArray(int arr[], int n) {
int i;
for (i = 0; i < n; ++i)
cout << arr[i];
cout << "\n";
}
// 交换节点
void swapMinHNode(struct MinHNode **a, struct MinHNode **b) {
struct MinHNode *t = *a;
*a = *b;
*b = t;
}
// 堆化
void minHeapify(struct MinH *minHeap, int idx) {
int smallest = idx;
int left = 2 * idx + 1;
int right = 2 * idx + 2;
if (left < minHeap->size && minHeap->array[left]->freq < minHeap->array[smallest]->freq)
smallest = left;
if (right < minHeap->size && minHeap->array[right]->freq < minHeap->array[smallest]->freq)
smallest = right;
if (smallest != idx) {
swapMinHNode(&minHeap->array[smallest],
&minHeap->array[idx]);
minHeapify(minHeap, smallest);
}
}
// 检查大小是否为1
int checkSizeOne(struct MinH *minHeap) {
return (minHeap->size == 1);
}
// 提取最小值
struct MinHNode *extractMin(struct MinH *minHeap) {
struct MinHNode *temp = minHeap->array[0];
minHeap->array[0] = minHeap->array[minHeap->size - 1];
--minHeap->size;
minHeapify(minHeap, 0);
return temp;
}
// 插入节点到最小堆
void insertMinHeap(struct MinH *minHeap, struct MinHNode *minHeapNode) {
++minHeap->size;
int i = minHeap->size - 1;
while (i && minHeapNode->freq < minHeap->array[(i - 1) / 2]->freq) {
minHeap->array[i] = minHeap->array[(i - 1) / 2];
i = (i - 1) / 2;
}
minHeap->array[i] = minHeapNode;
}
// 构建最小堆
void buildMinHeap(struct MinH *minHeap) {
int n = minHeap->size - 1;
int i;
for (i = (n - 1) / 2; i >= 0; --i)
minHeapify(minHeap, i);
}
// 检查是否是叶子节点
int isLeaf(struct MinHNode *root) {
return !(root->left) && !(root->right);
}
// 创建并构建最小堆
struct MinH *createAndBuildMinHeap(char item[], int freq[], int size) {
struct MinH *minHeap = createMinH(size);
for (int i = 0; i < size; ++i)
minHeap->array[i] = newNode(item[i], freq[i]);
minHeap->size = size;
buildMinHeap(minHeap);
return minHeap;
}
// 构建哈夫曼树
struct MinHNode *buildHfTree(char item[], int freq[], int size) {
struct MinHNode *left, *right, *top;
struct MinH *minHeap = createAndBuildMinHeap(item, freq, size);
while (!checkSizeOne(minHeap)) {
left = extractMin(minHeap);
right = extractMin(minHeap);
top = newNode('$', left->freq + right->freq);
top->left = left;
top->right = right;
insertMinHeap(minHeap, top);
}
return extractMin(minHeap);
}
// 打印哈夫曼编码
void printHCodes(struct MinHNode *root, int arr[], int top) {
if (root->left) {
arr[top] = 0;
printHCodes(root->left, arr, top + 1);
}
if (root->right) {
arr[top] = 1;
printHCodes(root->right, arr, top + 1);
}
if (isLeaf(root)) {
cout << root->item << " | ";
printArray(arr, top);
}
}
// 哈夫曼编码的包装函数
void HuffmanCodes(char item[], int freq[], int size) {
struct MinHNode *root = buildHfTree(item, freq, size);
int arr[MAX_TREE_HT], top = 0;
printHCodes(root, arr, top);
}
int main() {
char arr[] = {'A', 'B', 'C', 'D'};
int freq[] = {5, 1, 6, 3};
int size = sizeof(arr) / sizeof(arr[0]);
cout << "Char | Huffman code ";
cout << "\n----------------------\n";
HuffmanCodes(arr, freq, size);
}
哈夫曼编码的复杂性
根据字符频率对每个唯一字符进行编码的时间复杂度为 O(nlog n)
。
从优先队列中提取最小频率发生了 2*(n-1)
次,其复杂度为 O(log n)
。因此总体复杂度为 O(nlog n)
。
哈夫曼编码的应用
- 哈夫曼编码用于传统的压缩格式,如GZIP、BZIP2、PKZIP等。
- 用于文本和传真传输。