2019-04-27发表2019-11-26更新 Uchiha Kakashi 数据结构21 分钟读完 (大约3093个字)0次访问

哈夫曼树

对一棵树的叶子全部赋权值，那么权值乘上叶子的高度的和最小的树就是哈夫曼树。
构造哈夫曼树的方法如下：
（1）令S={ w1,w2,…,wt }
（2）从S中取出两个最小的权wi和wj，画结点vi，带权wi，画结点vj，带权wj。画vi和vj的父亲v，连接vi和v,vj和v，令v带权wi+wj
（3）令S = (S - { wi,wj })∪{ wi+wj }
（4）判断S是否只有一个元素，若是，则停止，否则重复（2）（3）（4）。

这样的话，构造一棵哈夫曼树的C++代码就不难写了。

#include <algorithm>
#include <iostream>
#include <sstream>
#include <random>
#include <vector>
struct Huff_Node {			//哈夫曼树的结点
	int weight;				//权重
	Huff_Node *lchild;		//左孩子指针
	Huff_Node *rchild;		//右孩子指针
	//默认构造函数，使默认构造出的结点左右指针均为空，方便后续操作
	Huff_Node(int d = 0, Huff_Node *l = NULL, Huff_Node *r = NULL) :weight(d), lchild(l), rchild(r) {  }
};
std::stringstream generateRand(int n) {
	using namespace std;
	stringstream sstr;
	default_random_engine rand;
	while (n--)
		sstr << rand() % 100 << ' ';
	return sstr;
}
void traverse(Huff_Node *p) {		//先序遍历
	if (p) {
		std::cout << p->weight << ' ';
		traverse(p->lchild);
		traverse(p->rchild);
	}
}
void deconstruct(Huff_Node *p) {	//析构树
	if (p) {
		deconstruct(p->lchild);
		deconstruct(p->rchild);
		delete p;
	}
}
void traverseLayer(Huff_Node *r, int layer)     {    //层次遍历
	if (r == NULL) return;
	traverseLayer(r->rchild, layer + 1);
	for (int i = 1; i < 5 * layer; ++i)
		std::cout << '-';
	std::cout << r->weight << std::endl;
	traverseLayer(r->lchild, layer + 1);
}
int main() {
	using namespace std;
	default_random_engine rand;			//初始化随机数引擎
	int n = rand() % 100;				//生成一个100以内的正整数n
	stringstream stm = generateRand(n);		//生成随机整数
	streambuf *p_sbf = cin.rdbuf(stm.rdbuf());	//重定向流
	vector<Huff_Node*> huffarr(n);		//初始化由哈夫曼树结点构成的动态数组
	for (int i = 0; i < n; ++i) {
		huffarr[i] = new Huff_Node;		//给结点分配空间
		cin >> huffarr[i]->weight;		//输入数据
	}
	while (huffarr.size() > 1) {		//在数组的大小大于1时
		sort(huffarr.begin(), huffarr.end(), [](Huff_Node *L, Huff_Node *R) { return L->weight > R->weight; });	//按照从大到小排序
		Huff_Node *l = huffarr.back();					//数组最后一位指针对应的子树做左孩子
		huffarr.pop_back();
		Huff_Node *r = huffarr.back();					//数组倒数第二位指针对应的子树做右孩子
		huffarr.pop_back();
		Huff_Node *f = new Huff_Node(l->weight + r->weight, l, r);	//创建这两个子树的父节点
		huffarr.push_back(f);									//把父节点压入数组
	}
	Huff_Node *root = huffarr.front();                                                            //数组第一位为根
	traverse(root);												//先序遍历
	cout << endl;
	traverseLayer(root, 1);
	deconstruct(root);
	root = NULL;									//将根指针指向空，防止误操作
	cin.rdbuf(p_sbf);							        //恢复流
}

这个方法的时间复杂度也比较好分析，sort方法的时间复杂度为nlogn，while语句的执行次数最多为logn，故时间复杂度为n*(logn)^2。

C++的STL中的优先级队列是依靠堆来实现的，添加或删除元素的时间复杂度均为logn。故可以借助优先级队列，将代码的平均时间复杂度优化到(logn)^2。仅需改变main函数中的以下代码。

#include <queue>
...
	struct MyCmp {
		bool operator()(Huff_Node *L, Huff_Node *R) {
			return L->weight > R->weight;
		}
	};
	priority_queue<Huff_Node*, vector<Huff_Node*>, MyCmp> Q;		//初始化由哈夫曼树结点构成的优先级队列
	while (n--) {
		Huff_Node *p = new Huff_Node;		//给结点分配空间
		cin >> p->weight;					//输入数据
		Q.push(p);
	}
	while (Q.size() > 1) {				//在队列只剩的大小大于1时
		Huff_Node *l = Q.top();					//队头元素做左孩子
		Q.pop();					                        //出队
		Huff_Node *r = Q.top();					//队头元素做右孩子
		Q.pop();					                        //出队
		Huff_Node *f = new Huff_Node(l->weight + r->weight, l, r);	//创建这两个子树的父节点
		Q.push(f);								//把父节点压入数组
	}
	Huff_Node *root = Q.top();								//数组的第一位作为根节点

使用哈夫曼树压缩文件的大概思路如下：
1）读取文件，统计每个字符出现的频率，并根据此构造哈夫曼树。
2）遍历此树，生成遍历字符串。
3）读取文件，根据读取到的字符，选择字符串以字节方式写入文件。
4）写入的文件为压缩文件

但仅靠此无法还原文件，故应将<字符，权重>组合写入文件中。

选择一个大小确定的字符，可以是unsigned char，记作byte，8bit，一字节。

unsigned char范围肯定是0-255，正好，来个vector<Node *>(256)。

#include <vector>
#include <string>
using namespace std;
typedef unsigned char byte;
struct Huff_Node {
        byte data;
	unsigned long weight;
        string str;
	Huff_Node *lchild;
	Huff_Node *rchild;
	Huff_Node(unsigned long d = 0, byte d, Huff_Node *l = NULL, Huff_Node *r = NULL) :data('\000'), weight(d), lchild(l), rchild(r) {  }
};
...
vector<Huff_Node *> Vec(256, nullptr);
for (size_t i = 0; i < 256; ++i) 
        Vec[i] = new Huff_Node(0, i);

然后就开始读文件，既然这是C++程序，不如用fstream。

#include <fstream>
using namespace std;
...
ifstream fin;
fin.open("test.txt", ios::binary);
int c;
while ((c = fin.get()) != EOF)
        ++Vec[c]->weight;
struct MyCmp {
	bool operator()(Huff_Node *L, Huff_Node *R) {
		return L->weight > R->weight;
	}
};
priority_queue<Huff_Node *, vector<Huff_Node *>, MyCmp> Q(Vec.begin(), Vec.end());
while (Q.size() > 1) {
	Huff_Node *l = Q.top();
	Q.pop();
	Huff_Node *r = Q.top();
	Q.pop();
	Huff_Node *f = new Huff_Node(l->weight + r->weight, '\000', l, r);
	Q.push(f);
}
Huff_Node *root = Q.top();

之后就要遍历这个树生成对应的二进制字符串了，所以

void traverse(Huff_Node *p, string s) {
	if (p->lchild == NULL && p->rchild == NULL) {
		p->str = s;
        	return;
    	}
    	traverse(p->lchild, s + '0');
    	traverse(p->rchild, s + '1');
}
traverse(root, "");

之后再过一次文件，生成二进制码写入文件，生成二进制码不如使用strtol，不用自己写函数了。

所以大概写完的样子就只这样的。

bool encode(std::string const &FileName, std::string const &DestName) {
	std::ifstream fin;
	std::ofstream fout;
	//打开源文件
	fin.open(FileName, std::ios::binary);
	if (!fin) {
		std::cerr << "文件" << FileName << "打开失败" << std::endl;
		return false;
	}
	//对于空文件特殊处理
	if (fin.peek() == EOF) {
		fout.open("DestName", std::ios::binary);
		if (!fout) {
			std::cerr << "文件" << DestName << "打开或创建失败" << std::endl;
		}
		fout.close();
		return false;
	}
	std::vector<Huff_Node *> Vec(256, nullptr);
	int c;
	//数组中非nullptr的个数
	byte count = 0;
	//统计文件中各个字节的权重，并分配空间
	while ((c = fin.get()) != EOF) {
		if (Vec[c]) ++Vec[c]->weight;
		else {
			Vec[c] = new Huff_Node(1, c);
			++count;
		}
	}
	//优先级定义
	struct MyCmp {
		bool operator()(Huff_Node *L, Huff_Node *R) {
			return L && R ? L->weight > R->weight:L && !R;
		}
	};
	//构造哈夫曼树
	std::priority_queue<Huff_Node *, std::vector<Huff_Node *>, MyCmp> Q(Vec.begin(), Vec.end());
	while (Q.size() > 1) {
		//对于空结点的处理
		if (Q.top() == nullptr) {
			Q.pop();
			continue;
		}
		Huff_Node *l = Q.top();
		Q.pop();
		Huff_Node *r = Q.top();
		Q.pop();
		Q.push(new Huff_Node(l->weight + r->weight, '\000', l, r));
	}
	Huff_Node *root = Q.top();
	//遍历，生成遍历字符串
	traverse(root, "");
	//清除EOF Flag
	fin.clear();
	//设置指针到文件开头
	fin.seekg(0);
	//创建文件
	fout.open(DestName, std::ios::binary);
	if (!fout) {
		std::cerr << "文件" << DestName << "打开或创建失败" << std::endl;
		return false;
	}
	//创建临时文件
	std::ofstream temp_out;
	char add_zero_length = 0;
	temp_out.open("temp.tmp", std::ios::binary);
	if (!temp_out) {
		std::cerr << "文件temp.tmp打开或创建失败" << std::endl;
		return false;
	}
	//将二进制码写入临时文件
	while ((c = fin.get()) != EOF)
		temp_out << Vec[c]->str;
	if (temp_out.tellp() % 8) {          //如果文件长度不是8的被数
		do {
			++add_zero_length;
			temp_out << '0';             //在末尾补0
		} while (temp_out.tellp() % 8);
	}
	temp_out.close();
	//开始写入文件头部
	//先写入数组实际的大小
	fout.write((char*)&count, 1);
	for (size_t i = 0; i < 256; ++i) {
		Huff_Node *p = Vec[i];
		if (p) {
			//写入字符
			fout.write((char*)&p->data, 1);
			//写入权重
			fout.write((char*)&p->weight, sizeof(unsigned long));
		}
	}
	//写入添加的0的个数
	fout.write((char*)&add_zero_length, sizeof(add_zero_length));
	std::ifstream temp_in;
	temp_in.open("temp.tmp", std::ios::binary);
	temp_in.seekg(0);
	char str[9] = {};
	while (temp_in.peek() != EOF) {                  //当未达到文件末尾
		temp_in.read(str, 8);								//读入八个字节
		char _str = char(std::strtol(str, nullptr, 2));     //创建一个字符
		fout.write(&_str, 1);								//写入这个字符
	}
	temp_in.close();
	fout.close();
	//删除临时文件
	return std::remove("temp.tmp") != EOF;
}

当然，将含有二进制码的字符串放在文件里不是什么好选择，等我有时间了再改。

encode函数写完了，自然decode函数也就完了…
所以写完的代码大概就是这样的吧。

#include <bitset>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <queue>
#include <string>
#include <vector>
using std::size_t;
typedef unsigned char byte;
struct Huff_Node {
	byte data;
	unsigned long weight;
	std::string str;
	Huff_Node *lchild;
	Huff_Node *rchild;
	Huff_Node(unsigned long w = 0, byte d = '\000', Huff_Node *l = nullptr, Huff_Node *r = nullptr) :data(d), weight(w), lchild(l), rchild(r) {  }
};
//优先级定义
struct MyCmp {
	bool operator()(Huff_Node *L, Huff_Node *R) {
		return L && R ? L->weight > R->weight:L && !R;
	}
};
//遍历字符串操作器
struct GenerateTraverseString {
	void operator()(Huff_Node *p, std::string const &s = "") {
		if (p->lchild == nullptr && p->rchild == nullptr) {
			p->str = s;
			return;
		}
		this->operator()(p->lchild, s + '0');
		this->operator()(p->rchild, s + '1');
	}
};
//析构哈夫曼树操作器
struct DeconstructTree {
	void operator()(Huff_Node *p) {
		if (p->lchild) this->operator()(p->lchild);
		if (p->rchild) this->operator()(p->rchild);
		delete p;
	}
};
//遍历函数
template <typename _VST>
void traverse(Huff_Node *p, _VST visit) {
	visit(p);
}
bool encode(std::string const &FileName, std::string const &DestName) {
	std::ifstream fin;
	std::ofstream fout;
	//打开源文件
	fin.open(FileName, std::ios::binary);
	if (!fin) {
		std::cerr << "文件" << FileName << "打开失败" << std::endl;
		return false;
	}
	//创建文件
	fout.open(DestName, std::ios::binary);
	if (!fout) {
		std::cerr << "文件" << DestName << "打开或创建失败" << std::endl;
		return false;
	}
	//对于空文件特殊处理
	if (fin.peek() == EOF) {
		fout.close();
		return true;
	}
	std::vector<Huff_Node *> Vec(256, nullptr);
	int c;
	//数组中非nullptr的个数
	byte count = 0;
	//统计文件中各个字节的权重，并分配空间
	while ((c = fin.get()) != EOF) {
		if (Vec[c]) ++Vec[c]->weight;
		else {
			Vec[c] = new Huff_Node(1, c);
			++count;
		}
	}
	Huff_Node *root = nullptr;
	if (count == 1) {
		for (size_t i = 0; i < 256; ++i)
			if (Vec[i] != nullptr)
				root = new Huff_Node(Vec[i]->weight, 0, new Huff_Node(), Vec[i]);
	}
	else {
		//构造哈夫曼树
		std::priority_queue<Huff_Node *, std::vector<Huff_Node *>, MyCmp> Q(Vec.begin(), Vec.end());
		while (Q.size() > 1) {
			//对于空结点的处理
			if (Q.top() == nullptr) {
				Q.pop();
				continue;
			}
			Huff_Node *l = Q.top();
			Q.pop();
			Huff_Node *r = Q.top();
			Q.pop();
			Q.push(new Huff_Node(l->weight + r->weight, '\000', l, r));
		}
		root = Q.top();
	}
	//遍历，生成遍历字符串
	traverse(root, GenerateTraverseString());
	//清除EOF Flag
	fin.clear();
	//设置指针到文件开头
	fin.seekg(0);
	//创建临时文件
	std::ofstream temp_out;
	char add_zero_length = 0;
	temp_out.open("temp.tmp", std::ios::binary);
	if (!temp_out) {
		std::cerr << "文件temp.tmp打开或创建失败" << std::endl;
		return false;
	}
	//将二进制码写入临时文件
	while ((c = fin.get()) != EOF)
		temp_out << Vec[c]->str;
	if (temp_out.tellp() % 8) {          //如果文件长度不是8的被数
		do {
			++add_zero_length;
			temp_out << '0';             //在末尾补0
		} while (temp_out.tellp() % 8);
	}
	temp_out.close();
	//开始写入文件头部
	//先写入数组实际的大小
	fout.write((char*)&count, 1);
	for (size_t i = 0; i < 256; ++i) {
		Huff_Node *p = Vec[i];
		if (p) {
			//写入字符
			fout.write((char*)&p->data, 1);
			//写入权重
			fout.write((char*)&p->weight, sizeof(unsigned long));
		}
	}
	//写入添加的0的个数
	fout.write((char*)&add_zero_length, sizeof(add_zero_length));
	std::ifstream temp_in;
	temp_in.open("temp.tmp", std::ios::binary);
	temp_in.seekg(0);
	char str[9] = {};
	while (temp_in.peek() != EOF) {                  //当未达到文件末尾
		temp_in.read(str, 8);								//读入八个字节
		char _str = char(std::strtol(str, nullptr, 2));     //创建一个字符
		fout.write(&_str, 1);								//写入这个字符
	}
	temp_in.close();
	fout.close();
	//析构树
	traverse(root, DeconstructTree());
	//删除临时文件
	return std::remove("temp.tmp") != EOF;
}
bool decode(std::string const &FileName, std::string const &DestName) {
	std::ifstream fin;
	std::ofstream fout;
	//打开源文件
	fin.open(FileName, std::ios::binary);
	if (!fin) {
		std::cerr << "文件" << FileName << "打开失败" << std::endl;
		return false;
	}
	//创建文件
	fout.open(DestName, std::ios::binary);
	if (!fout) {
		std::cerr << "文件" << DestName << "打开或创建失败" << std::endl;
		return false;
	}
	//对于空文件特殊处理
	if (fin.peek() == EOF) {
		fout.close();
		return true;
	}
	std::vector<Huff_Node *> Vec(256, nullptr);
	//数组中非nullptr的个数
	int count = [](int c) { return c == 0 ? 256 : c; }(fin.get());
	//读取文件中各个字节的权重，并分配空间
	for (int i = 0; i < count; ++i) {
		byte c = fin.get();
		Vec[c] = new Huff_Node(0, c);
		fin.read((char*)&Vec[c]->weight, sizeof(unsigned long));
	}
	Huff_Node *root = nullptr;
	if (count == 1) {
		for (size_t i = 0; i < 256; ++i)
			if (Vec[i] != nullptr)
				root = new Huff_Node(Vec[i]->weight, 0, new Huff_Node(), Vec[i]);
	}
	else {
		//构造哈夫曼树
		std::priority_queue<Huff_Node *, std::vector<Huff_Node *>, MyCmp> Q(Vec.begin(), Vec.end());
		while (Q.size() > 1) {
			//对于空结点的处理
			if (Q.top() == nullptr) {
				Q.pop();
				continue;
			}
			Huff_Node *l = Q.top();
			Q.pop();
			Huff_Node *r = Q.top();
			Q.pop();
			Q.push(new Huff_Node(l->weight + r->weight, '\000', l, r));
		}
		root = Q.top();
	}
	Huff_Node *p = root;
	//遍历，生成遍历字符串
	traverse(p, GenerateTraverseString());
	//添加的0个数
	count = fin.get();
	//创建临时文件
	std::ofstream temp_out;
	temp_out.open("temp.tmp", std::ios::binary);
	if (!temp_out) {
		std::cerr << "文件temp.tmp打开或创建失败" << std::endl;
		return false;
	}
	int c;
	char str[9] = {};
	//将二进制码写入临时文件
	while ((c = fin.get()) != EOF)
		temp_out << (std::bitset<8>(c).to_string());
	//文件的实际大小
	std::streampos size = temp_out.tellp() - (std::streampos)count;
	temp_out.close();
	//打开临时文件
	std::ifstream tmp_in("temp.tmp", std::ios::binary);
	//打开文件
	//遍历临时文件，并写入文件内容
	while (tmp_in.tellg() != size) {
		if (tmp_in.get() == '0')
			p = p->lchild;
		else p = p->rchild;
		if (p->lchild == NULL && p->rchild == NULL) {
			fout.write((char*)&p->data, 1);
			p = root;
		}
	}
	tmp_in.close();
	fout.close();
	//析构树
	traverse(root, DeconstructTree());
	//删除临时文件
	return remove("temp.tmp") != EOF;
}
int main() {
	bool flag1 = encode("test", "test.encode");
	bool flag2 = decode("test.encode", "decode.test");
}

哈夫曼树

http://example.com/2019/04/27/2019-04-27-哈夫曼树/

作者

Uchiha Kakashi

发布于

2019-04-27

更新于

2019-11-26

许可协议

#C++数据结构

哈夫曼树

作者

发布于

更新于

许可协议

链接

分类

最新文章

归档

标签