基于哈夫曼编码的文件压缩

236 阅读 0 评论 156 点赞

我是靠谱客的博主开心皮皮虾，这篇文章主要介绍基于哈夫曼编码的文件压缩，现在分享给大家，希望可以做个参考。

//Heap.h

//堆类
#pragma once
#include<iostream>
using namespace std;
#include<vector>
#include<cassert>
//仿函数（函数对象）--建小堆
template<typename T>
struct Less
{
    bool operator()(const T& left,const T& right)
    {
        return left<right;
    }
};
//仿函数（函数对象）--建大堆
template<typename T>
struct Great
{
    bool operator()(const T& left,const T& right)
    {
        return left>right;
    }
};

template<typename T,typename Compare>
class Heap
{
public:
    Heap()//默认构造函数（全缺省构造函数）
    {}
    Heap(const T* arr,size_t sz)//构造函数
    {
         assert(arr);
         //1.将数组中的元素放入堆中
         for (int i=0;i<sz;++i)
         {
             _heap.push_back(arr[i]);
         }
         //2.从堆中(sz-2)/2的位置（即最后一个非叶子节点位置）开始利用向下调整建堆
         for (int i=(sz-2)/2;i>=0;++i)
         {
             AdjustDown(i);
         }
    }
    void Push(const T& x)//插入--插入到最后一个位置，然后沿着该节点到根结点的路径向上调整
    {
         _heap.push_back(x);
         AdjustUp(_heap.size()-1);
    }
    void Pop()//删除（堆顶元素）---将堆顶元素和堆底元素交换；删除堆底元素；然后从堆顶向下调整
    {
        if (_heap.empty())
        {
            cout<<"heap empty"<<endl;
        }
        else
        {
            //先交换堆底和堆顶元素
            std::swap(_heap[0],_heap[_heap.size()-1]);
            _heap.pop_back();
            //然后从堆顶开始向下调整
            AdjustDown(0);
        }
    }
    const T& Top()//取最值（堆顶元素）
    {
        if (!_heap.empty())
        {
             return _heap[0];
        }
        else
        {
            cout<<"heap empty!"<<endl;
        }
    }
    size_t Size()//返回堆中元素个数
    {
       return _heap.size();
    }
    bool Empty()//判断堆是否为空
    {
       return _heap.empty();
    }
private:
    //向下调整
    void AdjustDown(size_t index)
    {
        size_t leftchild=index*2+1;//左孩子
        Compare com;//大小堆

        while (leftchild<_heap.size())
        {
            //1.寻找左右孩子的最值下标
            if ( leftchild+1<_heap.size() && com(_heap[leftchild+1],_heap[leftchild]))
            {
                leftchild++;//记录左右孩子的最值下标
            }
            //2.比较并决定是否交换
            //2.1 交换,继续向下调整
            if (com(_heap[leftchild],_heap[index]))
            {
                std::swap(_heap[leftchild],_heap[index]);
                //继续向下调整
                index=leftchild;
                leftchild=index*2+1;
            }
            //2.2不交换，那么不用继续向下调整
            else
                 break;
        }
    }
    //向上调整
    void AdjustUp(int index)
    {
        int father=(index-2)/2;//父节点
        Compare com;
        while (father>=0)
        {
            //交换,继续向上调整
            if (com(_heap[index],_heap[father]))
            {
                std::swap(_heap[index],_heap[father]);
                index=father;
                father=(index-2)/2;//继续向上调整
            }
            //不交换，不用向上调整
            else
                break;
        }
    }
private:
    vector<T> _heap;
};

//HuffmanTree.h

//哈夫曼树类
#pragma  once
#include<iostream>
using namespace std;
#include<cassert>
#include "Heap.h"//借助堆构建哈夫曼树

template<typename T>
struct HuffmanTreeNode
{
    HuffmanTreeNode<T>* _leftchild;//左孩子
    HuffmanTreeNode<T>* _rightchild;//右孩子

    T _weight;//权值
    HuffmanTreeNode(const T& weight=0)
        :_leftchild(NULL)
        ,_rightchild(NULL)
        ,_weight(weight)
    {}
};

template<typename T>
class HuffmanTree
{
    typedef HuffmanTreeNode<T> Node;
public:
    HuffmanTree()//默认构造函数（全缺省构造函数）
        :_root(NULL)
    {}
    //利用贪心算法构造哈夫曼树
    HuffmanTree(const T* arr,size_t sz,const T& invaild)//构造
    { 
        _root=CreatHuffmanTree(arr,sz,invaild);
    }
    ~HuffmanTree()//析构
    {
        _Destroy(_root);
    }
    Node* GetRoot()//获取哈夫曼树的根结点
    {
         return _root;
    }
private:
    //构建哈夫曼树
    Node* CreatHuffmanTree( const T* arr,size_t sz,const T& invaild)
    {
        struct LessNode
        {
             bool operator()(const Node* left,const  Node* right)
             {
                 return left->_weight < right->_weight;
             }
        };
         Heap<Node*,LessNode> MinHeap;
         //1.依据权值建结点的MinHeap
         for (size_t i=0;i<sz;i++)
         {
             if (arr[i]!=invaild)
             {
                 Node* node=new Node(arr[i]);
                 MinHeap.Push(node);
             }
         }
         //2.构建huffmanTree
        while (MinHeap.Size()>1)
        {
            //2.从堆中依次去除两个最小的结点建树
            Node* left=MinHeap.Top();
            MinHeap.Pop();
            Node* right=MinHeap.Top();
            MinHeap.Pop();
            Node* father=new Node(left->_weight+right->_weight);
            father->_leftchild=left;
            father->_rightchild=right;
            //3.将两个最小值的和入堆
            MinHeap.Push(father);
        }
        _root=MinHeap.Top();
        MinHeap.Pop();
        return _root;
    }

    void _Destroy(Node* root)
    {
        if (root==NULL)
        {
            return ;
        }
        else
        {
            _Destroy(root->_leftchild);
            _Destroy(root->_rightchild);
            delete root;
            root=NULL;
        }
    }
private:
    Node* _root;
};

//FileCompression.h

//文件压缩类
#pragma once 
#include "HuffmanTree.h"
#include<string>
#include<cstdlib>
#include <cstdio>
typedef long long LongType;
//字符信息类---充当哈夫曼树的结点权值
struct CharInformation
{

    unsigned char _c;//出现的字符

    //ASCLL表中一共有0-255十进制编号的256个字符
    //unsigned char 是单字节的全8位表示字符类型，可以表示这256个字符

    string _code;//该字符的哈夫曼编码
    LongType _count;//该字符在文件中出现的次数

    CharInformation()//构造函数
        :_c(0)
        ,_count(0)
        //_code不用初始化，会调用string类的默认构造函数初始化
    {}

    //运算符重载，让该自定义类也能像内置类型一样，进行运算
    //建哈夫曼树时，和非法值比较使用
    bool operator!=(const CharInformation& ch)const 
    {
        return _count!=ch._count;
    }

    //建小堆使用
    bool operator<(const CharInformation& ch)const
    {
        return _count<ch._count;
    }

    //构造父节点时使用
    CharInformation operator+(const CharInformation& ch)
    {
        CharInformation cf;
        cf._count=_count+ch._count; 
        return cf;
    }
};

//文件压缩与解压缩类
class FileCompression
{
public:
    //构造函数
    FileCompression()
    {
        //初始化字符表数组的每个字符信息的字符成员
        for (int i=0;i<256;++i)
        {
            _ArrCh[i]._c=i;
        }
    }
    //压缩函数
    //传入一个待压缩的原始文件名，返回一个压缩之后的文件名，方便解压缩函数传参
    void  Compression(string PrimaryFileName)
    {

          //1.打开源文件
          FILE* fout=fopen(PrimaryFileName.c_str(),"rb");//以只读的二进制方式打开文件，否则读汉字有问题
          if (fout==NULL)
          {
              perror("fopen");
              exit(1);
          }
          //2.统计256个字符出现的次数，并赋值给相应的数组的字符
          int ch=getc(fout);
          while (!feof(fout))
          {
              _ArrCh[ch]._count++;
              ch=getc(fout);
          }
          //3.根据字符的出现次数构建哈夫曼树，从而得到每个字符对应的哈夫曼编码
            //3.1根据字符信息构建哈夫曼树
          CharInformation invaild;//count=0相当于一个非法值，不用构建到huffman树中
          HuffmanTree<CharInformation>  huffmantree(_ArrCh,256,invaild);
            //3.2根据哈夫曼树得到对应的哈夫曼编码，并赋值正确的字符信息中
          string code;//用于递归记录字符的哈夫曼编码
          GetHuffmanCode(huffmantree.GetRoot(),code);

          //至此已经根据已经将源文件的全部信息（字符，字符的出现次数，字符对应的哈夫曼编码）
          //全部统计出来，放入成员变量字符表数组中，接下来就要操作数组，压缩文件

          //4.压缩文件---即将字符的全部哈夫曼编码以字节为单位放入压缩文件中
            //4.1创建压缩文件
            string CompressFileName=PrimaryFileName;
            size_t index= CompressFileName.find('.',0);
            CompressFileName=CompressFileName.substr(0,index);
            CompressFileName+=".compress";
            FILE* Input=fopen(CompressFileName.c_str(),"wb");//以只写防止打开二进制文件,文件不存在则创建新文件
            if (Input==NULL)
            {
                perror("fopen");
                exit(1);
            }
            //4.2将所有字符的哈夫曼编码以一字节为单位放入压缩文件中
            fseek(fout,0,SEEK_SET);//更正文原件流的读写位置到文件起始处
            ch=getc(fout);
            char c=0;//以一字节为单位放入压缩文件
            int size=0;//记录一个字节八位是否写满
            while (!feof(fout))
            {
                //处理一个字符的所有哈夫曼编码
                 for (size_t i=0;i<_ArrCh[ch]._code.size();i++)
                 {
                      c<<=1;
                      if (_ArrCh[ch]._code[i]=='1')
                      {
                          c|=1;//先把1放到最低位
                      }
                      //如果为'0'不用管；
                      size++;
                      //当一个字节八位装满，则写入压缩文件
                      if (size==8)
                      {
                          //写入压缩文件
                          if (fputc(c,Input)==EOF)
                          {
                              cout<<"压缩时哈夫曼编码写入失败"<<endl;
                          }
                          //重置
                          size=0;
                          c=0;
                      }
                 }
                 //一个字符处理完，读取下一个字符
                 ch=getc(fout);
            }
            //处理最后一个不满八位的字符，后面全加0
            if (size>0)
            {
                c<<=(8-size);
                fputc(c,Input);
            }
            fclose(fout);//关闭源文件
            fclose(Input);//关闭压缩文件
            //至此压缩源文件完毕

            //5.写配置文件---由于解压时，收件人没有原始文件，而解压压缩文件，要根据哈夫曼树
                            //恢复字符，形成文中信息；所以，在压缩函数中要写配置文件，记录
                            //哈夫曼树的相关信息，从而在解压缩文件时，可以根据此信息，重建
                            //哈夫曼树
             //配置文件格式：
             //a:5
             //b:3
              //5.1定义配置文件名
            string ConfigurationFile=PrimaryFileName;
            ConfigurationFile=ConfigurationFile.substr(0,index);
            ConfigurationFile+=". config";
            FILE* FinConfi=fopen(ConfigurationFile.c_str(),"wb");
              //5.2给配置文件写信息
            string line;//记录一个字符的全部信息的字符串
            for (int i=0;i<256;i++)
            {
                if (_ArrCh[i]._count!=0)
                {
                    line+=_ArrCh[i]._c;//字符
                    line+=',';//分隔符
                    char StringCount[25]={0};
                    _itoa(_ArrCh[i]._count,StringCount,10);//将次数转化为字符串
                    line+=StringCount;
                    line+='n';
                    if (fputs(line.c_str(),FinConfi)==EOF)
                    {
                        cout<<"配置文件写入失败"<<endl;
                    }
                    line.clear();//清除字符串信息，方便下一次统计
                }
            }

            fclose(FinConfi);//关闭配置文件
    }
    //解压缩函数
    //传入一个压缩文件名
    void UnCompression(string CompressFileName)
    {
        //1.通过配置信息构建哈夫曼树
           //1.1还原字符表数组
        string ConfigurationFile=CompressFileName;
        size_t index=ConfigurationFile.find('.',0);
        ConfigurationFile=ConfigurationFile.substr(0,index);
        ConfigurationFile+=". config";
        FILE* FinConfi=fopen(ConfigurationFile.c_str(),"rb");
        if (FinConfi==NULL)
        {
            perror("fopen");
            exit(1);
        }
        string line;//记录每行配置文件的信息
        while(ReadLine(FinConfi,line))
        {
            //该行为空
            if (line.empty())
            {
                line+='n';
                continue;
            }
            else
            {
                unsigned char ch=line[0];
                //使用string::substr(pos)函数提取字符出现的次数
                _ArrCh[ch]._count=atoi(line.substr(2).c_str());
                line.clear();
            }
        }

          //1.2构建哈夫曼树
        CharInformation invaild;
        HuffmanTree<CharInformation> ht(_ArrCh,256,invaild);

          //1.3解压缩文件--通过从根结点遍历哈夫曼树得到原始文件
        string UnCompressFileName=CompressFileName;
        UnCompressFileName=UnCompressFileName.substr(0,index);
        UnCompressFileName+=" .unCompress";
        FILE* Func=fopen(UnCompressFileName.c_str(),"wb");
        if (Func==NULL)
        {
            perror("fopen");
            exit(1);
        }
           //1.4读取压缩文件
        FILE* Fcom=fopen(CompressFileName.c_str(),"rb");
        if (Fcom==NULL)
        {
            perror("fopen");
            exit(1);
        }

        HuffmanTreeNode<CharInformation>* root=ht.GetRoot();
        LongType TotalCites=root->_weight._count;//原始文件所有字符的个数
        HuffmanTreeNode<CharInformation>* cur=root;//遍历二叉树
        int ch=fgetc(Fcom);
        int pos=7;
        while (TotalCites>0)
        {
             if (ch&(1<<pos))//右1
             {
                 cur=cur->_rightchild;
             }
             else//左0
             {
                 cur=cur->_leftchild;
             }

             if (cur->_leftchild==NULL&&cur->_rightchild==NULL)
             {
                  fputc(cur->_weight._c,Func);//将对应的字符写入解压缩文件中
                  cur=root;//回到根结点继续寻找下一个字符
                  --TotalCites;//总还原字符减少1个
                  if(TotalCites==0)
                  {
                       break;
                  }
             }
             pos--;
             //一个压缩字符每位处理完毕
             if (pos<0)
             {
                 ch=fgetc(Fcom);//从压缩文件中读下一个压缩字符
                 pos=7;
             }
        }
        fclose(FinConfi);
        fclose(Fcom);
        fclose(Func);
    }
protected:
    //获取对应字符的哈夫曼编码
    void GetHuffmanCode(HuffmanTreeNode<CharInformation>* root,string code)
    {
        if (root==NULL)
        {
            return;
        }
        //遍历到叶子节点，即找到了对应字符的哈夫曼编码
        if (root->_leftchild==NULL&&root->_rightchild==NULL)
        {
            _ArrCh[root->_weight._c]._code=code;
        }
        //左0，右1
        GetHuffmanCode(root->_leftchild,code+'0');
        GetHuffmanCode(root->_rightchild,code+'1');

    }
    //读取配置文件的一行
    bool ReadLine(FILE* FinConfi,string& line)
    {
        int ch=fgetc(FinConfi);
        if (feof(FinConfi))//如果读到文件末尾，返回假
        {
            return false;
        }
        while (!feof(FinConfi)&&ch!='n')
        {
            line+=ch;
            ch=fgetc(FinConfi);
        }
        //没有读到文件末尾
        return true;
    }
protected:
    CharInformation _ArrCh[256];//文件操作的对象，字符信息表
};

测试用例

#include "FileCompression.h"
#include<windows.h>
void test()
{

    FileCompression fc;

    cout<<"开始压缩"<<endl;
    cout<<"压缩用时";
    int start = GetTickCount();
    //第一组：文本文件
    //fc.Compression("123.txt");----✔

    //第二组：图片文件
    //fc.Compression("picture.png");//----✔

    //第三组：视频文件
    //fc.Compression("video.flv");
    //第四组：音频文件
    //fc.Compression("music.mp3");----✔

    //第五组：大文件字符串文章---压缩效果最好
    fc.Compression("yingyu.txt");

    int end=GetTickCount();
    cout<<"compress time:"<<(end-start)<<endl;

    cout<<"开始解压"<<endl;
    cout<<"解压用时:";
    start=GetTickCount();
    //第一组：文本文件
    //fc.UnCompression("123.compress");----✔

    //第二组：图片文件
    //fc.UnCompression("picture.compress");//----✔

    //第三组：视频文件
    //fc.UnCompression("video.compress");

    //第四组：音频文件
    //fc.UnCompression("music.compress");//----✔

    //第五组：大文件字符串文章---压缩效果最好
    fc.UnCompression("yingyu.compress");

    end=GetTickCount();
    cout<<"uncompress time:"<<(end-start)<<endl;

};