「哈夫曼编码」基于哈夫曼树的编码和译码

哈夫曼编码

1.实验要求：

利用哈夫曼编码:要传输一些数据(比如英文单词), 设计一个利用哈夫曼算法的编码系统, 为这些单词编码, 并在接收端进行译码.

基本要求:

(1).将需要传输的数据存放在数据文件data.txt 中.

(2).读入数据文件并为其编码, 将编码后的内容存入文件code.txt中.

(3).读入code.txt, 译码, 并将译码后的内容输出在屏幕上.

2.基本思路：

(1).编码: 统计数据文件 date.txt (即实验要求中的 data.txt; 注意下面的代码实际打开的文件名是 date.txt) 中不同种类的字符的数目, 以及同一字符出现的次数, 以该次数作为字符的权值. 根据这些权值建立哈夫曼树, 并且为每个字符编码.

遍历 date.txt 中的字符, 为每个字符匹配编码, 保存所有匹配得到的编码, 最后存入code.txt中.

(2).译码:从 code.txt 读取编码, 根据每段编码匹配对应的字符, 储存所有匹配到的字符, 并输出.

3.代码和注释如下:

#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<sys/stat.h>
#include<sys/types.h>
#include<fcntl.h>
#include<unistd.h>
#include<errno.h>

/* Capacity constant: the Huffman node array holds 2*N nodes, the symbol
   table holds N entries, and the text buffers in main are N bytes each. */
#define    N          10000

int count = 0;        // Incremented each time a new character is recorded in the symbol table; it is the number of distinct characters and therefore the number of leaves of the Huffman tree.

/* Huffman tree node. The HuffmanTree type is an array of 2*N such nodes;
   nodes are stored starting at index 1, so index 0 doubles as "no node". */
typedef struct HuffmanTree{
  int weight;        // leaf: weight of its character; internal node: sum of its two children's weights
  int parent;        // index of the parent node, 0 while the node has no parent
  int Lchild;        // index of the left child, 0 for a leaf
  int Rchild;        // index of the right child, 0 for a leaf
}HuffmanTree[2*N];

/* Symbol table entry: one distinct character, how often it occurs, and its
   Huffman code. The difcha type is an array of N such entries. */
typedef struct DifferentCharacter{
  char char_date;    // the character itself
  int num;           // number of times this character occurs
  char a_code[100];  // Huffman code of this character, stored as a NUL-terminated string of '0'/'1'
}difcha[N];

/*
 * Pick the two lowest-weight nodes that have no parent yet (parent == 0)
 * among ht[1]..ht[j].
 *
 *   ht : Huffman node array; index 0 is never examined.
 *   j  : last index to examine.
 *   s1 : receives the index of the lightest parentless node.
 *   s2 : receives the index of the second lightest parentless node.
 *
 * When several candidates share a weight, the one with the smaller index is
 * preferred. If fewer than two parentless nodes exist in the range, the
 * missing result(s) are set to 0.
 *
 * The indices are always kept in step with the weights they refer to, and
 * no element beyond ht[j] is ever read.
 */
void select_two(HuffmanTree ht, int j, int *s1, int *s2) {
  int lightest = 0;   /* 0 means "none found yet" */
  int runner_up = 0;
  int i;

  for (i = 1; i <= j; i++) {
    if (ht[i].parent != 0)
      continue;                          /* already merged into a subtree */

    if (lightest == 0 || ht[i].weight < ht[lightest].weight) {
      runner_up = lightest;              /* old minimum becomes second best */
      lightest = i;
    } else if (runner_up == 0 || ht[i].weight < ht[runner_up].weight) {
      runner_up = i;
    }
  }

  *s1 = lightest;
  *s2 = runner_up;
}
         
/*
 * Build a Huffman tree in ht.
 *
 *   ht : node array to fill; slots 1..n become the leaves and slots
 *        n+1..2n-1 the internal nodes.
 *   w  : leaf weights, read from w[1]..w[n].
 *   n  : number of leaves.
 *
 * Each internal node joins the two nodes chosen by select_two() from the
 * slots in front of it; the last slot filled (2n-1) ends up as the root.
 */
void EstHuffmanTree(HuffmanTree ht, int *w, int n){
  const int last = 2*n - 1;          /* n leaves plus n-1 internal nodes */
  int node;
  int left = 0, right = 0;

  /* Reset every slot that will be used: leaves take their weight from w,
     internal nodes start at weight 0; all links start out empty. */
  for(node = 1; node <= last; node++){
    ht[node].weight = (node <= n) ? w[node] : 0;
    ht[node].parent = 0;
    ht[node].Lchild = 0;
    ht[node].Rchild = 0;
  }

  /* Create the internal nodes one at a time, each from the two nodes
     selected among ht[1]..ht[node-1]. */
  node = n + 1;
  while(node <= last){
    select_two(ht, node - 1, &left, &right);

    ht[node].weight = ht[left].weight + ht[right].weight;
    ht[node].Lchild = left;
    ht[node].Rchild = right;
    ht[left].parent = node;
    ht[right].parent = node;

    node++;
  }
}

/*
 * Derive the Huffman code of every leaf by walking from the leaf up to the
 * root and writing the branch labels right-to-left into a scratch buffer.
 *
 *   ht  : Huffman tree whose leaves are ht[1]..ht[n].
 *   hcd : output; hcd[i] (1 <= i <= n) receives a newly malloc'ed,
 *         NUL-terminated string of '0' (left branch) and '1' (right branch).
 *         The caller owns these strings and should free() them.
 *   n   : number of leaves; nothing is done when n <= 0.
 *
 * A leaf without a parent (which is what a tree with a single leaf looks
 * like) gets the one-character code "0" instead of an empty string, so that
 * its encoding remains decodable.
 *
 * Terminates the program with exit(1) if memory cannot be allocated.
 */
void CrtHuffmanCode(HuffmanTree ht, char **hcd, int n){
  int start, c, p, i;
  char *cd;

  if(n <= 0)
    return;

  /* n characters of code (enough for the deepest leaf and for the lone-leaf
     "0") plus the terminator. */
  cd = malloc((size_t)n + 1);
  if(cd == NULL){
    perror("malloc");
    exit(1);
  }
  cd[n] = '\0';

  for(i = 1; i <= n; i++){
    start = n;                        /* code is built backwards from the end */
    c = i;
    p = ht[i].parent;
    while(p != 0 && start > 0){       /* start > 0 guards against a malformed tree */
      cd[--start] = (ht[p].Lchild == c) ? '0' : '1';
      c = p;                          /* climb one level */
      p = ht[c].parent;
    }

    if(start == n)                    /* parentless leaf: give it a 1-bit code */
      cd[--start] = '0';

    hcd[i] = malloc((size_t)(n - start) + 1);
    if(hcd[i] == NULL){
      perror("malloc");
      exit(1);
    }
    strcpy(hcd[i], &cd[start]);
  }

  free(cd);
}

/*
 * Report a failed system call and terminate the program.
 *
 *   err_string : prefix handed to perror(), typically the name of the call
 *                or file that failed.
 *   line       : source line of the failure (callers pass __LINE__).
 *
 * Prints "Line <line>:" followed by the perror() description of errno to
 * stderr, then calls exit(1). Does not return.
 */
void my_err(char *err_string, int line){
  int saved_errno = errno;            /* fprintf may overwrite errno */

  fprintf(stderr, "Line %d:\n", line);
  errno = saved_errno;
  perror(err_string);
  exit(1);
}

/*
 * Count how often each character occurs in buf_read; the counts later serve
 * as the Huffman weights.
 *
 *   a        : symbol table to fill; entries a[0]..a[count-1] are the
 *              characters seen so far.
 *   buf_read : NUL-terminated text to scan.
 *
 * The final byte of the text is deliberately not counted (presumably the
 * trailing newline of the data file - confirm against the input format).
 * An empty or one-byte text therefore records nothing.
 *
 * Updates the global count for every new character. A char can take at most
 * 256 distinct values, so the table never holds more entries than that.
 */
void Statistics(difcha a, char *buf_read){
  size_t len = strlen(buf_read);      /* computed once, not per iteration */
  size_t i;
  int j;

  /* "i + 1 < len" skips the last byte without underflowing when len == 0. */
  for(i = 0; i + 1 < len; i++){
    for(j = 0; j < count; j++){       /* already known? */
      if(a[j].char_date == buf_read[i]){
        a[j].num++;
        break;
      }
    }
    if(j == count){                   /* first occurrence: start a new entry */
      a[count].char_date = buf_read[i];
      a[count].num = 1;               /* do not rely on the caller having zeroed the table */
      count++;
    }
  }
}

/*
 * Read the whole file pathName into buf_read and NUL-terminate it.
 *
 *   pathName : file to read.
 *   buf_read : destination buffer. It must have room for the file's size
 *              plus one byte; the size of the buffer is not known here, so
 *              the caller is responsible for that.
 *
 * The file is opened read-only, its length is taken with lseek(), and
 * read() is repeated until that many bytes have arrived or end-of-file is
 * reached. Any failing call is reported through my_err(), which terminates
 * the program.
 */
void ReadFile(char *pathName, char *buf_read){
  int fd_date;
  off_t len;
  off_t total = 0;
  ssize_t got;

  if( (fd_date = open(pathName, O_RDONLY)) < 0)
    my_err(pathName, __LINE__);       /* name the file that actually failed */

  if( (len = lseek(fd_date, 0, SEEK_END)) < 0)    /* file length */
    my_err("lseek", __LINE__);
  if(lseek(fd_date, 0, SEEK_SET) < 0)             /* back to the start */
    my_err("lseek", __LINE__);

  while(total < len){
    got = read(fd_date, buf_read + total, (size_t)(len - total));
    if(got < 0){
      if(errno == EINTR)
        continue;                     /* interrupted before any data: retry */
      my_err("read date.txt", __LINE__);
    }
    if(got == 0)
      break;                          /* file shrank after it was measured */
    total += got;
  }
  buf_read[total] = '\0';             /* keeps stale bytes of a reused buffer out of the string */

  if(close(fd_date) < 0)
    my_err("close", __LINE__);
}
 
/*
 * Write the NUL-terminated string buf_code to the file pathName.
 *
 *   pathName : file to create; an existing file is truncated.
 *   buf_code : text to write (the terminating NUL is not written).
 *
 * A new file is created readable and writable by the owner only. write()
 * is repeated until the whole string has been written, and the descriptor
 * is closed afterwards. Any failing call is reported through my_err(),
 * which terminates the program.
 */
void WriteFile(char *pathName, char *buf_code){
  const char *cursor = buf_code;
  size_t remaining = strlen(buf_code);
  ssize_t put;
  int fd_code;

  if((fd_code = open(pathName, O_CREAT|O_TRUNC|O_WRONLY, S_IRUSR|S_IWUSR)) < 0)
    my_err("open code.txt", __LINE__);

  while(remaining > 0){
    put = write(fd_code, cursor, remaining);
    if(put < 0){
      if(errno == EINTR)
        continue;                     /* interrupted before any data: retry */
      my_err("write code.txt", __LINE__);
    }
    cursor += put;                    /* a short write is continued, not an error */
    remaining -= (size_t)put;
  }

  if(close(fd_code) < 0)
    my_err("close code.txt", __LINE__);
}

/*
 * Program entry point.
 *
 *   1. Read date.txt and count its characters.
 *   2. Build the Huffman tree and the code of every character.
 *   3. Encode the text (all bytes except the last one, matching what
 *      Statistics() counts) and store the result in code.txt.
 *   4. Read code.txt back, decode it and print the decoded text.
 *
 * Returns 0 on success and 1 when the input is empty, too large for the
 * fixed-size buffers, or cannot be decoded. I/O failures terminate the
 * program inside ReadFile()/WriteFile().
 */
int main(void){
  /* The large tables are static: that keeps roughly 1.5 MB off the stack
     and guarantees they start out zeroed. */
  static char buf_read[N];
  static char buf_code[N];
  static char buf_yima[N];
  static char *hcd[N];
  static difcha a;
  static int w[N];
  static HuffmanTree ht;
  char temp[sizeof a[0].a_code] = {'\0'};  /* code being matched; as large as the longest storable code */
  struct stat st;
  size_t text_len, code_len = 0, piece, n, i;
  size_t k = 0, m = 0;
  int j;

  /* ReadFile() cannot know the buffer size, so refuse oversized input here.
     If stat() itself fails, ReadFile() reports the problem when opening. */
  if(stat("date.txt", &st) == 0 && st.st_size >= (off_t)sizeof buf_read){
    fprintf(stderr, "date.txt is too large (limit: %d bytes)\n", N - 1);
    return 1;
  }

  ReadFile("date.txt", buf_read);
  Statistics(a, buf_read);
  if(count == 0){
    fprintf(stderr, "date.txt contains nothing to encode (its final byte is not encoded)\n");
    return 1;
  }

  for(j = 0; j < count; j++)                /* weights are 1-based, like the tree */
    w[j+1] = a[j].num;
  EstHuffmanTree(ht, w, count);
  CrtHuffmanCode(ht, hcd, count);

  for(j = 1; j <= count; j++){              /* copy each code into the symbol table */
    const char *code = hcd[j];

    if(code[0] == '\0')                     /* lone symbol: an empty code could not be decoded */
      code = "0";
    if(strlen(code) >= sizeof a[j-1].a_code){
      fprintf(stderr, "Huffman code of '%c' is too long\n", a[j-1].char_date);
      return 1;
    }
    strcpy(a[j-1].a_code, code);
    free(hcd[j]);
    hcd[j] = NULL;
  }

  /* Encode: append the code of every character, tracking the write position
     so the output can neither overflow nor be rescanned on each append. */
  text_len = strlen(buf_read);
  for(i = 0; i + 1 < text_len; i++){
    for(j = 0; j < count; j++){
      if(buf_read[i] == a[j].char_date)
        break;
    }
    if(j == count){                         /* should not happen: every byte was counted */
      printf("Unknown Character: %c\n", buf_read[i]);
      continue;
    }
    piece = strlen(a[j].a_code);
    if(code_len + piece >= sizeof buf_code){
      fprintf(stderr, "encoded text does not fit in %d bytes\n", N - 1);
      return 1;
    }
    memcpy(buf_code + code_len, a[j].a_code, piece);
    code_len += piece;
  }
  buf_code[code_len] = '\0';

  WriteFile("code.txt", buf_code);

  memset(buf_read, 0, sizeof buf_read);     /* drop the plain text before reusing the buffer */
  ReadFile("code.txt", buf_read);

  /* Decode: Huffman codes are prefix-free, so the first code that equals the
     collected bits is the right one. Each code has at least one bit, so the
     decoded text is never longer than the code text and fits in buf_yima. */
  n = strlen(buf_read);
  for(i = 0; i < n; i++){
    if(k + 1 >= sizeof temp){
      fprintf(stderr, "code.txt contains a bit sequence that matches no code\n");
      return 1;
    }
    temp[k++] = buf_read[i];
    temp[k] = '\0';
    for(j = 0; j < count; j++){
      if(strcmp(temp, a[j].a_code) == 0)
        break;
    }
    if(j < count){                          /* matched: emit the character, start a new code */
      buf_yima[m++] = a[j].char_date;
      k = 0;
      temp[0] = '\0';
    }
  }
  buf_yima[m] = '\0';

  printf("The result of decoding is:\n\n%s\n", buf_yima);
  return 0;
}