欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

数据压缩实验三

程序员文章站 2022-07-14 21:58:17
...

实验原理

一、熵

在信息论中,熵是信息的度量单位。信息论的创始人shannon在其著作《通信的数学理论》中提出了建立在概率统计模型上的信息度量。他把信息定义为用来消除不确定性的东西 一般用符号 表示,单位是比特。变量的不确定性越大,熵也就越大。换句话说,了解它所需要的信息量也就越大。 Huffman Coding (霍夫曼编码)是一种无失真编码的编码方式,Huffman 编码是可变字长编码(VLC)的一种。Huffman 编码基于信源的概率统计模型,它的基本思路是,出现概率大的信源符号编长码,出现概率小的信源符号编短码,从而使平均码长最小。在程序实现中常使用一种叫做树的数据结构实现 Huffman 编码,由它编出的码是即时码。 

二、 Huffman 编码

  Huffman Coding (霍夫曼编码)是一种无失真编码的编码方式,Huffman 编码是可变字长编码(VLC)的一种。 Huffman 编码基于信源的概率统计模型,它的基本思路是,出现概率大的信源符号编长码,出现概率小的信源符号编短码,从而使平均码长最小。在程序实现中常使用一种叫做树的数据结构实现 Huffman 编码,由它编出的码是即时码。 

实验流程

数据压缩实验三

1、统计符号的发生概率;

2、把频率按从小到大的顺序排列

3、每一次选出最小的两个值,作为二叉树的两个叶子节点,将和作为它们的根节点,这两个叶子节点不再参与比较,新的根节点参与比较;

4、 重复 3,直到最后得到和为 的根节点;

5、将形成的二叉树的左节点标 0,右节点标 1,把从最上面的根节点到最下面的叶子节点途中遇到的 0,序列串起来,就得到了各个符号的编码。

关键代码及其分析

huff code.c

//定义数据类型

//节点

typedef struct huffman_node_tag

{

unsigned char isLeaf;//判断是否是叶节点

unsigned long count;//出现次数

struct huffman_node_tag *parent;

union

{

struct

{

struct huffman_node_tag *zero, *one;//01

};

unsigned char symbol;

};

} huffman_node;

//码字数据类型

typedef struct huffman_code_tag

{

/* The length of this code in bits. */

unsigned long numbits;//长度

/* The bits that make up this code. The first

  bit is at position 0 in bits[0]. The second

  bit is at position 1 in bits[0]. The eighth

  bit is at position 7 in bits[0]. The ninth

  bit is at position 0 in bits[1]. */

unsigned char *bits;

} huffman_code;

static unsigned long

numbytes_from_numbits(unsigned long numbits)

{

return numbits / 8 + (numbits % 8 ? 1 : 0);//确定字节数

}

/*

* get_bit returns the ith bit in the bits array

* in the 0th position of the return value.

*/

static unsigned char

get_bit(unsigned char* bits, unsigned long i)

{

return (bits[i / 8] >> i % 8) & 1;

//i/8取整,i%8取余,表示第几字节第几位

}

//码字逆序

static void

reverse_bits(unsigned char* bits, unsigned long numbits)

{

unsigned long numbytes = numbytes_from_numbits(numbits);//码字占用字节数

unsigned char *tmp =

(unsigned char*)alloca(numbytes);//开辟空间

unsigned long curbit;

long curbyte = 0;

memset(tmp, 0, numbytes);

for (curbit = 0; curbit < numbits; ++curbit)

{

unsigned int bitpos = curbit % 8;

if (curbit > 0 && curbit % 8 == 0)//判断当前位在字节中的位数

++curbyte;//到下一字节

tmp[curbyte] |= (get_bit(bits, numbits - curbit - 1) << bitpos);//从后往前取每一位再移位

}

memcpy(bits, tmp, numbytes);

}



//新建码字

static huffman_code*

new_code(const huffman_node* leaf)

{

/* Build the huffman code by walking up to

* the root node and then reversing the bits,

* since the Huffman code is calculated by

* walking down the tree. */

unsigned long numbits = 0;//码长

unsigned char* bits = NULL;//存储码字的数组

huffman_code *p;

while (leaf && leaf->parent)//不是根节点

{

huffman_node *parent = leaf->parent;//得到码字位置和码字的字节数

unsigned char cur_bit = (unsigned char)(numbits % 8);

unsigned long cur_byte = numbits / 8;

/* If we need another byte to hold the code,

then allocate it. */

if (cur_bit == 0)

{

size_t newSize = cur_byte + 1;

bits = (char*)realloc(bits, newSize);//新建字节保存高位的码字

bits[newSize - 1] = 0;//新增字节初始化为0

}

if (leaf == parent->one)//是否是右节点

bits[cur_byte] |= 1 << cur_bit;//将当前字节设为1

++numbits;//码长++

leaf = parent;//将下一个节点挪到父节点

}

if (bits)

reverse_bits(bits, numbits);//码字逆序

p = (huffman_code*)malloc(sizeof(huffman_code));

p->numbits = numbits;

p->bits = bits;

return p;

}

#define MAX_SYMBOLS 256

typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];//信源符号数组

typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];//编码后符号数组

//新建结点

static huffman_node*

new_leaf_node(unsigned char symbol)

{

huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));//开辟空间

p->isLeaf = 1;

//为叶结点

p->symbol = symbol;

p->count = 0;

p->parent = 0;

return p;

}

//建立内部节点

static huffman_node*

new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)

{

huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));

p->isLeaf = 0;//不是叶节点

p->count = count;

p->zero = zero;

p->one = one;

p->parent = 0;

return p;

}

//统计信源符号概率

static unsigned int

get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)

{

int c;

unsigned int total_count = 0;

/* Set all frequencies to 0. */

init_frequencies(pSF);//初始化频率为0

while ((c = fgetc(in)) != EOF)//读取文件中每个信源符号

{

unsigned char uc = c;

if (!(*pSF)[uc])

(*pSF)[uc] = new_leaf_node(uc);//若没有此符号,建立新节点

++(*pSF)[uc]->count;//累计发生次数

++total_count;//累计信源发生次数

}

return total_count;

}

//将节点按出现概率从小到大排序,qsort函数中用到

static int

SFComp(const void *p1, const void *p2)

{

const huffman_node *hn1 = *(const huffman_node**)p1;

const huffman_node *hn2 = *(const huffman_node**)p2;

//将所有空节点排序

if (hn1 == NULL && hn2 == NULL) //两个节点都为空

return 0;//返回0

if (hn1 == NULL)//第一个节点为空,第二个节点大

return 1;//返回1

if (hn2 == NULL)//第二个节点空,第一个节点大

return -1;//返回-1

if (hn1->count > hn2->count)//两个节点都不为空,比较count值,1大于2

return 1;//返回1

else if (hn1->count < hn2->count)//1小于2

return -1;//返回-1

return 0;

}

/*

* build_symbol_encoder builds a SymbolEncoder by walking

* down to the leaves of the Huffman tree and then,

* for each leaf, determines its code.

*/

//生成码字

static void

build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)

{

if (subtree == NULL)//树为空则返回

return;

if (subtree->isLeaf)//叶节点进行编码

(*pSF)[subtree->symbol] = new_code(subtree);

else//不是叶节点先走左节点

{

build_symbol_encoder(subtree->zero, pSF);

build_symbol_encoder(subtree->one, pSF);

}

}

/*

* calculate_huffman_codes turns pSF into an array

* with a single entry that is the root of the

* huffman tree. The return value is a SymbolEncoder,

* which is an array of huffman codes index by symbol value.

*/

//建立Huffman

static SymbolEncoder*

calculate_huffman_codes(SymbolFrequencies * pSF)

{

unsigned int i = 0;

unsigned int n = 0;

huffman_node *m1 = NULL, *m2 = NULL;

SymbolEncoder *pSE = NULL;

#if 1

printf("BEFORE SORT\n");

print_freqs(pSF);

#endif

qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);//使用qshort函数将信源符号按出现频率大小排序,下标小的在前

#if 1

printf("AFTER SORT\n");

print_freqs(pSF);

#endif

for (n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)//统计种类总数,一个字节8bit,共256

;

/*

* Construct a Huffman tree. This code is based

* on the algorithm given in Managing Gigabytes

* by Ian Witten et al, 2nd edition, page 34.

* Note that this implementation uses a simple

* count instead of probability.

*/

for (i = 0; i < n - 1; ++i)

{

m1 = (*pSF)[0];

m2 = (*pSF)[1];//将出现次数最少的两个节点设为m1,m2

/* Replace m1 and m2 with a set {m1, m2} whose probability

* is the sum of that of m1 and m2. */

(*pSF)[0] = m1->parent = m2->parent =

new_nonleaf_node(m1->count + m2->count, m1, m2);

(*pSF)[1] = NULL;

qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);//重新排序

}

/* Build the SymbolEncoder array from the tree. */

pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));

memset(pSE, 0, sizeof(SymbolEncoder));

build_symbol_encoder((*pSF)[0], pSE);//从树根开始构建码字

return pSE;

}

/*写入码表*/

static int

write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)

{

unsigned long i, count = 0;

/* Determine the number of entries in se. */

for (i = 0; i < MAX_SYMBOLS; ++i)//统计码字种类

{

if ((*se)[i])

++count;

}

/* Write the number of entries in network byte order. */

i = htonl(count);    //在网络传输中,采用big-endian序,对于0x0A0B0C0D ,传输顺序就是0A 0B 0C 0D

//因此big-endian作为network byte orderlittle-endian作为host byte order

//little-endian的优势在于unsigned char/short/int/long类型转换时,存储位置无需改变

if (fwrite(&i, sizeof(i), 1, out) != 1)

return 1;

/* Write the number of bytes that will be encoded. */

symbol_count = htonl(symbol_count);

if (fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)

return 1;

/* Write the entries. */

for (i = 0; i < MAX_SYMBOLS; ++i)//写入码表

{

huffman_code *p = (*se)[i];

if (p)

{

unsigned int numbytes;

/* Write the 1 byte symbol. */

fputc((unsigned char)i, out);//写入字节符号

/* Write the 1 byte code bit length. */

fputc(p->numbits, out);//写入码长

/* Write the code bytes. */

numbytes = numbytes_from_numbits(p->numbits);//写入码字

if (fwrite(p->bits, 1, numbytes, out) != numbytes)

return 1;

}

}

return 0;

}

//对文件解码,读取码表

static huffman_node*

read_code_table(FILE* in, unsigned int *pDataBytes)

{

huffman_node *root = new_nonleaf_node(0, NULL, NULL);

unsigned int count;

/* Read the number of entries.

(it is stored in network byte order). */

if (fread(&count, sizeof(count), 1, in) != 1)//读取码表中的符号数

{

free_huffman_tree(root);

return NULL;

}

count = ntohl(count);

/* Read the number of data bytes this encoding represents. */

if (fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1)

{

free_huffman_tree(root);

return NULL;

}

*pDataBytes = ntohl(*pDataBytes);

/* Read the entries. */

while (count-- > 0)//读取码表

{

int c;

unsigned int curbit;

unsigned char symbol;

unsigned char numbits;

unsigned char numbytes;

unsigned char *bytes;

huffman_node *p = root;

if ((c = fgetc(in)) == EOF)//读入

{

free_huffman_tree(root);

return NULL;

}

symbol = (unsigned char)c;

if ((c = fgetc(in)) == EOF)

{

free_huffman_tree(root);

return NULL;

}

numbits = (unsigned char)c;

numbytes = (unsigned char)numbytes_from_numbits(numbits);//计算存储一个码长需要多少字节

bytes = (unsigned char*)malloc(numbytes);//开辟空间

if (fread(bytes, 1, numbytes, in) != numbytes)//读取码字

{

free(bytes);

free_huffman_tree(root);

return NULL;

}

/*

* Add the entry to the Huffman tree. The value

* of the current bit is used switch between

* zero and one child nodes in the tree. New nodes

* are added as needed in the tree.

*/

for (curbit = 0; curbit < numbits; ++curbit)

{

if (get_bit(bytes, curbit))//1建右节点

{

if (p->one == NULL)

{

p->one = curbit == (unsigned char)(numbits - 1)

? new_leaf_node(symbol)

: new_nonleaf_node(0, NULL, NULL);

p->one->parent = p;

//如果是最后一位,则建立树叶节点,不是则建立非叶节点

}

p = p->one;

}

else//建立左节点

{

if (p->zero == NULL)

{

p->zero = curbit == (unsigned char)(numbits - 1)

? new_leaf_node(symbol)

: new_nonleaf_node(0, NULL, NULL);

p->zero->parent = p;

}

p = p->zero;

}

}

free(bytes);

}

return root;//返回根节点

}

//解码

int

huffman_decode_file(FILE *in, FILE *out)

{

huffman_node *root, *p;

int c;

unsigned int data_count;

root = read_code_table(in, &data_count);//读取码表

if (!root)

return 1;

p = root;

while (data_count > 0 && (c = fgetc(in)) != EOF)

{

unsigned char byte = (unsigned char)c;

unsigned char mask = 1;//逐位读码字

while (data_count > 0 && mask)

{

p = byte & mask ? p->one : p->zero;

mask <<= 1;//左移1

if (p->isLeaf)

{

fputc(p->symbol, out);//输出叶节点存储符号

p = root;//转到根节点

--data_count;//没解码符号数-——

}

}

}

free_huffman_tree(root);

return 0;

//huffman.c

main(int argc, char** argv) { char memory = 0;//1对内存数据进行操作,0不操作 char compress = 1;//1解码,0编码 int opt; const char *file_in = NULL, *file_out = NULL; //step1:add by yzhang for huffman statistics const char *file_out_table = NULL; //end by yzhang FILE *in = stdin; FILE *out = stdout; //step1:add by yzhang for huffman statistics FILE * outTable = NULL; //end by yzhang /* Get the command line arguments. */ while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1) //读取命令行参数 { switch(opt) { case 'i': file_in = optarg;//输入文件 break; case 'o': file_out = optarg;//输出文件 break; case 'c': compress = 1;//编码 break; case 'd': compress = 0;//解码 break; case 'h': usage(stdout);//输出参数用法的说明 return 0; case 'v': version(stdout);//输出版本号的信息 return 0; case 'm': memory = 1;//对内存数据进行操作 break; // by yzhang for huffman statistics case 't': file_out_table = optarg;//输出中间数据信息 break; //end by yzhang default: usage(stderr); return 1; } } if(file_in)//输入文件 { in = fopen(file_in, "rb"); if(!in) { fprintf(stderr, "Can't open input file '%s': %s\n", file_in, strerror(errno)); return 1; } } /* If an output file is given then create it. */ if(file_out)//创建输出文件 { out = fopen(file_out, "wb"); if(!out) { fprintf(stderr, "Can't open output file '%s': %s\n", file_out, strerror(errno)); return 1; } } 。。。

}


实验结果

数据压缩实验三

数据压缩实验三

观察可得平均码长和熵的值很接近,与此同时如果文件过小,则很有可能出现huffman编码之后文件变大的情况,由于还有码表的存在,很占空间。