LZ77 coding
1、LZ77算法介绍
LZ77和LZ78都是自适应词典编码技术,两者都源于J. Ziv 和A. Lempel在1977年和1988年发表的两篇里程碑式论文,这两篇论文使用了不同的方法,用于自适应地构建词典,每种方法都衍生出很多变体。研究人员将基于1977年论文方法称为LZ77(也称LZ1)系类,基于1978年论文方法称为LZ78(也称LZ2).
1.1、LZ77算法原理(滑动窗口)
在LZ77算法中,词典就是在先前已经编码序列的一部分。编码器通过一个滑动窗口来查看输入序列,滑动窗口由两部分组成,查找缓冲区(search buffer)和先行缓冲区(look-ahead buffer)如下图所示。在竖杠左侧的是查找缓冲区,在竖杠右侧的是先行缓冲区。编码器在搜索缓冲区中搜索与先行缓冲区中匹配的字符,输出三元组。LZ77算法做出了一种隐含假设:相似的模式会在一起聚集出现。该方法一最近已经编码的数据作为词典,从而利用上述假设。但这种模式存在一个问题,只要其重复周期长于编码器窗口的覆盖长度,就不会被捕捉到。最糟糕的情景是:待编码的序列是周期序列,但其周期长于查找缓冲区。
1.2、编码器工作步骤
- 1 编码器从右往左搜索待搜索区域,寻找和已编码区域第一个符号e相同的符号,找到了easily中的e。
- 2 从第一个匹配字符e向后(右)尽可能寻找连续的匹配字符,得到匹配字符串eas,匹配长度为3。
- 3 编码器继续往左搜索,重复步骤1、2.
- 4 使用匹配长度最长的三元组来表示,如果没有匹配的数据使用(0, 0, 未匹配的字符)表示。
1.2.1、Example
首先,搜索缓冲区是空的,没有匹配的数据,将(0,0,“s”)输出,符号“s”进入搜索缓冲区。重复上述步骤,“ sir ”(包含空格)进入搜索缓冲区。此时先行缓冲区中左侧第一个字符是“s”,找到搜索缓冲区中的“s”字符和其相匹配,继续查看下一个字符“i”,同样和搜索缓冲区中的字符“s”中的下一个字符“i”相匹配。重复同样的步骤,下一个字符不匹配。输出(4,2,“d”),其中,4是代表在搜索缓冲区的距离,重右往左计数,符号“s”是第四个符号,2代表匹配字符串“si”的长度,“d”代表先行缓冲区中下一个待匹配的字符。
1.2.2、注意事项
- 1、刚开始编码时,搜索缓冲区是空的,此时没有字典可查询;
- 2、在没有匹配字符时,三元组中的距离和匹配长度都为0;
- 3、距离从1开始,而不是0.
- 4、匹配字符串在先行字符串中有延申。
- 5、如果在搜索缓冲区中存在多个长度相等的匹配字符,选择距离最长的一个。原因:距离最长的是最后查找到的,不增加算法复杂度,不回头查找距离短的。如果长度使用变长编码,较小数字可以用较短码表示。
1.3、解码器工作
解码器工作相对编码器工作较简单,根据距离和长度,将搜索字符串中的数据输出。如果距离和长度为0,将第三个元素输出。
1.4、测试demo
/*
* LZ77压缩算法测试代码,完成文件的压缩并保存,然后解压缩恢复新的文件,
* 对比原始文件和压缩解压后的文件的差异,计算压缩比。
*/
#include <stdio.h>
#include <malloc.h>
#include <mem.h>
#define SEARCH_BUFFER_SIZE 10
#define LOOKAHEAD_BUFFER_SIZE 5
#define BUFFER_SIZE (SEARCH_BUFFER_SIZE+LOOKAHEAD_BUFFER_SIZE)
int compressor(FILE *ifp, FILE *ofp);
int decompressor(FILE *ifp, FILE *ofp);
int main() {
printf("Hello, World!\n");
printf("LZ77 compressor demo test by lab601_ch.\n");
FILE *ifp=NULL, *ofp=NULL, *dofp=NULL;
ifp = fopen("../21.data", "rb");
if(ifp == NULL){
printf("ERROR, cannot open file!!!");
return 1;
}
ofp = fopen("../21.2.compressed.data", "wb");
if(ofp == NULL){
printf("ERROR, cannot open compressed file!!!");
return 1;
}
compressor(ifp, ofp);
fclose(ifp);
fclose(ofp);
/*解压缩*/
ofp = fopen("../21.2.compressed.data", "rb");
if(ofp == NULL){
printf("ERROR, cannot open compressed file!!!");
return 1;
}
dofp = fopen("../21.2.decompressed.data", "wb");
if(dofp == NULL){
printf("ERROR, cannot open decompressed output file!!!");
return 1;
}
printf("------open file is ok!\n");
decompressor(ofp, dofp);
fclose(ofp);
fclose(dofp);
return 0;
}
/*
* 查找搜索缓冲区指定区域和先行缓冲区匹配长度
* input: 缓冲区制定区域首尾地址
* output: 匹配长度
*/
static int findMatchLength(const char* buffer, char *searchStart, char *searchEnd, char *lookaheadStart, char *lookaheadEnd)
{
/*搜索缓冲区或先行缓冲区长度为0,返回-1*/
if(searchStart == searchEnd){
printf("searchStart == searchEnd\n");
return -1;
}
if(lookaheadStart == lookaheadEnd) {
printf("lookaheadStart == lookaheadEnd\n");
return -1;
}
int length = 0;
/*待匹配数据位置指针*/
char *searchdata = searchStart;
char *lookdata = lookaheadStart;
while(1){
/*数据匹配,先行缓冲区仍有数据*/
if(*searchdata == *lookdata && lookdata!=lookaheadEnd){
/*循环队列,边界溢出,循环回到首地址*/
if(searchdata-buffer == BUFFER_SIZE)
searchdata = buffer;
else
searchdata++;
if(lookdata-buffer == BUFFER_SIZE)
lookdata = buffer;
else
lookdata++;
/*匹配长度计数*/
length++;
}else
break;
}
return length;
}
/*
* 查找缓冲区最大匹配字符串
* input: 缓冲区首尾地址
* output: 匹配长度,偏移
*/
static int findMaxMatchLength(const char* buffer, char *searchStart, char *searchEnd, char *lookaheadStart, char *lookaheadEnd, int *offset)
{
int length = 0;
*offset=0;
int searchSize = 0;
if(searchStart <= searchEnd)
searchSize = searchEnd - searchStart;
else
searchSize = searchEnd - searchStart + BUFFER_SIZE + 1;
/*搜索缓冲区空, 返回-1,出错*/
if(searchSize == 0)
return -1;
for(int i=0; i<searchSize; i++){
int len=0;
if((searchStart+i) > (buffer+BUFFER_SIZE))
len = findMatchLength(buffer, searchStart+i-(BUFFER_SIZE+1), searchEnd, lookaheadStart, lookaheadEnd);
else
len = findMatchLength(buffer, searchStart+i, searchEnd, lookaheadStart, lookaheadEnd);
if(len == -1) {
printf("len is -1, continue!\n");
continue;
}
if(length < len){
length = len;
*offset = searchSize-i;
}
}
return length;
}
int compressor(FILE *ifp, FILE *ofp)
{
/*100字节为搜索缓冲区,50字节为先行缓冲区,使用循环队列的方法模拟滑动窗口(Slide Window)*/
const char *buffer = malloc(BUFFER_SIZE+1);
/*初始化滑动窗口*/
memset(buffer, 0, BUFFER_SIZE+1);
/*搜索缓冲区,先行缓冲区,头尾地址指针*/
char *searchBufferStart = buffer;
char *searchBufferEnd = buffer;
char *lookaheadBufferStart = buffer;
char *lookaheadBufferEnd = buffer;
/*先读取数据,将先行缓冲区填满*/
int c=0;
for(int i=0; i<LOOKAHEAD_BUFFER_SIZE; i++){
c=getc(ifp);
if(c == EOF) {
printf("WARNING! data which will be compressed is too small.\n");
return -1;
}
*lookaheadBufferEnd = (char)c;
lookaheadBufferEnd++;
}
printf("------lookahead buffer data pre load is ok.\n");
/*搜索缓冲区一开始为空,对先行缓冲区第一个字符编码*/
putc(0, ofp);
putc(0, ofp);
putc(*lookaheadBufferStart, ofp);
/*从源文件读取一个字符,准备跟新滑动窗口*/
c = getc(ifp);
/*如果到文件末尾,break,不再读取文件,将先行缓冲区压缩完就结束*/
if (c == EOF)
goto final;
/*跟新搜索缓冲区指针,尾指针向后移动一个字节*/
searchBufferEnd++;
/*更新先行缓冲区指针,先行缓冲区头指针和搜索缓冲区尾指针指向同一地址
往先行缓冲区尾部放入新元素*/
lookaheadBufferStart = searchBufferEnd;
*lookaheadBufferEnd = (char) c;
lookaheadBufferEnd++;
printf("------searchBuffer size is 0, direct output.\n");
/*查找最长匹配字符串,更新数据*/
while(1)
{
/*获取搜索缓冲区长度*/
/*不同搜索缓冲区长度,操作不同,一开始,搜索缓冲区未满阶段,和搜索缓冲区已满阶段操作不同。*/
int size=0;
if(searchBufferEnd >= searchBufferStart)
size = searchBufferEnd - searchBufferStart;
else
size = searchBufferEnd + (BUFFER_SIZE+1) - searchBufferStart;
if(size == 0) {
free(buffer);
printf("ERROR, search buffer size is 0!!!");
return -1;
}
/*搜索缓冲区未满*/
if(SEARCH_BUFFER_SIZE > size) {
int offset=0;
int matchSize = findMaxMatchLength(buffer, searchBufferStart, searchBufferEnd, lookaheadBufferStart, lookaheadBufferEnd, &offset);
/*no match data*/
if(0 == matchSize) {
putc(0, ofp);
putc(0, ofp);
putc(*lookaheadBufferStart, ofp);
/*跟新搜索缓冲区指针,尾指针向后移动一个字节*/
searchBufferEnd++;
/*更新先行缓冲区指针,先行缓冲区头指针和搜索缓冲区尾指针指向同一地址*/
lookaheadBufferStart = searchBufferEnd;
/*从源文件读取一个字符,准备跟新滑动窗口*/
c = getc(ifp);
/*如果到文件末尾,break,不再读取文件,将先行缓冲区压缩完就结束*/
if (c == EOF)
goto final;
/*往先行缓冲区尾部放入新元素*/
*lookaheadBufferEnd = (char) c;
lookaheadBufferEnd++;
printf("------match size is 0, direct output.\n");
}else{
/*需要考虑搜索缓冲区头指针*/
if(lookaheadBufferEnd+matchSize > buffer+BUFFER_SIZE){
/*have match data.*/
/*update buffer point, careful about point over*/
/*look-ahead buffer point*/
lookaheadBufferStart = lookaheadBufferStart + matchSize;
/*encoder output*/
putc(offset, ofp);
putc(matchSize, ofp);
putc(*(lookaheadBufferStart), ofp);
searchBufferEnd = lookaheadBufferStart;
for (int i = 0; i < matchSize; i++) {
c = getc(ifp);
if (c == EOF) {
/*输入结束,只需将buffer内的压缩完即可*/
printf("------read all data, goto final!\n");
goto final;
}
*lookaheadBufferEnd = (char) c;
if (lookaheadBufferEnd - buffer >= BUFFER_SIZE)
lookaheadBufferEnd = buffer;
else
lookaheadBufferEnd++;
}
/*update search buffer start point*/
searchBufferStart = lookaheadBufferEnd+1;
}else{
/*不需要考虑搜索缓冲区头指针*/
/*have match data.*/
/*update buffer point, careful about point over*/
/*look-ahead buffer point*/
lookaheadBufferStart = lookaheadBufferStart + matchSize;
/*encoder output*/
putc(offset, ofp);
putc(matchSize, ofp);
putc(*(lookaheadBufferStart), ofp);
searchBufferEnd = lookaheadBufferStart;
for (int i = 0; i < matchSize; i++) {
c = getc(ifp);
if (c == EOF) {
/*输入结束,只需将buffer内的压缩完即可*/
printf("------read all data, goto final!\n");
goto final;
}
*lookaheadBufferEnd = (char) c;
lookaheadBufferEnd++;
}
}
}
}else{
/*搜索缓冲区已满*/
/*get length, offset*/
int offset= 0;
int matchSize = findMaxMatchLength(buffer, searchBufferStart, searchBufferEnd, lookaheadBufferStart, lookaheadBufferEnd, &offset);
/*存在匹配和不存在匹配数据,操作不一样*/
/*No match data.*/
if(0 == matchSize){
/*encoder output*/
/*No match data, LZ77 token has three parts, offset, length, and with the unmatched symbol*/
putc(0, ofp);
putc(0, ofp);
putc(*(lookaheadBufferStart), ofp);
//printf("------output is %d.\n", *(lookaheadBufferStart));
/*update buffer point, careful about point over*/
if (searchBufferStart - buffer >= BUFFER_SIZE)
searchBufferStart = buffer;
else
searchBufferStart = searchBufferStart + 1;
/*look-ahead buffer point*/
if (lookaheadBufferStart - buffer >= BUFFER_SIZE)
lookaheadBufferStart = buffer;
else
lookaheadBufferStart = lookaheadBufferStart + 1;
searchBufferEnd = lookaheadBufferStart;
c = getc(ifp);
if (c == EOF) {
/*输入结束,只需将buffer内的压缩完即可*/
printf("------0.read all data, goto final!\n");
goto final;
}
*lookaheadBufferEnd = (char)c;
if(lookaheadBufferEnd - buffer >= BUFFER_SIZE)
lookaheadBufferEnd = buffer;
else
lookaheadBufferEnd++;
}else{
/*have match data.*/
/*update buffer point, careful about point over*/
if (searchBufferStart + matchSize > buffer + BUFFER_SIZE)
searchBufferStart = searchBufferStart + matchSize - BUFFER_SIZE - 1;
else
searchBufferStart = searchBufferStart + matchSize;
/*look-ahead buffer point*/
if (lookaheadBufferStart + matchSize > buffer + BUFFER_SIZE)
lookaheadBufferStart = lookaheadBufferStart + matchSize - BUFFER_SIZE - 1;
else
lookaheadBufferStart = lookaheadBufferStart + matchSize;
/*encoder output*/
putc(offset, ofp);
putc(matchSize, ofp);
putc(*(lookaheadBufferStart), ofp);
searchBufferEnd = lookaheadBufferStart;
for (int i = 0; i < matchSize; i++) {
c = getc(ifp);
if (c == EOF) {
/*输入结束,只需将buffer内的压缩完即可*/
printf("------read all data, goto final!\n");
goto final;
}
*lookaheadBufferEnd = (char) c;
if (lookaheadBufferEnd - buffer >= BUFFER_SIZE)
lookaheadBufferEnd = buffer;
else
lookaheadBufferEnd++;
}
}
printf("------buffer match size is %d, offset is %d!\n", matchSize, offset);
for(int i=0; i<= BUFFER_SIZE; i++){
printf("%d ", buffer[i]);
}
printf("\n");
printf("search start is %d, lookahead start is %d.\n", searchBufferStart-buffer, lookaheadBufferStart-buffer);
}
int searchOffset = searchBufferStart - buffer;
int searchOffset1 = searchBufferEnd - buffer;
int lookOffset = lookaheadBufferStart - buffer;
int lookOffset1 = lookaheadBufferEnd - buffer;
//printf("%d, %d, %d, %d\n", searchOffset, searchOffset1, lookOffset, lookOffset1);
}
final:
while(lookaheadBufferStart != lookaheadBufferEnd){
int searchOffset = searchBufferStart - buffer;
int searchOffset1 = searchBufferEnd - buffer;
int lookOffset = lookaheadBufferStart - buffer;
int lookOffset1 = lookaheadBufferEnd - buffer;
//printf("%d, %d, %d, %d\n", searchOffset, searchOffset1, lookOffset, lookOffset1);
/*get length, offset*/
int offset= 0;
int matchSize = findMaxMatchLength(buffer, searchBufferStart, searchBufferEnd, lookaheadBufferStart, lookaheadBufferEnd, &offset);
/*存在匹配和不存在匹配数据,操作不一样*/
/*No match data.*/
if(0 == matchSize){
/*encoder output*/
/*No match data, LZ77 token has three parts, offset, length, and with the unmatched symbol*/
putc(0, ofp);
putc(0, ofp);
putc(*(lookaheadBufferStart), ofp);
//printf("------output is %d.\n", *(lookaheadBufferStart));
/*update buffer point, careful about point over*/
if (searchBufferStart - buffer >= BUFFER_SIZE)
searchBufferStart = buffer;
else
searchBufferStart = searchBufferStart + 1;
/*look-ahead buffer point*/
if (lookaheadBufferStart - buffer >= BUFFER_SIZE)
lookaheadBufferStart = buffer;
else
lookaheadBufferStart = lookaheadBufferStart + 1;
searchBufferEnd = lookaheadBufferStart;
}else{
/*have match data.*/
/*update buffer point, careful about point over*/
if (searchBufferStart + matchSize > buffer + BUFFER_SIZE)
searchBufferStart = searchBufferStart + matchSize - BUFFER_SIZE - 1;
else
searchBufferStart = searchBufferStart + matchSize;
/*look-ahead buffer point*/
if (lookaheadBufferStart + matchSize > buffer + BUFFER_SIZE)
lookaheadBufferStart = lookaheadBufferStart + matchSize - BUFFER_SIZE - 1;
else
lookaheadBufferStart = lookaheadBufferStart + matchSize;
/*encoder output*/
putc(offset, ofp);
putc(matchSize, ofp);
putc(*(lookaheadBufferStart), ofp);
searchBufferEnd = lookaheadBufferStart;
}
printf("------last buffer match size is %d!\n", matchSize);
}
printf("------compress is end!\n");
free(buffer);
return 0;
}
int decompressor(FILE *ifp, FILE *ofp)
{
char *buffer = malloc(SEARCH_BUFFER_SIZE+1);
memset(buffer, 0, SEARCH_BUFFER_SIZE+1);
char *SearchStart = buffer;
char *SearchEnd = buffer;
char *matchStart = NULL;
int length = 0;
int offset = 0;
int symbol = 0;
while(!feof(ifp))
{
/*get three parts*/
offset = getc(ifp);
length = getc(ifp);
symbol = getc(ifp);
if(length == 0) {
putc(symbol, ofp);
/*update search buffer*/
*SearchEnd = (char)symbol;
if(SearchEnd == buffer+SEARCH_BUFFER_SIZE)
SearchEnd = buffer;
else
SearchEnd++;
if(SearchEnd == SearchStart){
if(SearchStart == buffer + SEARCH_BUFFER_SIZE)
SearchStart = buffer;
else
SearchStart++;
}
}
else{
if(SearchEnd-buffer < offset)
matchStart = SearchEnd - offset + SEARCH_BUFFER_SIZE + 1;
else
matchStart = SearchEnd - offset;
for(int i=0; i<length; i++) {
char c = *matchStart;
putc((int)c, ofp);
/*update search buffer*/
*SearchEnd = c;
if(SearchEnd == buffer+SEARCH_BUFFER_SIZE)
SearchEnd = buffer;
else
SearchEnd++;
if(SearchEnd == SearchStart){
if(SearchStart == buffer + SEARCH_BUFFER_SIZE)
SearchStart = buffer;
else
SearchStart++;
}
if(buffer+SEARCH_BUFFER_SIZE == matchStart)
matchStart = buffer;
else
matchStart++;
}
}
printf("start is %d, end is %d\n", SearchStart-buffer, SearchEnd-buffer);
}
free(buffer);
return 0;
}