//1// Name: lz.c2// Author: Marcus Geelnard3// Description: LZ77 coder/decoder implementation.4// Reentrant: Yes5// $ATH_LICENSE_NULL$6//7// The LZ77 compression scheme is a substitutional compression scheme8// proposed by Abraham Lempel and Jakob Ziv in 1977. It is very simple in9// its design, and uses no fancy bit level compression.10//11// This is my first attempt at an implementation of a LZ77 code/decoder.12//13// The principle of the LZ77 compression algorithm is to store repeated14// occurrences of strings as references to previous occurrences of the same15// string. The point is that the reference consumes less space than the16// string itself, provided that the string is long enough (in this17// implementation, the string has to be at least 4 bytes long, since the18// minimum coded reference is 3 bytes long). Also note that the term19// "string" refers to any kind of byte sequence (it does not have to be20// an ASCII string, for instance).21//22// The coder uses a brute force approach to finding string matches in the23// history buffer (or "sliding window", if you wish), which is very, very24// slow. I recon the complexity is somewhere between O(n^2) and O(n^3),25// depending on the input data.26//27// There is also a faster implementation that uses a large working buffer28// in which a "jump table" is stored, which is used to quickly find29// possible string matches (see the source code for LZ_CompressFast() for30// more information). The faster method is an order of magnitude faster,31// but still quite slow compared to other compression methods.32//33// The upside is that decompression is very fast, and the compression ratio34// is often very good.35//36// The reference to a string is coded as a (length,offset) pair, where the37// length indicates the length of the string, and the offset gives the38// offset from the current data position. To distinguish between string39// references and literal strings (uncompressed bytes), a string reference40// is preceded by a marker byte, which is chosen as the least common byte41// symbol in the input data stream (this marker byte is stored in the42// output stream as the first byte).43//44// Occurrences of the marker byte in the stream are encoded as the marker45// byte followed by a zero byte, which means that occurrences of the marker46// byte have to be coded with two bytes.47//48// The lengths and offsets are coded in a variable length fashion, allowing49// values of any magnitude (up to 4294967295 in this implementation).50//51// With this compression scheme, the worst case compression result is52// (257/256)*insize + 1.53//54//------------------------------------------------------------------------55// Copyright (c) 2003-2006 Marcus Geelnard56//57// This software is provided 'as-is', without any express or implied58// warranty. In no event will the authors be held liable for any damages59// arising from the use of this software.60//61// Permission is granted to anyone to use this software for any purpose,62// including commercial applications, and to alter it and redistribute it63// freely, subject to the following restrictions:64//65// 1. The origin of this software must not be misrepresented; you must not66// claim that you wrote the original software. If you use this software67// in a product, an acknowledgment in the product documentation would68// be appreciated but is not required.69//70// 2. Altered source versions must be plainly marked as such, and must not71// be misrepresented as being the original software.72//73// 3. This notice may not be removed or altered from any source74// distribution.75//76// Marcus Geelnard77// marcus.geelnard at home.se78//7980//81// This file has been altered from the original version.82//8384/*************************************************************************85* Constants used for LZ77 coding86*************************************************************************/8788/* Maximum offset (can be any size < 2^31). Lower values give faster89compression, while higher values gives better compression. The default90value of 100000 is quite high. Experiment to see what works best for91you. */92#define LZ_MAX_OFFSET 10000093949596/*************************************************************************97* INTERNAL FUNCTIONS *98*************************************************************************/99100101/*************************************************************************102* _LZ_StringCompare() - Return maximum length string match.103*************************************************************************/104105static unsigned int _LZ_StringCompare( unsigned char * str1,106unsigned char * str2, unsigned int minlen, unsigned int maxlen )107{108unsigned int len;109110for( len = minlen; (len < maxlen) && (str1[len] == str2[len]); ++ len );111112return len;113}114115116/*************************************************************************117* _LZ_WriteVarSize() - Write unsigned integer with variable number of118* bytes depending on value.119*************************************************************************/120121static int _LZ_WriteVarSize( unsigned int x, unsigned char * buf )122{123unsigned int y;124int num_bytes, i, b;125126/* Determine number of bytes needed to store the number x */127y = x >> 3;128for( num_bytes = 5; num_bytes >= 2; -- num_bytes )129{130if( y & 0xfe000000 ) break;131y <<= 7;132}133134/* Write all bytes, seven bits in each, with 8:th bit set for all */135/* but the last byte. */136for( i = num_bytes-1; i >= 0; -- i )137{138b = (x >> (i*7)) & 0x0000007f;139if( i > 0 )140{141b |= 0x00000080;142}143*buf ++ = (unsigned char) b;144}145146/* Return number of bytes written */147return num_bytes;148}149150151/*************************************************************************152* _LZ_ReadVarSize() - Read unsigned integer with variable number of153* bytes depending on value.154*************************************************************************/155156static int _LZ_ReadVarSize( unsigned int * x, unsigned char * buf )157{158unsigned int y, b, num_bytes;159160/* Read complete value (stop when byte contains zero in 8:th bit) */161y = 0;162num_bytes = 0;163do164{165b = (unsigned int) (*buf ++);166y = (y << 7) | (b & 0x0000007f);167++ num_bytes;168}169while( b & 0x00000080 );170171/* Store value in x */172*x = y;173174/* Return number of bytes read */175return num_bytes;176}177178179180/*************************************************************************181* PUBLIC FUNCTIONS *182*************************************************************************/183184185/*************************************************************************186* LZ_Compress() - Compress a block of data using an LZ77 coder.187* in - Input (uncompressed) buffer.188* out - Output (compressed) buffer. This buffer must be 0.4% larger189* than the input buffer, plus one byte.190* insize - Number of input bytes.191* The function returns the size of the compressed data.192*************************************************************************/193194int LZ_Compress( unsigned char *in, unsigned char *out,195unsigned int insize )196{197unsigned char marker, symbol;198unsigned int inpos, outpos, bytesleft, i;199unsigned int maxoffset, offset, bestoffset;200unsigned int maxlength, length, bestlength;201unsigned int histogram[ 256 ];202unsigned char *ptr1, *ptr2;203204/* Do we have anything to compress? */205if( insize < 1 )206{207return 0;208}209210/* Create histogram */211for( i = 0; i < 256; ++ i )212{213histogram[ i ] = 0;214}215for( i = 0; i < insize; ++ i )216{217++ histogram[ in[ i ] ];218}219220/* Find the least common byte, and use it as the marker symbol */221marker = 0;222for( i = 1; i < 256; ++ i )223{224if( histogram[ i ] < histogram[ marker ] )225{226marker = i;227}228}229230/* Remember the marker symbol for the decoder */231out[ 0 ] = marker;232233/* Start of compression */234inpos = 0;235outpos = 1;236237/* Main compression loop */238bytesleft = insize;239do240{241/* Determine most distant position */242if( inpos > LZ_MAX_OFFSET ) maxoffset = LZ_MAX_OFFSET;243else maxoffset = inpos;244245/* Get pointer to current position */246ptr1 = &in[ inpos ];247248/* Search history window for maximum length string match */249bestlength = 3;250bestoffset = 0;251for( offset = 3; offset <= maxoffset; ++ offset )252{253/* Get pointer to candidate string */254ptr2 = &ptr1[ -(int)offset ];255256/* Quickly determine if this is a candidate (for speed) */257if( (ptr1[ 0 ] == ptr2[ 0 ]) &&258(ptr1[ bestlength ] == ptr2[ bestlength ]) )259{260/* Determine maximum length for this offset */261maxlength = (bytesleft < offset ? bytesleft : offset);262263/* Count maximum length match at this offset */264length = _LZ_StringCompare( ptr1, ptr2, 0, maxlength );265266/* Better match than any previous match? */267if( length > bestlength )268{269bestlength = length;270bestoffset = offset;271}272}273}274275/* Was there a good enough match? */276if( (bestlength >= 8) ||277((bestlength == 4) && (bestoffset <= 0x0000007f)) ||278((bestlength == 5) && (bestoffset <= 0x00003fff)) ||279((bestlength == 6) && (bestoffset <= 0x001fffff)) ||280((bestlength == 7) && (bestoffset <= 0x0fffffff)) )281{282out[ outpos ++ ] = (unsigned char) marker;283outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] );284outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] );285inpos += bestlength;286bytesleft -= bestlength;287}288else289{290/* Output single byte (or two bytes if marker byte) */291symbol = in[ inpos ++ ];292out[ outpos ++ ] = symbol;293if( symbol == marker )294{295out[ outpos ++ ] = 0;296}297-- bytesleft;298}299}300while( bytesleft > 3 );301302/* Dump remaining bytes, if any */303while( inpos < insize )304{305if( in[ inpos ] == marker )306{307out[ outpos ++ ] = marker;308out[ outpos ++ ] = 0;309}310else311{312out[ outpos ++ ] = in[ inpos ];313}314++ inpos;315}316317return outpos;318}319320321/*************************************************************************322* LZ_CompressFast() - Compress a block of data using an LZ77 coder.323* in - Input (uncompressed) buffer.324* out - Output (compressed) buffer. This buffer must be 0.4% larger325* than the input buffer, plus one byte.326* insize - Number of input bytes.327* work - Pointer to a temporary buffer (internal working buffer), which328* must be able to hold (insize+65536) unsigned integers.329* The function returns the size of the compressed data.330*************************************************************************/331332int LZ_CompressFast( unsigned char *in, unsigned char *out,333unsigned int insize, unsigned int *work )334{335unsigned char marker, symbol;336unsigned int inpos, outpos, bytesleft, i, index, symbols;337unsigned int offset, bestoffset;338unsigned int maxlength, length, bestlength;339unsigned int histogram[ 256 ], *lastindex, *jumptable;340unsigned char *ptr1, *ptr2;341342/* Do we have anything to compress? */343if( insize < 1 )344{345return 0;346}347348/* Assign arrays to the working area */349lastindex = work;350jumptable = &work[ 65536 ];351352/* Build a "jump table". Here is how the jump table works:353jumptable[i] points to the nearest previous occurrence of the same354symbol pair as in[i]:in[i+1], so in[i] == in[jumptable[i]] and355in[i+1] == in[jumptable[i]+1], and so on... Following the jump table356gives a dramatic boost for the string search'n'match loop compared357to doing a brute force search. The jump table is built in O(n) time,358so it is a cheap operation in terms of time, but it is expensice in359terms of memory consumption. */360for( i = 0; i < 65536; ++ i )361{362lastindex[ i ] = 0xffffffff;363}364for( i = 0; i < insize-1; ++ i )365{366symbols = (((unsigned int)in[i]) << 8) | ((unsigned int)in[i+1]);367index = lastindex[ symbols ];368lastindex[ symbols ] = i;369jumptable[ i ] = index;370}371jumptable[ insize-1 ] = 0xffffffff;372373/* Create histogram */374for( i = 0; i < 256; ++ i )375{376histogram[ i ] = 0;377}378for( i = 0; i < insize; ++ i )379{380++ histogram[ in[ i ] ];381}382383/* Find the least common byte, and use it as the marker symbol */384marker = 0;385for( i = 1; i < 256; ++ i )386{387if( histogram[ i ] < histogram[ marker ] )388{389marker = i;390}391}392393/* Remember the marker symbol for the decoder */394out[ 0 ] = marker;395396/* Start of compression */397inpos = 0;398outpos = 1;399400/* Main compression loop */401bytesleft = insize;402do403{404/* Get pointer to current position */405ptr1 = &in[ inpos ];406407/* Search history window for maximum length string match */408bestlength = 3;409bestoffset = 0;410index = jumptable[ inpos ];411while( (index != 0xffffffff) && ((inpos - index) < LZ_MAX_OFFSET) )412{413/* Get pointer to candidate string */414ptr2 = &in[ index ];415416/* Quickly determine if this is a candidate (for speed) */417if( ptr2[ bestlength ] == ptr1[ bestlength ] )418{419/* Determine maximum length for this offset */420offset = inpos - index;421maxlength = (bytesleft < offset ? bytesleft : offset);422423/* Count maximum length match at this offset */424length = _LZ_StringCompare( ptr1, ptr2, 2, maxlength );425426/* Better match than any previous match? */427if( length > bestlength )428{429bestlength = length;430bestoffset = offset;431}432}433434/* Get next possible index from jump table */435index = jumptable[ index ];436}437438/* Was there a good enough match? */439if( (bestlength >= 8) ||440((bestlength == 4) && (bestoffset <= 0x0000007f)) ||441((bestlength == 5) && (bestoffset <= 0x00003fff)) ||442((bestlength == 6) && (bestoffset <= 0x001fffff)) ||443((bestlength == 7) && (bestoffset <= 0x0fffffff)) )444{445out[ outpos ++ ] = (unsigned char) marker;446outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] );447outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] );448inpos += bestlength;449bytesleft -= bestlength;450}451else452{453/* Output single byte (or two bytes if marker byte) */454symbol = in[ inpos ++ ];455out[ outpos ++ ] = symbol;456if( symbol == marker )457{458out[ outpos ++ ] = 0;459}460-- bytesleft;461}462}463while( bytesleft > 3 );464465/* Dump remaining bytes, if any */466while( inpos < insize )467{468if( in[ inpos ] == marker )469{470out[ outpos ++ ] = marker;471out[ outpos ++ ] = 0;472}473else474{475out[ outpos ++ ] = in[ inpos ];476}477++ inpos;478}479480return outpos;481}482483484/*************************************************************************485* LZ_Uncompress() - Uncompress a block of data using an LZ77 decoder.486* in - Input (compressed) buffer.487* out - Output (uncompressed) buffer. This buffer must be large488* enough to hold the uncompressed data.489* insize - Number of input bytes.490*************************************************************************/491492int LZ_Uncompress( unsigned char *in, unsigned char *out,493unsigned int insize )494{495unsigned char marker, symbol;496unsigned int i, inpos, outpos, length, offset;497498/* Do we have anything to uncompress? */499if( insize < 1 )500{501return 0;502}503504/* Get marker symbol from input stream */505marker = in[ 0 ];506inpos = 1;507508/* Main decompression loop */509outpos = 0;510do511{512symbol = in[ inpos ++ ];513if( symbol == marker )514{515/* We had a marker byte */516if( in[ inpos ] == 0 )517{518/* It was a single occurrence of the marker byte */519out[ outpos ++ ] = marker;520++ inpos;521}522else523{524/* Extract true length and offset */525inpos += _LZ_ReadVarSize( &length, &in[ inpos ] );526inpos += _LZ_ReadVarSize( &offset, &in[ inpos ] );527528/* Copy corresponding data from history window */529for( i = 0; i < length; ++ i )530{531out[ outpos ] = out[ outpos - offset ];532++ outpos;533}534}535}536else537{538/* No marker, plain copy */539out[ outpos ++ ] = symbol;540}541}542while( inpos < insize );543544return outpos;545}546547548