#pragma prototyped12/*3* zip deflate decoder4*/56/* inflate.c -- Not copyrighted 1992 by Mark Adler7version c10p1, 10 January 1993 */89/* You can do whatever you like with this source file, though I would10prefer that if you modify it and redistribute it that you include11comments to that effect with your name and the date. Thank you.12[The history has been moved to the file ChangeLog.]13*/1415/*16Inflate deflated (PKZIP's method 8 compressed) data. The compression17method searches for as much of the current string of bytes (up to a18length of 258) in the previous 32K bytes. If it doesn't find any19matches (of at least length 3), it codes the next byte. Otherwise, it20codes the length of the matched string and its distance backwards from21the current position. There is a single Huffman code that codes both22single bytes (called "literals") and match lengths. A second Huffman23code codes the distance information, which follows a length code. Each24length or distance code actually represents a base value and a number25of "extra" (sometimes zero) bits to get to add to the base value. At26the end of each deflated block is a special end-of-block (EOB) literal/27length code. The decoding process is basically: get a literal/length28code; if EOB then done; if a literal, emit the decoded byte; if a29length then get the distance and emit the referred-to bytes from the30sliding window of previously emitted data.3132There are (currently) three kinds of inflate blocks: stored, fixed, and33dynamic. The compressor outputs a chunk of data at a time and decides34which method to use on a chunk-by-chunk basis. A chunk might typically35be 32K to 64K, uncompressed. If the chunk is uncompressible, then the36"stored" method is used. In this case, the bytes are simply stored as37is, eight bits per byte, with none of the above coding. The bytes are38preceded by a count, since there is no longer an EOB code.3940If the data are compressible, then either the fixed or dynamic methods41are used. In the dynamic method, the compressed data are preceded by42an encoding of the literal/length and distance Huffman codes that are43to be used to decode this block. The representation is itself Huffman44coded, and so is preceded by a description of that code. These code45descriptions take up a little space, and so for small blocks, there is46a predefined set of codes, called the fixed codes. The fixed method is47used if the block ends up smaller that way (usually for quite small48chunks); otherwise the dynamic method is used. In the latter case, the49codes are customized to the probabilities in the current block and so50can code it much better than the pre-determined fixed codes can.5152The Huffman codes themselves are decoded using a multi-level table53lookup, in order to maximize the speed of decoding plus the speed of54building the decoding tables. See the comments below that precede the55lbits and dbits tuning parameters.56*/575859/*60Notes beyond the 1.93a appnote.txt:61621. Distance pointers never point before the beginning of the output63stream.642. Distance pointers can point back across blocks, up to 32k away.653. There is an implied maximum of 7 bits for the bit length table and6615 bits for the actual data.674. If only one code exists, then it is encoded using one bit. (Zero68would be more efficient, but perhaps a little confusing.) If two69codes exist, they are coded using one bit each (0 and 1).705. There is no way of sending zero distance codes--a dummy must be71sent if there are none. (History: a pre 2.0 version of PKZIP would72store blocks with no distance codes, but this was discovered to be73too harsh a criterion.) Valid only for 1.93a. 2.04c does allow74zero distance codes, which is sent as one code of zero bits in75length.766. There are up to 286 literal/length codes. Code 256 represents the77end-of-block. Note however that the static length tree defines78288 codes just to fill out the Huffman codes. Codes 286 and 28779cannot be used though, since there is no length base or extra bits80defined for them. Similarily, there are up to 30 distance codes.81However, static trees define 32 codes (all 5 bits) to fill out the82Huffman codes, but the last two had better not show up in the data.837. Unzip can check dynamic Huffman blocks for complete code sets.84The exception is that a single code would not be complete (see #4).858. The five bits following the block type is really the number of86literal codes sent minus 257.879. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits88(1+6+6). Therefore, to output three times the length, you output89three codes (1+1+1), whereas to output four times the same length,90you only need two codes (1+3). Hmm.9110. In the tree reconstruction algorithm, Code = Code + Increment92only if BitLength(i) is not zero. (Pretty obvious.)9311. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19)9412. Note: length code 284 can represent 227-258, but length code 28595really is 258. The last length deserves its own, short code96since it gets used a lot in very redundant files. The length97258 is special since 258 - 3 (the min match length) is 255.9813. The literal/length and distance code bit lengths are read as a99single stream of lengths. It is possible (and advantageous) for100a repeat code (16, 17, or 18) to go across the boundary between101the two sets of lengths.102*/103104#include "zip.h"105#include "huff.h"106107#define WINDOW 32768108109#define STORED_BLOCK 0110#define STATIC_TREES 1111#define DYNAMIC_TREES 2112113/* Save to local */114#define BITS_LOCAL \115ulg bit_buf = state->bit_buf; \116ulg bit_len = state->bit_len;117118/* Restore to state */119#define BITS_GLOBAL \120state->bit_buf = bit_buf; \121state->bit_len = bit_len;122123#define MASK_BITS(n) ((((ulg)1)<<(n))-1)124#define NEXTBYTE(p) (((p)->ip < (p)->ie) ? (int)*(p)->ip++ : fill(p))125#define NEEDBITS(p,n) {while (bit_len<(n)){bit_buf|=((ulg)NEXTBYTE(p))<<bit_len;bit_len+=8;}}126#define GETBITS(n) (bit_buf & MASK_BITS(n))127#define DUMPBITS(n) {bit_buf>>=(n);bit_len-=(n);}128129struct State_s; typedef struct State_s State_t;130131struct State_s132{133Codex_t* codex;134135Vmalloc_t* vm; /* memory region for tl, td */136137uch buf[SF_BUFSIZE];138uch slide[WINDOW];139140int method;141142int eof; /* init to 0 from this member on */143int last;144145uch* ip;146uch* ie;147148ulg wp; /* current position in slide */149Huff_t* fixed_tl; /* inflate static */150Huff_t* fixed_td; /* inflate static */151int fixed_bl; /* inflate static */152int fixed_bd; /* inflate static */153ulg bit_buf; /* bit buffer */154ulg bit_len; /* bits in bit buffer */155ulg copy_len;156ulg copy_pos;157Huff_t* tl; /* literal length state table */158Huff_t* td; /* literal distance state table */159int bl; /* number of bits decoded by tl[] */160int bd; /* number of bits decoded by td[] */161};162163static int164fill(State_t* state)165{166ssize_t r;167168if (state->eof)169return 0;170if ((r = sfrd(state->codex->sp, state->buf, sizeof(state->buf), &state->codex->sfdisc)) <= 0)171{172state->eof = 1;173return 0;174}175state->ie = (state->ip = state->buf) + r;176return *state->ip++;177}178179/* The inflate algorithm uses a sliding 32K byte window on the uncompressed180stream to find repeated byte strings. This is implemented here as a181circular buffer. The index is updated simply by incrementing and then182and'ing with 0x7fff (32K-1). */183/* It is left to other modules to supply the 32K area. It is assumed184to be usable as if it were declared "uch slide[32768];" or as just185"uch *slide;" and then malloc'ed in the latter case. The definition186must be in unzip.h, included above. */187188#define lbits 9 /* bits in base literal/length lookup table */189#define dbits 6 /* bits in base distance lookup table */190191/* Tables for deflate from PKZIP's appnote.txt. */192static ush cplens[] = { /* Copy lengths for literal codes 257..285 */1933, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,19435, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};195/* note: see note #13 above about the 258 in this list. */196static ush cplext[] = { /* Extra bits for literal codes 257..285 */1970, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,1983, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */199static ush cpdist[] = { /* Copy offsets for distance codes 0..29 */2001, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,201257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,2028193, 12289, 16385, 24577};203static ush cpdext[] = { /* Extra bits for distance codes */2040, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,2057, 7, 8, 8, 9, 9, 10, 10, 11, 11,20612, 12, 13, 13};207208/* inflate (decompress) the codes in a deflated (compressed) block.209Return an error code or zero if it all goes ok. */210211static uch*212inflate_codes(State_t* state, register uch* p, register uch* e)213{214register ulg x; /* table entry flag/number of extra bits */215Huff_t* t; /* pointer to table entry */216Huff_t* tl; /* literal length state table */217Huff_t* td; /* literal distance state table */218int bl; /* number of bits decoded by tl[] */219int bd; /* number of bits decoded by td[] */220ulg l;221ulg w;222ulg d;223uch* slide;224BITS_LOCAL;225226if (p == e)227return p;228slide = state->slide;229tl = state->tl;230td = state->td;231bl = state->bl;232bd = state->bd;233w = state->wp;234235/* inflate the coded data until end of block */236237for (;;)238{239NEEDBITS(state, (ulg)bl);240t = tl + GETBITS(bl);241x = t->e;242while (x > 16)243{244if (x == 99)245return 0;246DUMPBITS(t->b);247x -= 16;248NEEDBITS(state, x);249t = t->v.t + GETBITS(x);250x = t->e;251}252DUMPBITS(t->b);253if (x == 16)254{255/* literal */256w &= WINDOW - 1;257*p++ = slide[w++] = (uch)t->v.n;258if (p >= e)259{260state->wp = w;261BITS_GLOBAL;262return p;263}264}265else if (x == 15)266break;267else268{269/* get length of block to copy */270271NEEDBITS(state, x);272l = t->v.n + GETBITS(x);273DUMPBITS(x);274275/* decode distance of block to copy */276277NEEDBITS(state, (ulg)bd);278t = td + GETBITS(bd);279x = t->e;280while (x > 16)281{282if (x == 99)283return 0;284DUMPBITS(t->b);285x -= 16;286NEEDBITS(state, x);287t = t->v.t + GETBITS(x);288x = t->e;289}290DUMPBITS(t->b);291NEEDBITS(state, x);292d = w - t->v.n - GETBITS(x);293DUMPBITS(x);294295/* do the copy */296297while (l--)298{299d &= WINDOW - 1;300w &= WINDOW - 1;301*p++ = slide[w++] = slide[d++];302if (p >= e)303{304state->copy_len = l;305state->wp = w;306state->copy_pos = d;307BITS_GLOBAL;308return p;309}310}311}312}313state->wp = w;314state->method = -1;315BITS_GLOBAL;316return p;317}318319/* "decompress" an inflated type 0 (stored) block. */320321static uch*322inflate_stored(State_t* state, register uch* p, register uch* e)323{324ulg l;325ulg w;326BITS_LOCAL;327328/* go to byte boundary */329330l = bit_len & 7;331DUMPBITS(l);332333/* get the length and its complement */334335NEEDBITS(state, 16);336l = GETBITS(16);337DUMPBITS(16);338NEEDBITS(state, 16);339if (l != (ulg)((~bit_buf) & 0xffff))340{341BITS_GLOBAL;342return 0;343}344DUMPBITS(16);345346/* read and output the compressed data */347348w = state->wp;349for (;;)350{351if (!l--)352{353state->method = -1;354break;355}356w &= WINDOW - 1;357NEEDBITS(state, 8);358*p++ = state->slide[w++] = (uch)GETBITS(8);359DUMPBITS(8);360if (p >= e)361{362state->copy_len = l;363break;364}365}366state->wp = w;367BITS_GLOBAL;368return p;369}370371/* decompress an inflated type 1 (fixed Huffman codes) block. We should372either replace this with a custom state, or at least precompute the373Huffman tables. */374375static int376inflate_fixed(State_t* state)377{378/* if first time, set up tables for fixed blocks */379380if (!state->fixed_tl)381{382Huff_t* tl; /* literal/length code table */383int i; /* temporary variable */384ulg l[288]; /* length list for huff() */385386/* literal table */387388for (i = 0; i < 144; i++)389l[i] = 8;390for (; i < 256; i++)391l[i] = 9;392for (; i < 280; i++)393l[i] = 7;394395/* make a complete, but wrong code set */396397for (; i < 288; i++)398l[i] = 8;399state->fixed_bl = 7;400if (huff(l, 288, 257, cplens, cplext, &tl, &state->fixed_bl, state->vm))401return -1;402403/* distance table -- make an incomplete code set */404405for (i = 0; i < 30; i++)406l[i] = 5;407state->fixed_bd = 5;408if (huff(l, 30, 0, cpdist, cpdext, &state->fixed_td, &state->fixed_bd, state->vm) > 1)409return -1;410state->fixed_tl = tl;411}412state->tl = state->fixed_tl;413state->td = state->fixed_td;414state->bl = state->fixed_bl;415state->bd = state->fixed_bd;416return 0;417}418419/* decompress an inflated type 2 (dynamic Huffman codes) block. */420421static int422inflate_dynamic(State_t* state)423{424int i; /* temporary variables */425ulg j;426ulg l; /* last length */427ulg n; /* number of lengths to get */428Huff_t* tl; /* literal/length code table */429Huff_t* td; /* distance code table */430int bl; /* lookup bits for tl */431int bd; /* lookup bits for td */432ulg nb; /* number of bit length codes */433ulg nl; /* number of literal/length codes */434ulg nd; /* number of distance codes */435#ifdef PKZIP_BUG_WORKAROUND436ulg ll[288+32]; /* literal/length and distance code lengths */437#else438ulg ll[286+30]; /* literal/length and distance code lengths */439#endif440441static ulg border[] = { /* Order of the bit length code lengths */44216, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};443444BITS_LOCAL;445446state->fixed_tl = 0;447vmclear(state->vm);448449/* read in table lengths */450451NEEDBITS(state, 5);452nl = 257 + GETBITS(5); /* number of literal/length codes */453DUMPBITS(5);454NEEDBITS(state, 5);455nd = 1 + GETBITS(5); /* number of distance codes */456DUMPBITS(5);457NEEDBITS(state, 4);458nb = 4 + GETBITS(4); /* number of bit length codes */459DUMPBITS(4);460#ifdef PKZIP_BUG_WORKAROUND461if (nl > 288 || nd > 32)462#else463if (nl > 286 || nd > 30)464#endif465{466/* bad lengths */467468BITS_GLOBAL;469return -1;470}471472/* read in bit-length-code lengths */473474for (j = 0; j < nb; j++)475{476NEEDBITS(state, 3);477ll[border[j]] = GETBITS(3);478DUMPBITS(3);479}480for (; j < 19; j++)481ll[border[j]] = 0;482483/* build decoding table for trees--single level, 7 bit lookup */484485bl = 7;486if (i = huff(ll, 19, 19, NULL, NULL, &tl, &bl, state->vm))487{488/* incomplete code set */489490BITS_GLOBAL;491return -1;492}493494/* read in literal and distance code lengths */495496n = nl + nd;497i = l = 0;498while ((ulg)i < n)499{500NEEDBITS(state, (ulg)bl);501j = (td = tl + (GETBITS(bl)))->b;502DUMPBITS(j);503j = td->v.n;504if (j < 16) /* length of code in bits (0..15) */505ll[i++] = l = j;/* save last length in l */506else if (j == 16) /* repeat last length 3 to 6 times */507{508NEEDBITS(state, 2);509j = 3 + GETBITS(2);510DUMPBITS(2);511if ((ulg)i + j > n)512{513BITS_GLOBAL;514return -1;515}516while (j--)517ll[i++] = l;518}519else if (j == 17) /* 3 to 10 zero length codes */520{521NEEDBITS(state, 3);522j = 3 + GETBITS(3);523DUMPBITS(3);524if ((ulg)i + j > n)525{526BITS_GLOBAL;527return -1;528}529while (j--)530ll[i++] = 0;531l = 0;532}533else /* j == 18: 11 to 138 0 length codes */534{535NEEDBITS(state, 7);536j = 11 + GETBITS(7);537DUMPBITS(7);538if ((ulg)i + j > n)539{540BITS_GLOBAL;541return -1;542}543while (j--)544ll[i++] = 0;545l = 0;546}547}548BITS_GLOBAL;549550/* free decoding table for trees */551552vmclear(state->vm);553554/* build the decoding tables for literal/length and distance codes */555556bl = lbits;557i = huff(ll, nl, 257, cplens, cplext, &tl, &bl, state->vm);558if (bl == 0) /* no literals or lengths */559i = 1;560if (i)561{562/* incomplete code set */563564if (i == 1 && state->codex->disc->errorf)565(*state->codex->disc->errorf)(NiL, state->codex->disc, 2, "%s: incomplete literal tree", state->codex->meth->name);566return -1;567}568bd = dbits;569i = huff(ll + nl, nd, 0, cpdist, cpdext, &td, &bd, state->vm);570if (bd == 0 && nl > 257)571{572/* lengths but no distances */573574if (state->codex->disc->errorf)575(*state->codex->disc->errorf)(NiL, state->codex->disc, 2, "%s: incomplete distance tree", state->codex->meth->name);576return -1;577}578if (i == 1)579{580#ifdef PKZIP_BUG_WORKAROUND581i = 0;582#else583if (state->codex->disc->errorf)584(*state->codex->disc->errorf)(NiL, state->codex->disc, 2, "%s: incomplete distance tree", state->codex->meth->name);585#endif586}587if (i)588return -1;589state->tl = tl;590state->td = td;591state->bl = bl;592state->bd = bd;593return 0;594}595596static int597deflate_open(Codex_t* p, char* const args[], Codexnum_t flags)598{599State_t* state;600Vmalloc_t* vm;601602if (!(vm = vmopen(Vmdcheap, Vmbest, 0)))603return -1;604if (!(state = newof(0, State_t, 1, 0)))605{606vmclose(vm);607return -1;608}609state->vm = vm;610state->codex = p;611p->data = state;612return 0;613}614615static int616deflate_close(Codex_t* p)617{618State_t* state = (State_t*)p->data;619620if (!state)621return -1;622if (state->vm)623vmclose(state->vm);624free(state);625return 0;626}627628static int629deflate_init(Codex_t* p)630{631register State_t* state = (State_t*)p->data;632633vmclear(state->vm);634state->tl = state->fixed_tl = 0;635state->method = -1;636memset((char*)state + offsetof(State_t, eof), 0, sizeof(*state) - offsetof(State_t, eof));637return 0;638}639640static ssize_t641deflate_read(Sfio_t* sp, void* buf, size_t size, Sfdisc_t* disc)642{643register State_t* state = (State_t*)CODEX(disc)->data;644register uch* p = (uch*)buf;645register uch* e = p + size;646register uch* q;647ulg l;648ulg w;649ulg d;650651652if (state->copy_len > 0)653{654l = state->copy_len;655w = state->wp;656if (state->method != STORED_BLOCK)657{658d = state->copy_pos;659while (l && p < e)660{661l--;662d &= WINDOW - 1;663w &= WINDOW - 1;664*p++ = state->slide[w++] = state->slide[d++];665}666state->copy_pos = d;667}668else669{670BITS_LOCAL;671while (l && p < e)672{673l--;674w &= WINDOW - 1;675NEEDBITS(state, 8);676*p++ = state->slide[w++] = (uch)GETBITS(8);677DUMPBITS(8);678}679BITS_GLOBAL;680if (!l)681state->method = -1;682}683state->copy_len = l;684state->wp = w;685}686while (p < e)687{688if (state->method == -1)689{690BITS_LOCAL;691if (state->eof)692{693BITS_GLOBAL;694break;695}696if (state->last)697{698state->last = state->wp = 0;699l = bit_len & 7;700DUMPBITS(l);701}702703/* read in last block bit */704705NEEDBITS(state, 1);706if (GETBITS(1))707state->last = 1;708DUMPBITS(1);709710/* read in block type */711712NEEDBITS(state, 2);713state->method = (int)GETBITS(2);714DUMPBITS(2);715state->tl = 0;716state->copy_len = 0;717BITS_GLOBAL;718}719switch(state->method)720{721case STORED_BLOCK:722q = inflate_stored(state, p, e);723break;724case STATIC_TREES:725q = (state->tl || !inflate_fixed(state)) ? inflate_codes(state, p, e) : (uch*)0;726break;727case DYNAMIC_TREES:728q = (state->tl || !inflate_dynamic(state)) ? inflate_codes(state, p, e) : (uch*)0;729break;730default:731q = 0;732break;733}734if (!q)735{736if (p < e || state->eof)737break;738return -1;739}740p = q;741}742return p - (uch*)buf;743}744745Codexmeth_t codex_zip_deflate =746{747"deflate",748"zip deflate compression (PKZIP method 8). Option is level"749" { 6:faster ... 9:better }.",7500,751CODEX_DECODE|CODEX_COMPRESS,7520,7530,754deflate_open,755deflate_close,756deflate_init,7570,758deflate_read,7590,7600,7610,7620,7630,7640,7650766};767768769