/*-1* SPDX-License-Identifier: BSD-3-Clause2*3* Copyright (c) 1985, 1986, 1992, 19934* The Regents of the University of California. All rights reserved.5*6* This code is derived from software contributed to Berkeley by7* Diomidis Spinellis and James A. Woods, derived from original8* work by Spencer Thomas and Joseph Orost.9*10* Redistribution and use in source and binary forms, with or without11* modification, are permitted provided that the following conditions12* are met:13* 1. Redistributions of source code must retain the above copyright14* notice, this list of conditions and the following disclaimer.15* 2. Redistributions in binary form must reproduce the above copyright16* notice, this list of conditions and the following disclaimer in the17* documentation and/or other materials provided with the distribution.18* 3. Neither the name of the University nor the names of its contributors19* may be used to endorse or promote products derived from this software20* without specific prior written permission.21*22* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND23* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE24* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE25* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE26* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL27* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS28* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)29* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT30* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY31* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF32* SUCH DAMAGE.33*/343536#include <sys/cdefs.h>37/*-38* fcompress.c - File compression ala IEEE Computer, June 1984.39*40* Compress authors:41* Spencer W. Thomas (decvax!utah-cs!thomas)42* Jim McKie (decvax!mcvax!jim)43* Steve Davies (decvax!vax135!petsd!peora!srd)44* Ken Turkowski (decvax!decwrl!turtlevax!ken)45* James A. Woods (decvax!ihnp4!ames!jaw)46* Joe Orost (decvax!vax135!petsd!joe)47*48* Cleaned up and converted to library returning I/O streams by49* Diomidis Spinellis <[email protected]>.50*51* zopen(filename, mode, bits)52* Returns a FILE * that can be used for read or write. The modes53* supported are only "r" and "w". Seeking is not allowed. On54* reading the file is decompressed, on writing it is compressed.55* The output is compatible with compress(1) with 16 bit tables.56* Any file produced by compress(1) can be read.57*/5859#include <sys/param.h>60#include <sys/stat.h>6162#include <ctype.h>63#include <errno.h>64#include <signal.h>65#include <stdio.h>66#include <stdlib.h>67#include <string.h>68#include <unistd.h>69#include "zopen.h"7071#define BITS 16 /* Default bits. */72#define HSIZE 69001 /* 95% occupancy */7374/* A code_int must be able to hold 2**BITS values of type int, and also -1. */75typedef long code_int;76typedef long count_int;7778typedef u_char char_type;79static char_type magic_header[] =80{'\037', '\235'}; /* 1F 9D */8182#define BIT_MASK 0x1f /* Defines for third byte of header. */83#define BLOCK_MASK 0x808485/*86* Masks 0x40 and 0x20 are free. I think 0x20 should mean that there is87* a fourth header byte (for expansion).88*/89#define INIT_BITS 9 /* Initial number of bits/code. */9091#define MAXCODE(n_bits) ((1 << (n_bits)) - 1)9293struct s_zstate {94FILE *zs_fp; /* File stream for I/O */95char zs_mode; /* r or w */96enum {97S_START, S_MIDDLE, S_EOF98} zs_state; /* State of computation */99u_int zs_n_bits; /* Number of bits/code. */100u_int zs_maxbits; /* User settable max # bits/code. */101code_int zs_maxcode; /* Maximum code, given n_bits. */102code_int zs_maxmaxcode; /* Should NEVER generate this code. */103count_int zs_htab [HSIZE];104u_short zs_codetab [HSIZE];105code_int zs_hsize; /* For dynamic table sizing. */106code_int zs_free_ent; /* First unused entry. */107/*108* Block compression parameters -- after all codes are used up,109* and compression rate changes, start over.110*/111int zs_block_compress;112int zs_clear_flg;113long zs_ratio;114count_int zs_checkpoint;115u_int zs_offset;116long zs_in_count; /* Length of input. */117long zs_bytes_out; /* Length of compressed output. */118long zs_out_count; /* # of codes output (for debugging). */119char_type zs_buf[BITS];120union {121struct {122long zs_fcode;123code_int zs_ent;124code_int zs_hsize_reg;125int zs_hshift;126} w; /* Write parameters */127struct {128char_type *zs_stackp;129int zs_finchar;130code_int zs_code, zs_oldcode, zs_incode;131int zs_roffset, zs_size;132char_type zs_gbuf[BITS];133} r; /* Read parameters */134} u;135};136137/* Definitions to retain old variable names */138#define fp zs->zs_fp139#define zmode zs->zs_mode140#define state zs->zs_state141#define n_bits zs->zs_n_bits142#define maxbits zs->zs_maxbits143#define maxcode zs->zs_maxcode144#define maxmaxcode zs->zs_maxmaxcode145#define htab zs->zs_htab146#define codetab zs->zs_codetab147#define hsize zs->zs_hsize148#define free_ent zs->zs_free_ent149#define block_compress zs->zs_block_compress150#define clear_flg zs->zs_clear_flg151#define ratio zs->zs_ratio152#define checkpoint zs->zs_checkpoint153#define offset zs->zs_offset154#define in_count zs->zs_in_count155#define bytes_out zs->zs_bytes_out156#define out_count zs->zs_out_count157#define buf zs->zs_buf158#define fcode zs->u.w.zs_fcode159#define hsize_reg zs->u.w.zs_hsize_reg160#define ent zs->u.w.zs_ent161#define hshift zs->u.w.zs_hshift162#define stackp zs->u.r.zs_stackp163#define finchar zs->u.r.zs_finchar164#define code zs->u.r.zs_code165#define oldcode zs->u.r.zs_oldcode166#define incode zs->u.r.zs_incode167#define roffset zs->u.r.zs_roffset168#define size zs->u.r.zs_size169#define gbuf zs->u.r.zs_gbuf170171/*172* To save much memory, we overlay the table used by compress() with those173* used by decompress(). The tab_prefix table is the same size and type as174* the codetab. The tab_suffix table needs 2**BITS characters. We get this175* from the beginning of htab. The output stack uses the rest of htab, and176* contains characters. There is plenty of room for any possible stack177* (stack used to be 8000 characters).178*/179180#define htabof(i) htab[i]181#define codetabof(i) codetab[i]182183#define tab_prefixof(i) codetabof(i)184#define tab_suffixof(i) ((char_type *)(htab))[i]185#define de_stack ((char_type *)&tab_suffixof(1 << BITS))186187#define CHECK_GAP 10000 /* Ratio check interval. */188189/*190* the next two codes should not be changed lightly, as they must not191* lie within the contiguous general code space.192*/193#define FIRST 257 /* First free entry. */194#define CLEAR 256 /* Table clear output code. */195196static int cl_block(struct s_zstate *);197static void cl_hash(struct s_zstate *, count_int);198static code_int getcode(struct s_zstate *);199static int output(struct s_zstate *, code_int);200static int zclose(void *);201static int zread(void *, char *, int);202static int zwrite(void *, const char *, int);203204/*-205* Algorithm from "A Technique for High Performance Data Compression",206* Terry A. Welch, IEEE Computer Vol 17, No 6 (June 1984), pp 8-19.207*208* Algorithm:209* Modified Lempel-Ziv method (LZW). Basically finds common210* substrings and replaces them with a variable size code. This is211* deterministic, and can be done on the fly. Thus, the decompression212* procedure needs no input table, but tracks the way the table was built.213*/214215/*-216* compress write217*218* Algorithm: use open addressing double hashing (no chaining) on the219* prefix code / next character combination. We do a variant of Knuth's220* algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime221* secondary probe. Here, the modular division first probe is gives way222* to a faster exclusive-or manipulation. Also do block compression with223* an adaptive reset, whereby the code table is cleared when the compression224* ratio decreases, but after the table fills. The variable-length output225* codes are re-sized at this point, and a special CLEAR code is generated226* for the decompressor. Late addition: construct the table according to227* file size for noticeable speed improvement on small files. Please direct228* questions about this implementation to ames!jaw.229*/230static int231zwrite(void *cookie, const char *wbp, int num)232{233code_int i;234int c, disp;235struct s_zstate *zs;236const u_char *bp;237u_char tmp;238int count;239240if (num == 0)241return (0);242243zs = cookie;244count = num;245bp = (const u_char *)wbp;246if (state == S_MIDDLE)247goto middle;248state = S_MIDDLE;249250maxmaxcode = 1L << maxbits;251if (fwrite(magic_header,252sizeof(char), sizeof(magic_header), fp) != sizeof(magic_header))253return (-1);254tmp = (u_char)((maxbits) | block_compress);255if (fwrite(&tmp, sizeof(char), sizeof(tmp), fp) != sizeof(tmp))256return (-1);257258offset = 0;259bytes_out = 3; /* Includes 3-byte header mojo. */260out_count = 0;261clear_flg = 0;262ratio = 0;263in_count = 1;264checkpoint = CHECK_GAP;265maxcode = MAXCODE(n_bits = INIT_BITS);266free_ent = ((block_compress) ? FIRST : 256);267268ent = *bp++;269--count;270271hshift = 0;272for (fcode = (long)hsize; fcode < 65536L; fcode *= 2L)273hshift++;274hshift = 8 - hshift; /* Set hash code range bound. */275276hsize_reg = hsize;277cl_hash(zs, (count_int)hsize_reg); /* Clear hash table. */278279middle: for (i = 0; count--;) {280c = *bp++;281in_count++;282fcode = (long)(((long)c << maxbits) + ent);283i = ((c << hshift) ^ ent); /* Xor hashing. */284285if (htabof(i) == fcode) {286ent = codetabof(i);287continue;288} else if ((long)htabof(i) < 0) /* Empty slot. */289goto nomatch;290disp = hsize_reg - i; /* Secondary hash (after G. Knott). */291if (i == 0)292disp = 1;293probe: if ((i -= disp) < 0)294i += hsize_reg;295296if (htabof(i) == fcode) {297ent = codetabof(i);298continue;299}300if ((long)htabof(i) >= 0)301goto probe;302nomatch: if (output(zs, (code_int) ent) == -1)303return (-1);304out_count++;305ent = c;306if (free_ent < maxmaxcode) {307codetabof(i) = free_ent++; /* code -> hashtable */308htabof(i) = fcode;309} else if ((count_int)in_count >=310checkpoint && block_compress) {311if (cl_block(zs) == -1)312return (-1);313}314}315return (num);316}317318static int319zclose(void *cookie)320{321struct s_zstate *zs;322int rval;323324zs = cookie;325if (zmode == 'w') { /* Put out the final code. */326if (output(zs, (code_int) ent) == -1) {327(void)fclose(fp);328free(zs);329return (-1);330}331out_count++;332if (output(zs, (code_int) - 1) == -1) {333(void)fclose(fp);334free(zs);335return (-1);336}337}338rval = fclose(fp) == EOF ? -1 : 0;339free(zs);340return (rval);341}342343/*-344* Output the given code.345* Inputs:346* code: A n_bits-bit integer. If == -1, then EOF. This assumes347* that n_bits =< (long)wordsize - 1.348* Outputs:349* Outputs code to the file.350* Assumptions:351* Chars are 8 bits long.352* Algorithm:353* Maintain a BITS character long buffer (so that 8 codes will354* fit in it exactly). Use the VAX insv instruction to insert each355* code in turn. When the buffer fills up empty it and start over.356*/357358static char_type lmask[9] =359{0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00};360static char_type rmask[9] =361{0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff};362363static int364output(struct s_zstate *zs, code_int ocode)365{366int r_off;367u_int bits;368char_type *bp;369370r_off = offset;371bits = n_bits;372bp = buf;373if (ocode >= 0) {374/* Get to the first byte. */375bp += (r_off >> 3);376r_off &= 7;377/*378* Since ocode is always >= 8 bits, only need to mask the first379* hunk on the left.380*/381*bp = (*bp & rmask[r_off]) | ((ocode << r_off) & lmask[r_off]);382bp++;383bits -= (8 - r_off);384ocode >>= 8 - r_off;385/* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */386if (bits >= 8) {387*bp++ = ocode;388ocode >>= 8;389bits -= 8;390}391/* Last bits. */392if (bits)393*bp = ocode;394offset += n_bits;395if (offset == (n_bits << 3)) {396bp = buf;397bits = n_bits;398bytes_out += bits;399if (fwrite(bp, sizeof(char), bits, fp) != bits)400return (-1);401bp += bits;402bits = 0;403offset = 0;404}405/*406* If the next entry is going to be too big for the ocode size,407* then increase it, if possible.408*/409if (free_ent > maxcode || (clear_flg > 0)) {410/*411* Write the whole buffer, because the input side won't412* discover the size increase until after it has read it.413*/414if (offset > 0) {415if (fwrite(buf, 1, n_bits, fp) != n_bits)416return (-1);417bytes_out += n_bits;418}419offset = 0;420421if (clear_flg) {422maxcode = MAXCODE(n_bits = INIT_BITS);423clear_flg = 0;424} else {425n_bits++;426if (n_bits == maxbits)427maxcode = maxmaxcode;428else429maxcode = MAXCODE(n_bits);430}431}432} else {433/* At EOF, write the rest of the buffer. */434if (offset > 0) {435offset = (offset + 7) / 8;436if (fwrite(buf, 1, offset, fp) != offset)437return (-1);438bytes_out += offset;439}440offset = 0;441}442return (0);443}444445/*446* Decompress read. This routine adapts to the codes in the file building447* the "string" table on-the-fly; requiring no table to be stored in the448* compressed file. The tables used herein are shared with those of the449* compress() routine. See the definitions above.450*/451static int452zread(void *cookie, char *rbp, int num)453{454u_int count;455struct s_zstate *zs;456u_char *bp, header[3];457458if (num == 0)459return (0);460461zs = cookie;462count = num;463bp = (u_char *)rbp;464switch (state) {465case S_START:466state = S_MIDDLE;467break;468case S_MIDDLE:469goto middle;470case S_EOF:471goto eof;472}473474/* Check the magic number */475if (fread(header,476sizeof(char), sizeof(header), fp) != sizeof(header) ||477memcmp(header, magic_header, sizeof(magic_header)) != 0) {478errno = EFTYPE;479return (-1);480}481maxbits = header[2]; /* Set -b from file. */482block_compress = maxbits & BLOCK_MASK;483maxbits &= BIT_MASK;484maxmaxcode = 1L << maxbits;485if (maxbits > BITS || maxbits < 12) {486errno = EFTYPE;487return (-1);488}489/* As above, initialize the first 256 entries in the table. */490maxcode = MAXCODE(n_bits = INIT_BITS);491for (code = 255; code >= 0; code--) {492tab_prefixof(code) = 0;493tab_suffixof(code) = (char_type) code;494}495free_ent = block_compress ? FIRST : 256;496497finchar = oldcode = getcode(zs);498if (oldcode == -1) /* EOF already? */499return (0); /* Get out of here */500501/* First code must be 8 bits = char. */502*bp++ = (u_char)finchar;503count--;504stackp = de_stack;505506while ((code = getcode(zs)) > -1) {507508if ((code == CLEAR) && block_compress) {509for (code = 255; code >= 0; code--)510tab_prefixof(code) = 0;511clear_flg = 1;512free_ent = FIRST;513oldcode = -1;514continue;515}516incode = code;517518/* Special case for kWkWk string. */519if (code >= free_ent) {520if (code > free_ent || oldcode == -1) {521/* Bad stream. */522errno = EINVAL;523return (-1);524}525*stackp++ = finchar;526code = oldcode;527}528/*529* The above condition ensures that code < free_ent.530* The construction of tab_prefixof in turn guarantees that531* each iteration decreases code and therefore stack usage is532* bound by 1 << BITS - 256.533*/534535/* Generate output characters in reverse order. */536while (code >= 256) {537*stackp++ = tab_suffixof(code);538code = tab_prefixof(code);539}540*stackp++ = finchar = tab_suffixof(code);541542/* And put them out in forward order. */543middle: do {544if (count-- == 0)545return (num);546*bp++ = *--stackp;547} while (stackp > de_stack);548549/* Generate the new entry. */550if ((code = free_ent) < maxmaxcode && oldcode != -1) {551tab_prefixof(code) = (u_short) oldcode;552tab_suffixof(code) = finchar;553free_ent = code + 1;554}555556/* Remember previous code. */557oldcode = incode;558}559state = S_EOF;560eof: return (num - count);561}562563/*-564* Read one code from the standard input. If EOF, return -1.565* Inputs:566* stdin567* Outputs:568* code or -1 is returned.569*/570static code_int571getcode(struct s_zstate *zs)572{573code_int gcode;574int r_off, bits;575char_type *bp;576577bp = gbuf;578if (clear_flg > 0 || roffset >= size || free_ent > maxcode) {579/*580* If the next entry will be too big for the current gcode581* size, then we must increase the size. This implies reading582* a new buffer full, too.583*/584if (free_ent > maxcode) {585n_bits++;586if (n_bits == maxbits) /* Won't get any bigger now. */587maxcode = maxmaxcode;588else589maxcode = MAXCODE(n_bits);590}591if (clear_flg > 0) {592maxcode = MAXCODE(n_bits = INIT_BITS);593clear_flg = 0;594}595size = fread(gbuf, 1, n_bits, fp);596if (size <= 0) /* End of file. */597return (-1);598roffset = 0;599/* Round size down to integral number of codes. */600size = (size << 3) - (n_bits - 1);601}602r_off = roffset;603bits = n_bits;604605/* Get to the first byte. */606bp += (r_off >> 3);607r_off &= 7;608609/* Get first part (low order bits). */610gcode = (*bp++ >> r_off);611bits -= (8 - r_off);612r_off = 8 - r_off; /* Now, roffset into gcode word. */613614/* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */615if (bits >= 8) {616gcode |= *bp++ << r_off;617r_off += 8;618bits -= 8;619}620621/* High order bits. */622if (bits > 0)623gcode |= (*bp & rmask[bits]) << r_off;624roffset += n_bits;625626return (gcode);627}628629static int630cl_block(struct s_zstate *zs) /* Table clear for block compress. */631{632long rat;633634checkpoint = in_count + CHECK_GAP;635636if (in_count > 0x007fffff) { /* Shift will overflow. */637rat = bytes_out >> 8;638if (rat == 0) /* Don't divide by zero. */639rat = 0x7fffffff;640else641rat = in_count / rat;642} else643rat = (in_count << 8) / bytes_out; /* 8 fractional bits. */644if (rat > ratio)645ratio = rat;646else {647ratio = 0;648cl_hash(zs, (count_int) hsize);649free_ent = FIRST;650clear_flg = 1;651if (output(zs, (code_int) CLEAR) == -1)652return (-1);653}654return (0);655}656657static void658cl_hash(struct s_zstate *zs, count_int cl_hsize) /* Reset code table. */659{660count_int *htab_p;661long i, m1;662663m1 = -1;664htab_p = htab + cl_hsize;665i = cl_hsize - 16;666do { /* Might use Sys V memset(3) here. */667*(htab_p - 16) = m1;668*(htab_p - 15) = m1;669*(htab_p - 14) = m1;670*(htab_p - 13) = m1;671*(htab_p - 12) = m1;672*(htab_p - 11) = m1;673*(htab_p - 10) = m1;674*(htab_p - 9) = m1;675*(htab_p - 8) = m1;676*(htab_p - 7) = m1;677*(htab_p - 6) = m1;678*(htab_p - 5) = m1;679*(htab_p - 4) = m1;680*(htab_p - 3) = m1;681*(htab_p - 2) = m1;682*(htab_p - 1) = m1;683htab_p -= 16;684} while ((i -= 16) >= 0);685for (i += 16; i > 0; i--)686*--htab_p = m1;687}688689FILE *690zopen(const char *fname, const char *mode, int bits)691{692struct s_zstate *zs;693694if ((mode[0] != 'r' && mode[0] != 'w') || mode[1] != '\0' ||695bits < 0 || bits > BITS) {696errno = EINVAL;697return (NULL);698}699700if ((zs = calloc(1, sizeof(struct s_zstate))) == NULL)701return (NULL);702703maxbits = bits ? bits : BITS; /* User settable max # bits/code. */704maxmaxcode = 1L << maxbits; /* Should NEVER generate this code. */705hsize = HSIZE; /* For dynamic table sizing. */706free_ent = 0; /* First unused entry. */707block_compress = BLOCK_MASK;708clear_flg = 0;709ratio = 0;710checkpoint = CHECK_GAP;711in_count = 1; /* Length of input. */712out_count = 0; /* # of codes output (for debugging). */713state = S_START;714roffset = 0;715size = 0;716717/*718* Layering compress on top of stdio in order to provide buffering,719* and ensure that reads and write work with the data specified.720*/721if ((fp = fopen(fname, mode)) == NULL) {722free(zs);723return (NULL);724}725switch (*mode) {726case 'r':727zmode = 'r';728return (funopen(zs, zread, NULL, NULL, zclose));729case 'w':730zmode = 'w';731return (funopen(zs, NULL, zwrite, NULL, zclose));732}733/* NOTREACHED */734return (NULL);735}736737738