/*
 * Web viewer residue (not part of the original source file):
 * Path: blob/main/sys/contrib/xz-embedded/linux/lib/xz/xz_dec_bcj.c
 * 48521 views
 */
/*1* Branch/Call/Jump (BCJ) filter decoders2*3* Authors: Lasse Collin <[email protected]>4* Igor Pavlov <https://7-zip.org/>5*6* This file has been put into the public domain.7* You can do whatever you want with this file.8*/910#include "xz_private.h"1112/*13* The rest of the file is inside this ifdef. It makes things a little more14* convenient when building without support for any BCJ filters.15*/16#ifdef XZ_DEC_BCJ1718struct xz_dec_bcj {19/* Type of the BCJ filter being used */20enum {21BCJ_X86 = 4, /* x86 or x86-64 */22BCJ_POWERPC = 5, /* Big endian only */23BCJ_IA64 = 6, /* Big or little endian */24BCJ_ARM = 7, /* Little endian only */25BCJ_ARMTHUMB = 8, /* Little endian only */26BCJ_SPARC = 9 /* Big or little endian */27} type;2829/*30* Return value of the next filter in the chain. We need to preserve31* this information across calls, because we must not call the next32* filter anymore once it has returned XZ_STREAM_END.33*/34enum xz_ret ret;3536/* True if we are operating in single-call mode. */37bool single_call;3839/*40* Absolute position relative to the beginning of the uncompressed41* data (in a single .xz Block). We care only about the lowest 3242* bits so this doesn't need to be uint64_t even with big files.43*/44uint32_t pos;4546/* x86 filter state */47uint32_t x86_prev_mask;4849/* Temporary space to hold the variables from struct xz_buf */50uint8_t *out;51size_t out_pos;52size_t out_size;5354struct {55/* Amount of already filtered data in the beginning of buf */56size_t filtered;5758/* Total amount of data currently stored in buf */59size_t size;6061/*62* Buffer to hold a mix of filtered and unfiltered data. 
This63* needs to be big enough to hold Alignment + 2 * Look-ahead:64*65* Type Alignment Look-ahead66* x86 1 467* PowerPC 4 068* IA-64 16 069* ARM 4 070* ARM-Thumb 2 271* SPARC 4 072*/73uint8_t buf[16];74} temp;75};7677#ifdef XZ_DEC_X8678/*79* This is used to test the most significant byte of a memory address80* in an x86 instruction.81*/82static inline int bcj_x86_test_msbyte(uint8_t b)83{84return b == 0x00 || b == 0xFF;85}8687static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size)88{89static const bool mask_to_allowed_status[8]90= { true, true, true, false, true, false, false, false };9192static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };9394size_t i;95size_t prev_pos = (size_t)-1;96uint32_t prev_mask = s->x86_prev_mask;97uint32_t src;98uint32_t dest;99uint32_t j;100uint8_t b;101102if (size <= 4)103return 0;104105size -= 4;106for (i = 0; i < size; ++i) {107if ((buf[i] & 0xFE) != 0xE8)108continue;109110prev_pos = i - prev_pos;111if (prev_pos > 3) {112prev_mask = 0;113} else {114prev_mask = (prev_mask << (prev_pos - 1)) & 7;115if (prev_mask != 0) {116b = buf[i + 4 - mask_to_bit_num[prev_mask]];117if (!mask_to_allowed_status[prev_mask]118|| bcj_x86_test_msbyte(b)) {119prev_pos = i;120prev_mask = (prev_mask << 1) | 1;121continue;122}123}124}125126prev_pos = i;127128if (bcj_x86_test_msbyte(buf[i + 4])) {129src = get_unaligned_le32(buf + i + 1);130while (true) {131dest = src - (s->pos + (uint32_t)i + 5);132if (prev_mask == 0)133break;134135j = mask_to_bit_num[prev_mask] * 8;136b = (uint8_t)(dest >> (24 - j));137if (!bcj_x86_test_msbyte(b))138break;139140src = dest ^ (((uint32_t)1 << (32 - j)) - 1);141}142143dest &= 0x01FFFFFF;144dest |= (uint32_t)0 - (dest & 0x01000000);145put_unaligned_le32(dest, buf + i + 1);146i += 4;147} else {148prev_mask = (prev_mask << 1) | 1;149}150}151152prev_pos = i - prev_pos;153s->x86_prev_mask = prev_pos > 3 ? 
0 : prev_mask << (prev_pos - 1);154return i;155}156#endif157158#ifdef XZ_DEC_POWERPC159static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)160{161size_t i;162uint32_t instr;163164for (i = 0; i + 4 <= size; i += 4) {165instr = get_unaligned_be32(buf + i);166if ((instr & 0xFC000003) == 0x48000001) {167instr &= 0x03FFFFFC;168instr -= s->pos + (uint32_t)i;169instr &= 0x03FFFFFC;170instr |= 0x48000001;171put_unaligned_be32(instr, buf + i);172}173}174175return i;176}177#endif178179#ifdef XZ_DEC_IA64180static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size)181{182static const uint8_t branch_table[32] = {1830, 0, 0, 0, 0, 0, 0, 0,1840, 0, 0, 0, 0, 0, 0, 0,1854, 4, 6, 6, 0, 0, 7, 7,1864, 4, 0, 0, 4, 4, 0, 0187};188189/*190* The local variables take a little bit stack space, but it's less191* than what LZMA2 decoder takes, so it doesn't make sense to reduce192* stack usage here without doing that for the LZMA2 decoder too.193*/194195/* Loop counters */196size_t i;197size_t j;198199/* Instruction slot (0, 1, or 2) in the 128-bit instruction word */200uint32_t slot;201202/* Bitwise offset of the instruction indicated by slot */203uint32_t bit_pos;204205/* bit_pos split into byte and bit parts */206uint32_t byte_pos;207uint32_t bit_res;208209/* Address part of an instruction */210uint32_t addr;211212/* Mask used to detect which instructions to convert */213uint32_t mask;214215/* 41-bit instruction stored somewhere in the lowest 48 bits */216uint64_t instr;217218/* Instruction normalized with bit_res for easier manipulation */219uint64_t norm;220221for (i = 0; i + 16 <= size; i += 16) {222mask = branch_table[buf[i] & 0x1F];223for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {224if (((mask >> slot) & 1) == 0)225continue;226227byte_pos = bit_pos >> 3;228bit_res = bit_pos & 7;229instr = 0;230for (j = 0; j < 6; ++j)231instr |= (uint64_t)(buf[i + j + byte_pos])232<< (8 * j);233234norm = instr >> bit_res;235236if (((norm >> 37) & 0x0F) 
== 0x05237&& ((norm >> 9) & 0x07) == 0) {238addr = (norm >> 13) & 0x0FFFFF;239addr |= ((uint32_t)(norm >> 36) & 1) << 20;240addr <<= 4;241addr -= s->pos + (uint32_t)i;242addr >>= 4;243244norm &= ~((uint64_t)0x8FFFFF << 13);245norm |= (uint64_t)(addr & 0x0FFFFF) << 13;246norm |= (uint64_t)(addr & 0x100000)247<< (36 - 20);248249instr &= (1 << bit_res) - 1;250instr |= norm << bit_res;251252for (j = 0; j < 6; j++)253buf[i + j + byte_pos]254= (uint8_t)(instr >> (8 * j));255}256}257}258259return i;260}261#endif262263#ifdef XZ_DEC_ARM264static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size)265{266size_t i;267uint32_t addr;268269for (i = 0; i + 4 <= size; i += 4) {270if (buf[i + 3] == 0xEB) {271addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)272| ((uint32_t)buf[i + 2] << 16);273addr <<= 2;274addr -= s->pos + (uint32_t)i + 8;275addr >>= 2;276buf[i] = (uint8_t)addr;277buf[i + 1] = (uint8_t)(addr >> 8);278buf[i + 2] = (uint8_t)(addr >> 16);279}280}281282return i;283}284#endif285286#ifdef XZ_DEC_ARMTHUMB287static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size)288{289size_t i;290uint32_t addr;291292for (i = 0; i + 4 <= size; i += 2) {293if ((buf[i + 1] & 0xF8) == 0xF0294&& (buf[i + 3] & 0xF8) == 0xF8) {295addr = (((uint32_t)buf[i + 1] & 0x07) << 19)296| ((uint32_t)buf[i] << 11)297| (((uint32_t)buf[i + 3] & 0x07) << 8)298| (uint32_t)buf[i + 2];299addr <<= 1;300addr -= s->pos + (uint32_t)i + 4;301addr >>= 1;302buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));303buf[i] = (uint8_t)(addr >> 11);304buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));305buf[i + 2] = (uint8_t)addr;306i += 2;307}308}309310return i;311}312#endif313314#ifdef XZ_DEC_SPARC315static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)316{317size_t i;318uint32_t instr;319320for (i = 0; i + 4 <= size; i += 4) {321instr = get_unaligned_be32(buf + i);322if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {323instr <<= 2;324instr -= s->pos + 
(uint32_t)i;325instr >>= 2;326instr = ((uint32_t)0x40000000 - (instr & 0x400000))327| 0x40000000 | (instr & 0x3FFFFF);328put_unaligned_be32(instr, buf + i);329}330}331332return i;333}334#endif335336/*337* Apply the selected BCJ filter. Update *pos and s->pos to match the amount338* of data that got filtered.339*340* NOTE: This is implemented as a switch statement to avoid using function341* pointers, which could be problematic in the kernel boot code, which must342* avoid pointers to static data (at least on x86).343*/344static void bcj_apply(struct xz_dec_bcj *s,345uint8_t *buf, size_t *pos, size_t size)346{347size_t filtered;348349buf += *pos;350size -= *pos;351352switch (s->type) {353#ifdef XZ_DEC_X86354case BCJ_X86:355filtered = bcj_x86(s, buf, size);356break;357#endif358#ifdef XZ_DEC_POWERPC359case BCJ_POWERPC:360filtered = bcj_powerpc(s, buf, size);361break;362#endif363#ifdef XZ_DEC_IA64364case BCJ_IA64:365filtered = bcj_ia64(s, buf, size);366break;367#endif368#ifdef XZ_DEC_ARM369case BCJ_ARM:370filtered = bcj_arm(s, buf, size);371break;372#endif373#ifdef XZ_DEC_ARMTHUMB374case BCJ_ARMTHUMB:375filtered = bcj_armthumb(s, buf, size);376break;377#endif378#ifdef XZ_DEC_SPARC379case BCJ_SPARC:380filtered = bcj_sparc(s, buf, size);381break;382#endif383default:384/* Never reached but silence compiler warnings. 
*/385filtered = 0;386break;387}388389*pos += filtered;390s->pos += filtered;391}392393/*394* Flush pending filtered data from temp to the output buffer.395* Move the remaining mixture of possibly filtered and unfiltered396* data to the beginning of temp.397*/398static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)399{400size_t copy_size;401402copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);403memcpy(b->out + b->out_pos, s->temp.buf, copy_size);404b->out_pos += copy_size;405406s->temp.filtered -= copy_size;407s->temp.size -= copy_size;408memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);409}410411/*412* The BCJ filter functions are primitive in sense that they process the413* data in chunks of 1-16 bytes. To hide this issue, this function does414* some buffering.415*/416XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s,417struct xz_dec_lzma2 *lzma2,418struct xz_buf *b)419{420size_t out_start;421422/*423* Flush pending already filtered data to the output buffer. Return424* immediately if we couldn't flush everything, or if the next425* filter in the chain had already returned XZ_STREAM_END.426*/427if (s->temp.filtered > 0) {428bcj_flush(s, b);429if (s->temp.filtered > 0)430return XZ_OK;431432if (s->ret == XZ_STREAM_END)433return XZ_STREAM_END;434}435436/*437* If we have more output space than what is currently pending in438* temp, copy the unfiltered data from temp to the output buffer439* and try to fill the output buffer by decoding more data from the440* next filter in the chain. Apply the BCJ filter on the new data441* in the output buffer. 
If everything cannot be filtered, copy it442* to temp and rewind the output buffer position accordingly.443*444* This needs to be always run when temp.size == 0 to handle a special445* case where the output buffer is full and the next filter has no446* more output coming but hasn't returned XZ_STREAM_END yet.447*/448if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) {449out_start = b->out_pos;450memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);451b->out_pos += s->temp.size;452453s->ret = xz_dec_lzma2_run(lzma2, b);454if (s->ret != XZ_STREAM_END455&& (s->ret != XZ_OK || s->single_call))456return s->ret;457458bcj_apply(s, b->out, &out_start, b->out_pos);459460/*461* As an exception, if the next filter returned XZ_STREAM_END,462* we can do that too, since the last few bytes that remain463* unfiltered are meant to remain unfiltered.464*/465if (s->ret == XZ_STREAM_END)466return XZ_STREAM_END;467468s->temp.size = b->out_pos - out_start;469b->out_pos -= s->temp.size;470memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);471472/*473* If there wasn't enough input to the next filter to fill474* the output buffer with unfiltered data, there's no point475* to try decoding more data to temp.476*/477if (b->out_pos + s->temp.size < b->out_size)478return XZ_OK;479}480481/*482* We have unfiltered data in temp. If the output buffer isn't full483* yet, try to fill the temp buffer by decoding more data from the484* next filter. Apply the BCJ filter on temp. Then we hopefully can485* fill the actual output buffer by copying filtered data from temp.486* A mix of filtered and unfiltered data may be left in temp; it will487* be taken care on the next call to this function.488*/489if (b->out_pos < b->out_size) {490/* Make b->out{,_pos,_size} temporarily point to s->temp. 
*/491s->out = b->out;492s->out_pos = b->out_pos;493s->out_size = b->out_size;494b->out = s->temp.buf;495b->out_pos = s->temp.size;496b->out_size = sizeof(s->temp.buf);497498s->ret = xz_dec_lzma2_run(lzma2, b);499500s->temp.size = b->out_pos;501b->out = s->out;502b->out_pos = s->out_pos;503b->out_size = s->out_size;504505if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)506return s->ret;507508bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);509510/*511* If the next filter returned XZ_STREAM_END, we mark that512* everything is filtered, since the last unfiltered bytes513* of the stream are meant to be left as is.514*/515if (s->ret == XZ_STREAM_END)516s->temp.filtered = s->temp.size;517518bcj_flush(s, b);519if (s->temp.filtered > 0)520return XZ_OK;521}522523return s->ret;524}525526XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call)527{528struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL);529if (s != NULL)530s->single_call = single_call;531532return s;533}534535XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id)536{537switch (id) {538#ifdef XZ_DEC_X86539case BCJ_X86:540#endif541#ifdef XZ_DEC_POWERPC542case BCJ_POWERPC:543#endif544#ifdef XZ_DEC_IA64545case BCJ_IA64:546#endif547#ifdef XZ_DEC_ARM548case BCJ_ARM:549#endif550#ifdef XZ_DEC_ARMTHUMB551case BCJ_ARMTHUMB:552#endif553#ifdef XZ_DEC_SPARC554case BCJ_SPARC:555#endif556break;557558default:559/* Unsupported Filter ID */560return XZ_OPTIONS_ERROR;561}562563s->type = id;564s->ret = XZ_OK;565s->pos = 0;566s->x86_prev_mask = 0;567s->temp.filtered = 0;568s->temp.size = 0;569570return XZ_OK;571}572573#endif574575576