/* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2)12023-04-02 : Igor Pavlov : Public domain */23#include "Precomp.h"45/* #define SHOW_STAT */6#ifdef SHOW_STAT7#include <stdio.h>8#define PRF2(s) printf("%s ip=%8x tempPos=%d src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src));9#else10#define PRF2(s)11#endif1213#include "Bcj2.h"14#include "CpuArch.h"1516#define kTopValue ((UInt32)1 << 24)17#define kNumBitModelTotalBits 1118#define kBitModelTotal (1 << kNumBitModelTotalBits)19#define kNumMoveBits 52021void Bcj2Enc_Init(CBcj2Enc *p)22{23unsigned i;24p->state = BCJ2_ENC_STATE_ORIG;25p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;26p->context = 0;27p->flushRem = 5;28p->isFlushState = 0;29p->cache = 0;30p->range = 0xffffffff;31p->low = 0;32p->cacheSize = 1;33p->ip64 = 0;34p->fileIp64 = 0;35p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED;36p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT;37// p->relatExcludeBits = 0;38p->tempPos = 0;39for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++)40p->probs[i] = kBitModelTotal >> 1;41}4243// Z7_NO_INLINE44Z7_FORCE_INLINE45static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p)46{47const UInt32 low = (UInt32)p->low;48const unsigned high = (unsigned)49#if defined(Z7_MSC_VER_ORIGINAL) \50&& defined(MY_CPU_X86) \51&& defined(MY_CPU_LE) \52&& !defined(MY_CPU_64BIT)53// we try to rid of __aullshr() call in MSVS-x8654(((const UInt32 *)&p->low)[1]); // [1] : for little-endian only55#else56(p->low >> 32);57#endif58if (low < (UInt32)0xff000000 || high != 0)59{60Byte *buf = p->bufs[BCJ2_STREAM_RC];61do62{63if (buf == p->lims[BCJ2_STREAM_RC])64{65p->state = BCJ2_STREAM_RC;66p->bufs[BCJ2_STREAM_RC] = buf;67return True;68}69*buf++ = (Byte)(p->cache + high);70p->cache = 0xff;71}72while (--p->cacheSize);73p->bufs[BCJ2_STREAM_RC] = buf;74p->cache = (Byte)(low >> 24);75}76p->cacheSize++;77p->low = low << 8;78return False;79}808182/*83We can use 2 alternative versions of code:841) non-marker version:85Byte CBcj2Enc::context86Byte temp[8];87Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer.88Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction89with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call.90912) marker version:92UInt32 CBcj2Enc::context93Byte CBcj2Enc::temp[4];94MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains finded marker.95it's allowed that96one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest,97and another call of Bcj2Enc_Encode_2() does offset conversion.98So different values of (fileIp) and (fileSize) are possible99in these different Bcj2Enc_Encode_2() calls.100101Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop.102So we use non-marker version.103*/104105/*106Corner cases with overlap in multi-block.107before v23: there was one corner case, where converted instruction108could start in one sub-stream and finish in next sub-stream.109If multi-block (solid) encoding is used,110and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream.111and (0f) is last byte of previous sub-stream112and (8x) is first byte of current sub-stream113then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder.114BCJ2 encoder can converts 32-bit offset for that (0f 8x) cortage,115if that offset meets limit requirements.116If encoder allows 32-bit offset conversion for such overlap case,117then the data in 3 uncompressed BCJ2 streams for some sub-stream118can depend from data of previous sub-stream.119That corner case is not big problem, and it's rare case.120Since v23.00 we do additional check to prevent conversions in such overlap cases.121*/122123/*124Bcj2Enc_Encode_2() output variables at exit:125{126if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG))127{128it means that encoder needs more input data.129if (p->srcLim == p->src) at exit, then130{131(p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)132all input data were read and processed, and we are ready for133new input data.134}135else136{137(p->srcLim != p->src)138(p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)139The encoder have found e8/e9/0f_8x marker,140and p->src points to last byte of that marker,141Bcj2Enc_Encode_2() needs more input data to get totally1425 bytes (last byte of marker and 32-bit branch offset)143as continuous array starting from p->src.144(p->srcLim - p->src < 5) requirement is met after exit.145So non-processed resedue from p->src to p->srcLim is always less than 5 bytes.146}147}148}149*/150151Z7_NO_INLINE152static void Bcj2Enc_Encode_2(CBcj2Enc *p)153{154if (!p->isFlushState)155{156const Byte *src;157UInt32 v;158{159const unsigned state = p->state;160if (BCJ2_IS_32BIT_STREAM(state))161{162Byte *cur = p->bufs[state];163if (cur == p->lims[state])164return;165SetBe32a(cur, p->tempTarget)166p->bufs[state] = cur + 4;167}168}169p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit170src = p->src;171v = p->context;172173// #define WRITE_CONTEXT p->context = v; // for marker version174#define WRITE_CONTEXT p->context = (Byte)v;175#define WRITE_CONTEXT_AND_SRC p->src = src; WRITE_CONTEXT176177for (;;)178{179// const Byte *src;180// UInt32 v;181CBcj2Enc_ip_unsigned ip;182if (p->range < kTopValue)183{184// to reduce register pressure and code size: we save and restore local variables.185WRITE_CONTEXT_AND_SRC186if (Bcj2_RangeEnc_ShiftLow(p))187return;188p->range <<= 8;189src = p->src;190v = p->context;191}192// src = p->src;193// #define MARKER_FLAG ((UInt32)1 << 17)194// if ((v & MARKER_FLAG) == 0) // for marker version195{196const Byte *srcLim;197Byte *dest = p->bufs[BCJ2_STREAM_MAIN];198{199const SizeT remSrc = (SizeT)(p->srcLim - src);200SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest);201if (rem >= remSrc)202rem = remSrc;203srcLim = src + rem;204}205/* p->context contains context of previous byte:206bits [0 : 7] : src[-1], if (src) was changed in this call207bits [8 : 31] : are undefined for non-marker version208*/209// v = p->context;210#define NUM_SHIFT_BITS 24211#define CONV_FLAG ((UInt32)1 << 16)212#define ONE_ITER { \213b = src[0]; \214*dest++ = (Byte)b; \215v = (v << NUM_SHIFT_BITS) | b; \216if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \217if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \218((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \219src++; if (src == srcLim) { break; } }220221if (src != srcLim)222for (;;)223{224/* clang can generate ineffective code with setne instead of two jcc instructions.225we can use 2 iterations and external (unsigned b) to avoid that ineffective code genaration. */226unsigned b;227ONE_ITER228ONE_ITER229}230231ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]);232p->bufs[BCJ2_STREAM_MAIN] = dest;233p->ip64 = ip;234235if (src == srcLim)236{237WRITE_CONTEXT_AND_SRC238if (src != p->srcLim)239{240p->state = BCJ2_STREAM_MAIN;241return;242}243/* (p->src == p->srcLim)244(p->state == BCJ2_ENC_STATE_ORIG) */245if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)246return;247/* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM */248// (p->flushRem == 5);249p->isFlushState = 1;250break;251}252src++;253// p->src = src;254}255// ip = p->ip; // for marker version256/* marker was found */257/* (v) contains marker that was found:258bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7]259: value of src[-2] : xx/xx/0f260bits [0 : 7] : value of src[-1] : e8/e9/8x261*/262{263{264#if NUM_SHIFT_BITS != 24265v &= ~(UInt32)CONV_FLAG;266#endif267// UInt32 relat = 0;268if ((SizeT)(p->srcLim - src) >= 4)269{270/*271if (relat != 0 || (Byte)v != 0xe8)272BoolInt isBigOffset = True;273*/274const UInt32 relat = GetUi32(src);275/*276#define EXCLUDE_FLAG ((UInt32)1 << 4)277#define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0)278if (p->relatExcludeBits != 0)279{280const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1);281isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0);282}283// isBigOffset = False; // for debug284*/285ip -= p->fileIp64;286// Use the following if check, if (ip) is 64-bit:287if (ip > (((v + 0x20) >> 5) & 1)) // 23.00 : we eliminate milti-block overlap for (Of 80) and (e8/e9)288if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1)289if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit)290v |= CONV_FLAG;291}292else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)293{294// (p->srcLim - src < 4)295// /*296// for non-marker version297p->ip64--; // p->ip = ip - 1;298p->bufs[BCJ2_STREAM_MAIN]--;299src--;300v >>= NUM_SHIFT_BITS;301// (0 < p->srcLim - p->src <= 4)302// */303// v |= MARKER_FLAG; // for marker version304/* (p->state == BCJ2_ENC_STATE_ORIG) */305WRITE_CONTEXT_AND_SRC306return;307}308{309const unsigned c = ((v + 0x17) >> 6) & 1;310CBcj2Prob *prob = p->probs + (unsigned)311(((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1));312/*313((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) :314((Byte)v < 0xe8 ? 0 : 1)); // ((v >> 5) & 1));315*/316const unsigned ttt = *prob;317const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt;318if ((v & CONV_FLAG) == 0)319{320// static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy);321// v = (Byte)v; // for marker version322p->range = bound;323*prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));324// WRITE_CONTEXT_AND_SRC325continue;326}327p->low += bound;328p->range -= bound;329*prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits));330}331// p->context = src[3];332{333// const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP);334const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL;335ip = p->ip64;336v = GetUi32(src); // relat337ip += 4;338p->ip64 = ip;339src += 4;340// p->src = src;341{342const UInt32 absol = (UInt32)ip + v;343Byte *cur = p->bufs[cj];344v >>= 24;345// WRITE_CONTEXT346if (cur == p->lims[cj])347{348p->state = cj;349p->tempTarget = absol;350WRITE_CONTEXT_AND_SRC351return;352}353SetBe32a(cur, absol)354p->bufs[cj] = cur + 4;355}356}357}358}359} // end of loop360}361362for (; p->flushRem != 0; p->flushRem--)363if (Bcj2_RangeEnc_ShiftLow(p))364return;365p->state = BCJ2_ENC_STATE_FINISHED;366}367368369/*370BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer.371So base function Bcj2Enc_Encode_2()372in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with373(p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim)374Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer.375so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(),376then (p->src == p->srcLim).377And the caller's code is simpler with Bcj2Enc_Encode().378*/379380Z7_NO_INLINE381void Bcj2Enc_Encode(CBcj2Enc *p)382{383PRF2("\n----")384if (p->tempPos != 0)385{386/* extra: number of bytes that were copied from (src) to (temp) buffer in this call */387unsigned extra = 0;388/* We will touch only minimal required number of bytes in input (src) stream.389So we will add input bytes from (src) stream to temp[] with step of 1 byte.390We don't add new bytes to temp[] before Bcj2Enc_Encode_2() call391in first loop iteration because392- previous call of Bcj2Enc_Encode() could use another (finishMode),393- previous call could finish with (p->state != BCJ2_ENC_STATE_ORIG).394the case with full temp[] buffer (p->tempPos == 4) is possible here.395*/396for (;;)397{398// (0 < p->tempPos <= 5) // in non-marker version399/* p->src : the current src data position including extra bytes400that were copied to temp[] buffer in this call */401const Byte *src = p->src;402const Byte *srcLim = p->srcLim;403const EBcj2Enc_FinishMode finishMode = p->finishMode;404if (src != srcLim)405{406/* if there are some src data after the data copied to temp[],407then we use MODE_CONTINUE for temp data */408p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;409}410p->src = p->temp;411p->srcLim = p->temp + p->tempPos;412PRF2(" ")413Bcj2Enc_Encode_2(p);414{415const unsigned num = (unsigned)(p->src - p->temp);416const unsigned tempPos = p->tempPos - num;417unsigned i;418p->tempPos = tempPos;419for (i = 0; i < tempPos; i++)420p->temp[i] = p->temp[(SizeT)i + num];421// tempPos : number of bytes in temp buffer422p->src = src;423p->srcLim = srcLim;424p->finishMode = finishMode;425if (p->state != BCJ2_ENC_STATE_ORIG)426{427// (p->tempPos <= 4) // in non-marker version428/* if (the reason of exit from Bcj2Enc_Encode_2()429is not BCJ2_ENC_STATE_ORIG),430then we exit from Bcj2Enc_Encode() with same reason */431// optional code begin : we rollback (src) and tempPos, if it's possible:432if (extra >= tempPos)433extra = tempPos;434p->src = src - extra;435p->tempPos = tempPos - extra;436// optional code end : rollback of (src) and tempPos437return;438}439/* (p->tempPos <= 4)440(p->state == BCJ2_ENC_STATE_ORIG)441so encoder needs more data than in temp[] */442if (src == srcLim)443return; // src buffer has no more input data.444/* (src != srcLim)445so we can provide more input data from src for Bcj2Enc_Encode_2() */446if (extra >= tempPos)447{448/* (extra >= tempPos) means that temp buffer contains449only data from src buffer of this call.450So now we can encode without temp buffer */451p->src = src - tempPos; // rollback (src)452p->tempPos = 0;453break;454}455// we append one additional extra byte from (src) to temp[] buffer:456p->temp[tempPos] = *src;457p->tempPos = tempPos + 1;458// (0 < p->tempPos <= 5) // in non-marker version459p->src = src + 1;460extra++;461}462}463}464465PRF2("++++")466// (p->tempPos == 0)467Bcj2Enc_Encode_2(p);468PRF2("====")469470if (p->state == BCJ2_ENC_STATE_ORIG)471{472const Byte *src = p->src;473const Byte *srcLim = p->srcLim;474const unsigned rem = (unsigned)(srcLim - src);475/* (rem <= 4) here.476if (p->src != p->srcLim), then477- we copy non-processed bytes from (p->src) to temp[] buffer,478- we set p->src equal to p->srcLim.479*/480if (rem)481{482unsigned i = 0;483p->src = srcLim;484p->tempPos = rem;485// (0 < p->tempPos <= 4)486do487p->temp[i] = src[i];488while (++i != rem);489}490// (p->tempPos <= 4)491// (p->src == p->srcLim)492}493}494495#undef PRF2496#undef CONV_FLAG497#undef MARKER_FLAG498#undef WRITE_CONTEXT499#undef WRITE_CONTEXT_AND_SRC500#undef ONE_ITER501#undef NUM_SHIFT_BITS502#undef kTopValue503#undef kNumBitModelTotalBits504#undef kBitModelTotal505#undef kNumMoveBits506507508