/* SPDX-License-Identifier: GPL-2.0-only */1/*2-*- linux-c -*-3drbd_receiver.c4This file is part of DRBD by Philipp Reisner and Lars Ellenberg.56Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.7Copyright (C) 1999-2008, Philipp Reisner <[email protected]>.8Copyright (C) 2002-2008, Lars Ellenberg <[email protected]>.910*/1112#ifndef _DRBD_VLI_H13#define _DRBD_VLI_H1415/*16* At a granularity of 4KiB storage represented per bit,17* and stroage sizes of several TiB,18* and possibly small-bandwidth replication,19* the bitmap transfer time can take much too long,20* if transmitted in plain text.21*22* We try to reduce the transferred bitmap information23* by encoding runlengths of bit polarity.24*25* We never actually need to encode a "zero" (runlengths are positive).26* But then we have to store the value of the first bit.27* The first bit of information thus shall encode if the first runlength28* gives the number of set or unset bits.29*30* We assume that large areas are either completely set or unset,31* which gives good compression with any runlength method,32* even when encoding the runlength as fixed size 32bit/64bit integers.33*34* Still, there may be areas where the polarity flips every few bits,35* and encoding the runlength sequence of those areas with fix size36* integers would be much worse than plaintext.37*38* We want to encode small runlength values with minimum code length,39* while still being able to encode a Huge run of all zeros.40*41* Thus we need a Variable Length Integer encoding, VLI.42*43* For some cases, we produce more code bits than plaintext input.44* We need to send incompressible chunks as plaintext, skip over them45* and then see if the next chunk compresses better.46*47* We don't care too much about "excellent" compression ratio for large48* runlengths (all set/all clear): whether we achieve a factor of 10049* or 1000 is not that much of an issue.50* We do not want to waste too much on short runlengths in the "noisy"51* parts of the bitmap, though.52*53* There are endless variants of VLI, we experimented with:54* * simple byte-based55* * various bit based with different code word length.56*57* To avoid yet an other configuration parameter (choice of bitmap compression58* algorithm) which was difficult to explain and tune, we just chose the one59* variant that turned out best in all test cases.60* Based on real world usage patterns, with device sizes ranging from a few GiB61* to several TiB, file server/mailserver/webserver/mysql/postgress,62* mostly idle to really busy, the all time winner (though sometimes only63* marginally better) is:64*/6566/*67* encoding is "visualised" as68* __little endian__ bitstream, least significant bit first (left most)69*70* this particular encoding is chosen so that the prefix code71* starts as unary encoding the level, then modified so that72* 10 levels can be described in 8bit, with minimal overhead73* for the smaller levels.74*75* Number of data bits follow fibonacci sequence, with the exception of the76* last level (+1 data bit, so it makes 64bit total). The only worse code when77* encoding bit polarity runlength is 1 plain bits => 2 code bits.78prefix data bits max val NÂș data bits790 x 0x2 18010 x 0x4 181110 xx 0x8 2821110 xxx 0x10 38311110 xxx xx 0x30 584111110 xx xxxxxx 0x130 88511111100 xxxxxxxx xxxxx 0x2130 138611111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 218711111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 348811111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 5689* maximum encodable value: 0x100000400202130 == 2**56 + some */9091/* compression "table":92transmitted x 0.2993as plaintext x ........................94x ........................95x ........................96x 0.59 0.21........................97x ........................................................98x .. c ...................................................99x 0.44.. o ...................................................100x .......... d ...................................................101x .......... e ...................................................102X............. ...................................................103x.............. b ...................................................1042.0x............... i ...................................................105#X................ t ...................................................106#................. s ........................... plain bits ..........107-+-----------------------------------------------------------------------1081 16 32 64109*/110111/* LEVEL: (total bits, prefix bits, prefix value),112* sorted ascending by number of total bits.113* The rest of the code table is calculated at compiletime from this. */114115/* fibonacci data 1, 1, ... */116#define VLI_L_1_1() do { \117LEVEL( 2, 1, 0x00); \118LEVEL( 3, 2, 0x01); \119LEVEL( 5, 3, 0x03); \120LEVEL( 7, 4, 0x07); \121LEVEL(10, 5, 0x0f); \122LEVEL(14, 6, 0x1f); \123LEVEL(21, 8, 0x3f); \124LEVEL(29, 8, 0x7f); \125LEVEL(42, 8, 0xbf); \126LEVEL(64, 8, 0xff); \127} while (0)128129/* finds a suitable level to decode the least significant part of in.130* returns number of bits consumed.131*132* BUG() for bad input, as that would mean a buggy code table. */133static inline int vli_decode_bits(u64 *out, const u64 in)134{135u64 adj = 1;136137#define LEVEL(t,b,v) \138do { \139if ((in & ((1 << b) -1)) == v) { \140*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \141return t; \142} \143adj += 1ULL << (t - b); \144} while (0)145146VLI_L_1_1();147148/* NOT REACHED, if VLI_LEVELS code table is defined properly */149BUG();150#undef LEVEL151}152153/* return number of code bits needed,154* or negative error number */155static inline int __vli_encode_bits(u64 *out, const u64 in)156{157u64 max = 0;158u64 adj = 1;159160if (in == 0)161return -EINVAL;162163#define LEVEL(t,b,v) do { \164max += 1ULL << (t - b); \165if (in <= max) { \166if (out) \167*out = ((in - adj) << b) | v; \168return t; \169} \170adj = max + 1; \171} while (0)172173VLI_L_1_1();174175return -EOVERFLOW;176#undef LEVEL177}178179#undef VLI_L_1_1180181/* code from here down is independend of actually used bit code */182183/*184* Code length is determined by some unique (e.g. unary) prefix.185* This encodes arbitrary bit length, not whole bytes: we have a bit-stream,186* not a byte stream.187*/188189/* for the bitstream, we need a cursor */190struct bitstream_cursor {191/* the current byte */192u8 *b;193/* the current bit within *b, nomalized: 0..7 */194unsigned int bit;195};196197/* initialize cursor to point to first bit of stream */198static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)199{200cur->b = s;201cur->bit = 0;202}203204/* advance cursor by that many bits; maximum expected input value: 64,205* but depending on VLI implementation, it may be more. */206static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)207{208bits += cur->bit;209cur->b = cur->b + (bits >> 3);210cur->bit = bits & 7;211}212213/* the bitstream itself knows its length */214struct bitstream {215struct bitstream_cursor cur;216unsigned char *buf;217size_t buf_len; /* in bytes */218219/* for input stream:220* number of trailing 0 bits for padding221* total number of valid bits in stream: buf_len * 8 - pad_bits */222unsigned int pad_bits;223};224225static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)226{227bs->buf = s;228bs->buf_len = len;229bs->pad_bits = pad_bits;230bitstream_cursor_reset(&bs->cur, bs->buf);231}232233static inline void bitstream_rewind(struct bitstream *bs)234{235bitstream_cursor_reset(&bs->cur, bs->buf);236memset(bs->buf, 0, bs->buf_len);237}238239/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.240* Ignores "pad_bits".241* Returns zero if bits == 0 (nothing to do).242* Returns number of bits used if successful.243*244* If there is not enough room left in bitstream,245* leaves bitstream unchanged and returns -ENOBUFS.246*/247static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)248{249unsigned char *b = bs->cur.b;250unsigned int tmp;251252if (bits == 0)253return 0;254255if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)256return -ENOBUFS;257258/* paranoia: strip off hi bits; they should not be set anyways. */259if (bits < 64)260val &= ~0ULL >> (64 - bits);261262*b++ |= (val & 0xff) << bs->cur.bit;263264for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)265*b++ |= (val >> tmp) & 0xff;266267bitstream_cursor_advance(&bs->cur, bits);268return bits;269}270271/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.272*273* If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.274*275* If there are less than the requested number of valid bits left in the276* bitstream, still fetches all available bits.277*278* Returns number of actually fetched bits.279*/280static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)281{282u64 val;283unsigned int n;284285if (bits > 64)286return -EINVAL;287288if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)289bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)290- bs->cur.bit - bs->pad_bits;291292if (bits == 0) {293*out = 0;294return 0;295}296297/* get the high bits */298val = 0;299n = (bs->cur.bit + bits + 7) >> 3;300/* n may be at most 9, if cur.bit + bits > 64 */301/* which means this copies at most 8 byte */302if (n) {303memcpy(&val, bs->cur.b+1, n - 1);304val = le64_to_cpu(val) << (8 - bs->cur.bit);305}306307/* we still need the low bits */308val |= bs->cur.b[0] >> bs->cur.bit;309310/* and mask out bits we don't want */311val &= ~0ULL >> (64 - bits);312313bitstream_cursor_advance(&bs->cur, bits);314*out = val;315316return bits;317}318319/* encodes @in as vli into @bs;320321* return values322* > 0: number of bits successfully stored in bitstream323* -ENOBUFS @bs is full324* -EINVAL input zero (invalid)325* -EOVERFLOW input too large for this vli code (invalid)326*/327static inline int vli_encode_bits(struct bitstream *bs, u64 in)328{329u64 code;330int bits = __vli_encode_bits(&code, in);331332if (bits <= 0)333return bits;334335return bitstream_put_bits(bs, code, bits);336}337338#endif339340341