CoCalc -- drbd

GitHub Repository: torvalds/linux
Path: blob/master/drivers/block/drbd/drbd_vli.h
²⁶²⁸² views
1
/* SPDX-License-Identifier: GPL-2.0-only */
2
/*
3
-*- linux-c -*-
4
   drbd_receiver.c
5
   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6

7
   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
8
   Copyright (C) 1999-2008, Philipp Reisner <[email protected]>.
9
   Copyright (C) 2002-2008, Lars Ellenberg <[email protected]>.
10

11
 */
12

13
#ifndef _DRBD_VLI_H
14
#define _DRBD_VLI_H
15

16
/*
17
 * At a granularity of 4KiB storage represented per bit,
18
 * and stroage sizes of several TiB,
19
 * and possibly small-bandwidth replication,
20
 * the bitmap transfer time can take much too long,
21
 * if transmitted in plain text.
22
 *
23
 * We try to reduce the transferred bitmap information
24
 * by encoding runlengths of bit polarity.
25
 *
26
 * We never actually need to encode a "zero" (runlengths are positive).
27
 * But then we have to store the value of the first bit.
28
 * The first bit of information thus shall encode if the first runlength
29
 * gives the number of set or unset bits.
30
 *
31
 * We assume that large areas are either completely set or unset,
32
 * which gives good compression with any runlength method,
33
 * even when encoding the runlength as fixed size 32bit/64bit integers.
34
 *
35
 * Still, there may be areas where the polarity flips every few bits,
36
 * and encoding the runlength sequence of those areas with fix size
37
 * integers would be much worse than plaintext.
38
 *
39
 * We want to encode small runlength values with minimum code length,
40
 * while still being able to encode a Huge run of all zeros.
41
 *
42
 * Thus we need a Variable Length Integer encoding, VLI.
43
 *
44
 * For some cases, we produce more code bits than plaintext input.
45
 * We need to send incompressible chunks as plaintext, skip over them
46
 * and then see if the next chunk compresses better.
47
 *
48
 * We don't care too much about "excellent" compression ratio for large
49
 * runlengths (all set/all clear): whether we achieve a factor of 100
50
 * or 1000 is not that much of an issue.
51
 * We do not want to waste too much on short runlengths in the "noisy"
52
 * parts of the bitmap, though.
53
 *
54
 * There are endless variants of VLI, we experimented with:
55
 *  * simple byte-based
56
 *  * various bit based with different code word length.
57
 *
58
 * To avoid yet an other configuration parameter (choice of bitmap compression
59
 * algorithm) which was difficult to explain and tune, we just chose the one
60
 * variant that turned out best in all test cases.
61
 * Based on real world usage patterns, with device sizes ranging from a few GiB
62
 * to several TiB, file server/mailserver/webserver/mysql/postgress,
63
 * mostly idle to really busy, the all time winner (though sometimes only
64
 * marginally better) is:
65
 */
66

67
/*
68
 * encoding is "visualised" as
69
 * __little endian__ bitstream, least significant bit first (left most)
70
 *
71
 * this particular encoding is chosen so that the prefix code
72
 * starts as unary encoding the level, then modified so that
73
 * 10 levels can be described in 8bit, with minimal overhead
74
 * for the smaller levels.
75
 *
76
 * Number of data bits follow fibonacci sequence, with the exception of the
77
 * last level (+1 data bit, so it makes 64bit total).  The only worse code when
78
 * encoding bit polarity runlength is 1 plain bits => 2 code bits.
79
prefix    data bits                                    max val  Nº data bits
80
0 x                                                         0x2            1
81
10 x                                                        0x4            1
82
110 xx                                                      0x8            2
83
1110 xxx                                                   0x10            3
84
11110 xxx xx                                               0x30            5
85
111110 xx xxxxxx                                          0x130            8
86
11111100  xxxxxxxx xxxxx                                 0x2130           13
87
11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
88
11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
89
11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
90
 * maximum encodable value: 0x100000400202130 == 2**56 + some */
91

92
/* compression "table":
93
 transmitted   x                                0.29
94
 as plaintext x                                  ........................
95
             x                                   ........................
96
            x                                    ........................
97
           x    0.59                         0.21........................
98
          x      ........................................................
99
         x       .. c ...................................................
100
        x    0.44.. o ...................................................
101
       x .......... d ...................................................
102
      x  .......... e ...................................................
103
     X.............   ...................................................
104
    x.............. b ...................................................
105
2.0x............... i ...................................................
106
 #X................ t ...................................................
107
 #................. s ...........................  plain bits  ..........
108
-+-----------------------------------------------------------------------
109
 1             16              32                              64
110
*/
111

112
/* LEVEL: (total bits, prefix bits, prefix value),
113
 * sorted ascending by number of total bits.
114
 * The rest of the code table is calculated at compiletime from this. */
115

116
/* fibonacci data 1, 1, ... */
117
#define VLI_L_1_1() do { \
118
	LEVEL( 2, 1, 0x00); \
119
	LEVEL( 3, 2, 0x01); \
120
	LEVEL( 5, 3, 0x03); \
121
	LEVEL( 7, 4, 0x07); \
122
	LEVEL(10, 5, 0x0f); \
123
	LEVEL(14, 6, 0x1f); \
124
	LEVEL(21, 8, 0x3f); \
125
	LEVEL(29, 8, 0x7f); \
126
	LEVEL(42, 8, 0xbf); \
127
	LEVEL(64, 8, 0xff); \
128
	} while (0)
129

130
/* finds a suitable level to decode the least significant part of in.
131
 * returns number of bits consumed.
132
 *
133
 * BUG() for bad input, as that would mean a buggy code table. */
134
static inline int vli_decode_bits(u64 *out, const u64 in)
135
{
136
	u64 adj = 1;
137

138
#define LEVEL(t,b,v)					\
139
	do {						\
140
		if ((in & ((1 << b) -1)) == v) {	\
141
			*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj;	\
142
			return t;			\
143
		}					\
144
		adj += 1ULL << (t - b);			\
145
	} while (0)
146

147
	VLI_L_1_1();
148

149
	/* NOT REACHED, if VLI_LEVELS code table is defined properly */
150
	BUG();
151
#undef LEVEL
152
}
153

154
/* return number of code bits needed,
155
 * or negative error number */
156
static inline int __vli_encode_bits(u64 *out, const u64 in)
157
{
158
	u64 max = 0;
159
	u64 adj = 1;
160

161
	if (in == 0)
162
		return -EINVAL;
163

164
#define LEVEL(t,b,v) do {		\
165
		max += 1ULL << (t - b);	\
166
		if (in <= max) {	\
167
			if (out)	\
168
				*out = ((in - adj) << b) | v;	\
169
			return t;	\
170
		}			\
171
		adj = max + 1;		\
172
	} while (0)
173

174
	VLI_L_1_1();
175

176
	return -EOVERFLOW;
177
#undef LEVEL
178
}
179

180
#undef VLI_L_1_1
181

182
/* code from here down is independend of actually used bit code */
183

184
/*
185
 * Code length is determined by some unique (e.g. unary) prefix.
186
 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
187
 * not a byte stream.
188
 */
189

190
/* for the bitstream, we need a cursor */
191
struct bitstream_cursor {
192
	/* the current byte */
193
	u8 *b;
194
	/* the current bit within *b, nomalized: 0..7 */
195
	unsigned int bit;
196
};
197

198
/* initialize cursor to point to first bit of stream */
199
static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
200
{
201
	cur->b = s;
202
	cur->bit = 0;
203
}
204

205
/* advance cursor by that many bits; maximum expected input value: 64,
206
 * but depending on VLI implementation, it may be more. */
207
static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
208
{
209
	bits += cur->bit;
210
	cur->b = cur->b + (bits >> 3);
211
	cur->bit = bits & 7;
212
}
213

214
/* the bitstream itself knows its length */
215
struct bitstream {
216
	struct bitstream_cursor cur;
217
	unsigned char *buf;
218
	size_t buf_len;		/* in bytes */
219

220
	/* for input stream:
221
	 * number of trailing 0 bits for padding
222
	 * total number of valid bits in stream: buf_len * 8 - pad_bits */
223
	unsigned int pad_bits;
224
};
225

226
static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
227
{
228
	bs->buf = s;
229
	bs->buf_len = len;
230
	bs->pad_bits = pad_bits;
231
	bitstream_cursor_reset(&bs->cur, bs->buf);
232
}
233

234
static inline void bitstream_rewind(struct bitstream *bs)
235
{
236
	bitstream_cursor_reset(&bs->cur, bs->buf);
237
	memset(bs->buf, 0, bs->buf_len);
238
}
239

240
/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
241
 * Ignores "pad_bits".
242
 * Returns zero if bits == 0 (nothing to do).
243
 * Returns number of bits used if successful.
244
 *
245
 * If there is not enough room left in bitstream,
246
 * leaves bitstream unchanged and returns -ENOBUFS.
247
 */
248
static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
249
{
250
	unsigned char *b = bs->cur.b;
251
	unsigned int tmp;
252

253
	if (bits == 0)
254
		return 0;
255

256
	if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
257
		return -ENOBUFS;
258

259
	/* paranoia: strip off hi bits; they should not be set anyways. */
260
	if (bits < 64)
261
		val &= ~0ULL >> (64 - bits);
262

263
	*b++ |= (val & 0xff) << bs->cur.bit;
264

265
	for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
266
		*b++ |= (val >> tmp) & 0xff;
267

268
	bitstream_cursor_advance(&bs->cur, bits);
269
	return bits;
270
}
271

272
/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
273
 *
274
 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
275
 *
276
 * If there are less than the requested number of valid bits left in the
277
 * bitstream, still fetches all available bits.
278
 *
279
 * Returns number of actually fetched bits.
280
 */
281
static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
282
{
283
	u64 val;
284
	unsigned int n;
285

286
	if (bits > 64)
287
		return -EINVAL;
288

289
	if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
290
		bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
291
			- bs->cur.bit - bs->pad_bits;
292

293
	if (bits == 0) {
294
		*out = 0;
295
		return 0;
296
	}
297

298
	/* get the high bits */
299
	val = 0;
300
	n = (bs->cur.bit + bits + 7) >> 3;
301
	/* n may be at most 9, if cur.bit + bits > 64 */
302
	/* which means this copies at most 8 byte */
303
	if (n) {
304
		memcpy(&val, bs->cur.b+1, n - 1);
305
		val = le64_to_cpu(val) << (8 - bs->cur.bit);
306
	}
307

308
	/* we still need the low bits */
309
	val |= bs->cur.b[0] >> bs->cur.bit;
310

311
	/* and mask out bits we don't want */
312
	val &= ~0ULL >> (64 - bits);
313

314
	bitstream_cursor_advance(&bs->cur, bits);
315
	*out = val;
316

317
	return bits;
318
}
319

320
/* encodes @in as vli into @bs;
321

322
 * return values
323
 *  > 0: number of bits successfully stored in bitstream
324
 * -ENOBUFS @bs is full
325
 * -EINVAL input zero (invalid)
326
 * -EOVERFLOW input too large for this vli code (invalid)
327
 */
328
static inline int vli_encode_bits(struct bitstream *bs, u64 in)
329
{
330
	u64 code;
331
	int bits = __vli_encode_bits(&code, in);
332

333
	if (bits <= 0)
334
		return bits;
335

336
	return bitstream_put_bits(bs, code, bits);
337
}
338

339
#endif
340

341
Product

Resources

Company