Path: blob/master/Utilities/cmliblzma/common/tuklib_integer.h
// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////
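
// A usage sketch (illustrative only; the function name, buffer layout, and
// offsets below are hypothetical and not part of this header): read a
// little endian field from a byte buffer and store it elsewhere in
// big endian order.
/*
#include <stdint.h>

static void
example_usage(uint8_t *buf)
{
	// Read a 32-bit little endian value from offset 4.
	uint32_t v = read32le(buf + 4);

	// Store it at offset 8 in big endian order. Since write32be()
	// can be a macro, the arguments must not have side effects.
	write32be(buf + 8, v);
}
*/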

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define byteswap16(num) __builtin_bswap16(num)
#	define byteswap32(num) __builtin_bswap32(num)
#	define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define byteswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define byteswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define byteswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>
#	ifdef __OpenBSD__
#		define byteswap16(num) swap16(num)
#		define byteswap32(num) swap32(num)
#		define byteswap64(num) swap64(num)
#	else
#		define byteswap16(num) bswap16(num)
#		define byteswap32(num) bswap32(num)
#		define byteswap64(num) bswap64(num)
#	endif

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define byteswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define byteswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define byteswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef byteswap16
#	define byteswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef byteswap32
#	define byteswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef byteswap64
#	define byteswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) byteswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) byteswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) byteswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) byteswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) byteswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) byteswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
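
// A minimal sanity sketch of the macros above (illustrative values; this
// snippet is not part of the original header). Byte swapping reverses the
// byte order, and the conv macros byteswap only when the requested byte
// order differs from the native one:
/*
	uint32_t a = byteswap32(UINT32_C(0x11223344)); // 0x44332211
#ifndef WORDS_BIGENDIAN
	uint32_t b = conv32be(UINT32_C(0x11223344));   // 0x44332211 (swapped)
	uint32_t c = conv32le(UINT32_C(0x11223344));   // 0x11223344 (no-op cast)
#endif
*/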


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example, Clang 9
// on x86-64 can detect the 32-bit load but not the 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) The value may be constructed on the stack using four 8-bit loads,
//         four 8-bit stores to the stack, and finally one 32-bit load from
//         the stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only when GCC assumes
// that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8)
		| ((uint32_t)b[2] << 16)
		| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
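
// Because the write macros expand to the conv macros, which in turn may
// evaluate their argument several times (see the fallback byteswap macros),
// arguments with side effects are unsafe. A hypothetical example of the
// pitfall:
/*
	write32be(buf, *p++);       // Bad: *p++ may be evaluated more than once.
	write32be(buf, *p);         // OK: increment p separately.
	++p;
*/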

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned, which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned, but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}
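
// Alignment is entirely the caller's responsibility here. A hypothetical
// sketch of correct usage: memory from malloc() is suitably aligned for any
// basic type, so offsets that are multiples of the access size stay aligned.
/*
	uint8_t *buf = malloc(16);
	if (buf != NULL) {
		aligned_write32ne(buf + 4, 123); // Offset 4 keeps 32-bit alignment.
		uint32_t v = aligned_read32ne(buf + 4);
	}
*/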


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))


////////////////////
// Bit operations //
////////////////////
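
// A few concrete values for the functions below (illustrative, not part of
// the original header). For any non-zero n, bsr32(n) == 31 - clz32(n), and
// bsf32(n) is the same as ctz32(n):
/*
	bsr32(1) == 0            // The highest set bit of 1 is bit 0.
	bsr32(0x80000000) == 31  // The highest set bit is bit 31.
	clz32(1) == 31
	ctz32(8) == 3            // 8 is binary 1000: three trailing zeros.
*/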

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32

#endif