CoCalc -- arm64.c

GitHub Repository: Kitware/CMake
Path: blob/master/Utilities/cmliblzma/liblzma/simple/arm64.c
⁵⁰⁷⁶ views
// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       arm64.c
/// \brief      Filter for ARM64 binaries
///
/// This converts ARM64 relative addresses in the BL and ADRP immediates
/// to absolute values to increase redundancy of ARM64 code.
///
/// Converting B or ADR instructions was also tested but it's not useful.
/// A majority of the jumps for the B instruction are very small (+/- 0xFF).
/// These are typical for loops and if-statements. Encoding them to their
/// absolute address reduces redundancy since many of the small relative
/// jump values are repeated, but very few of the absolute addresses are.
//
//  Authors:    Lasse Collin
//              Jia Tan
//              Igor Pavlov
//
///////////////////////////////////////////////////////////////////////////////
22

23
#include "simple_private.h"
24

25

26
static size_t
27
arm64_code(void *simple lzma_attribute((__unused__)),
28
		uint32_t now_pos, bool is_encoder,
29
		uint8_t *buffer, size_t size)
30
{
31
	size_t i;
32

33
	// Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower
34
	// with auto-vectorization that is enabled by default with -O2.
35
	// Such vectorization bloat happens with -O2 when targeting ARM64 too
36
	// but performance hasn't been tested.
37
#ifdef __clang__
38
#	pragma clang loop vectorize(disable)
39
#endif
40
	for (i = 0; i + 4 <= size; i += 4) {
41
		uint32_t pc = (uint32_t)(now_pos + i);
42
		uint32_t instr = read32le(buffer + i);
43

44
		if ((instr >> 26) == 0x25) {
45
			// BL instruction:
46
			// The full 26-bit immediate is converted.
47
			// The range is +/-128 MiB.
48
			//
49
			// Using the full range helps quite a lot with
50
			// big executables. Smaller range would reduce false
51
			// positives in non-code sections of the input though
52
			// so this is a compromise that slightly favors big
53
			// files. With the full range, only six bits of the 32
54
			// need to match to trigger a conversion.
55
			const uint32_t src = instr;
56
			instr = 0x94000000;
57

58
			pc >>= 2;
59
			if (!is_encoder)
60
				pc = 0U - pc;
61

62
			instr |= (src + pc) & 0x03FFFFFF;
63
			write32le(buffer + i, instr);
64

65
		} else if ((instr & 0x9F000000) == 0x90000000) {
66
			// ADRP instruction:
67
			// Only values in the range +/-512 MiB are converted.
68
			//
69
			// Using less than the full +/-4 GiB range reduces
70
			// false positives on non-code sections of the input
71
			// while being excellent for executables up to 512 MiB.
72
			// The positive effect of ADRP conversion is smaller
73
			// than that of BL but it also doesn't hurt so much in
74
			// non-code sections of input because, with +/-512 MiB
75
			// range, nine bits of 32 need to match to trigger a
76
			// conversion (two 10-bit match choices = 9 bits).
77
			const uint32_t src = ((instr >> 29) & 3)
78
					| ((instr >> 3) & 0x001FFFFC);
79

80
			// With the addition only one branch is needed to
81
			// check the +/- range. This is usually false when
82
			// processing ARM64 code so branch prediction will
83
			// handle it well in terms of performance.
84
			//
85
			//if ((src & 0x001E0000) != 0
86
			// && (src & 0x001E0000) != 0x001E0000)
87
			if ((src + 0x00020000) & 0x001C0000)
88
				continue;
89

90
			instr &= 0x9000001F;
91

92
			pc >>= 12;
93
			if (!is_encoder)
94
				pc = 0U - pc;
95

96
			const uint32_t dest = src + pc;
97
			instr |= (dest & 3) << 29;
98
			instr |= (dest & 0x0003FFFC) << 3;
99
			instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
100
			write32le(buffer + i, instr);
101
		}
102
	}
103

104
	return i;
105
}
106

107

108
static lzma_ret
109
arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
110
		const lzma_filter_info *filters, bool is_encoder)
111
{
112
	return lzma_simple_coder_init(next, allocator, filters,
113
			&arm64_code, 0, 4, 4, is_encoder);
114
}
115

116

117
#ifdef HAVE_ENCODER_ARM64
118
extern lzma_ret
119
lzma_simple_arm64_encoder_init(lzma_next_coder *next,
120
		const lzma_allocator *allocator,
121
		const lzma_filter_info *filters)
122
{
123
	return arm64_coder_init(next, allocator, filters, true);
124
}
125
#endif
126

127

128
#ifdef HAVE_DECODER_ARM64
129
extern lzma_ret
130
lzma_simple_arm64_decoder_init(lzma_next_coder *next,
131
		const lzma_allocator *allocator,
132
		const lzma_filter_info *filters)
133
{
134
	return arm64_coder_init(next, allocator, filters, false);
135
}
136
#endif
137

138
Product

Resources

Company