CoCalc -- bpf_jit

GitHub Repository: torvalds/linux
Path: blob/master/arch/arc/net/bpf_jit_arcv2.c
²⁶⁴²⁴ views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
 * The ARCv2 backend of Just-In-Time compiler for eBPF bytecode.
4
 *
5
 * Copyright (c) 2024 Synopsys Inc.
6
 * Author: Shahab Vahedi <[email protected]>
7
 */
8
#include <linux/bug.h>
9
#include "bpf_jit.h"
10

11
/* ARC core registers, numbered by their position in the register file. */
enum {
	ARC_R_0,  ARC_R_1,  ARC_R_2,  ARC_R_3,  ARC_R_4,  ARC_R_5,
	ARC_R_6,  ARC_R_7,  ARC_R_8,  ARC_R_9,  ARC_R_10, ARC_R_11,
	ARC_R_12, ARC_R_13, ARC_R_14, ARC_R_15, ARC_R_16, ARC_R_17,
	ARC_R_18, ARC_R_19, ARC_R_20, ARC_R_21, ARC_R_22, ARC_R_23,
	ARC_R_24, ARC_R_25, ARC_R_26, ARC_R_FP, ARC_R_SP, ARC_R_ILINK,
	ARC_R_30, ARC_R_BLINK,
	/*
	 * Having ARC_R_IMM encoded as source register means there is an
	 * immediate that must be interpreted from the next 4 bytes. If
	 * encoded as the destination register though, it implies that the
	 * output of the operation is not assigned to any register. The
	 * latter is helpful if we only care about updating the CPU status
	 * flags.
	 */
	ARC_R_IMM = 62
};
29

30
/*
31
 * Remarks about the rationale behind the chosen mapping:
32
 *
33
 * - BPF_REG_{1,2,3,4} are the argument registers and must be mapped to
34
 *   argument registers in ARCv2 ABI: r0-r7. The r7 register is the last
35
 *   argument register in the ABI. Therefore BPF_REG_5, as the fifth
36
 *   argument, must be pushed onto the stack. This is a must for calling
37
 *   in-kernel functions.
38
 *
39
 * - In ARCv2 ABI, the return value is in r0 for 32-bit results and (r1,r0)
40
 *   for 64-bit results. However, because they're already used for BPF_REG_1,
41
 *   the next available scratch registers, r8 and r9, are the best candidates
42
 *   for BPF_REG_0. After a "call" to a(n) (in-kernel) function, the result
43
 *   is "mov"ed to these registers. At a BPF_EXIT, their value is "mov"ed to
44
 *   (r1,r0).
45
 *   It is worth mentioning that scratch registers are the best choice for
46
 *   BPF_REG_0, because it is very popular in BPF instruction encoding.
47
 *
48
 * - JIT_REG_TMP is an artifact needed to translate some BPF instructions.
49
 *   Its life span is one single BPF instruction. Since during the
50
 *   analyze_reg_usage(), it is not known if temporary registers are used,
51
 *   it is mapped to ARC's scratch registers: r10 and r11. Therefore, they
52
 *   don't matter in analysing phase and don't need saving. This temporary
53
 *   register is added as yet another index in the bpf2arc array, so it will
54
 *   unfold like the rest of registers during the code generation process.
55
 *
56
 * - Mapping of callee-saved BPF registers, BPF_REG_{6,7,8,9}, starts from
57
 *   (r15,r14) register pair. The (r13,r12) is not a good choice, because
58
 *   in ARCv2 ABI, r12 is not a callee-saved register and this can cause
59
 *   problem when calling an in-kernel function. Theoretically, the mapping
60
 *   could start from (r14,r13), but it is not a conventional ARCv2 register
61
 *   pair. To have a future proof design, I opted for this arrangement.
62
 *   If/when we decide to add ARCv2 instructions that do use register pairs,
63
 *   the mapping, hopefully, doesn't need to be revisited.
64
 */
65
static const u8 bpf2arc[][2] = {
66
	/* Return value from in-kernel function, and exit value from eBPF */
67
	[BPF_REG_0] = {ARC_R_8, ARC_R_9},
68
	/* Arguments from eBPF program to in-kernel function */
69
	[BPF_REG_1] = {ARC_R_0, ARC_R_1},
70
	[BPF_REG_2] = {ARC_R_2, ARC_R_3},
71
	[BPF_REG_3] = {ARC_R_4, ARC_R_5},
72
	[BPF_REG_4] = {ARC_R_6, ARC_R_7},
73
	/* Remaining arguments, to be passed on the stack per 32-bit ABI */
74
	[BPF_REG_5] = {ARC_R_22, ARC_R_23},
75
	/* Callee-saved registers that in-kernel function will preserve */
76
	[BPF_REG_6] = {ARC_R_14, ARC_R_15},
77
	[BPF_REG_7] = {ARC_R_16, ARC_R_17},
78
	[BPF_REG_8] = {ARC_R_18, ARC_R_19},
79
	[BPF_REG_9] = {ARC_R_20, ARC_R_21},
80
	/* Read-only frame pointer to access the eBPF stack. 32-bit only. */
81
	[BPF_REG_FP] = {ARC_R_FP, },
82
	/* Register for blinding constants */
83
	[BPF_REG_AX] = {ARC_R_24, ARC_R_25},
84
	/* Temporary registers for internal use */
85
	[JIT_REG_TMP] = {ARC_R_10, ARC_R_11}
86
};
87

88
/* Bounds (inclusive) of the ARC register range named here as callee-saved. */
#define ARC_CALLEE_SAVED_REG_FIRST ARC_R_13
#define ARC_CALLEE_SAVED_REG_LAST  ARC_R_25
90

91
/* Low and high ARC registers that back BPF register "r" in bpf2arc[]. */
#define REG_LO(r) (bpf2arc[(r)][0])
#define REG_HI(r) (bpf2arc[(r)][1])
93

94
/*
 * To comply with ARCv2 ABI, BPF's arg5 must be put on stack. After which,
 * the stack needs to be restored by ARG5_SIZE (bytes).
 */
#define ARG5_SIZE 8
99

100
/* Instruction lengths in bytes. */
enum {
	INSN_len_normal = 4,	/* Normal instructions length. */
	INSN_len_imm = 4	/* Length of an extra 32-bit immediate. */
};
105

106
/*
 * ZZ defines the size of operation in the encodings where it is used.
 * Note that the field values are not ordered by size: 4-byte is 0.
 */
enum {
	ZZ_1_byte = 1,
	ZZ_2_byte = 2,
	ZZ_4_byte = 0,
	ZZ_8_byte = 3
};
113

114
/*
 * AA is mostly about address write back mode. It determines if the
 * address in question should be updated before usage or after:
 *   addr += offset; data = *addr;
 *   data = *addr; addr += offset;
 *
 * In "scaling" mode, the effective address will become the sum
 * of "address" + "index"*"size". The "size" is specified by the
 * "ZZ" field. There is no write back when AA is set for scaling:
 *   data = *(addr + offset<<zz)
 */
enum {
	AA_none  = 0,
	AA_pre   = 1,	/* in assembly known as "a/aw". */
	AA_post  = 2,	/* in assembly known as "ab". */
	AA_scale = 3	/* in assembly known as "as". */
};
131

132
/* X flag determines the mode of extension: zero- or sign-extension. */
enum {
	X_zero = 0,
	X_sign = 1
};
137

138
/* Condition codes, as placed in the "qq" field of conditional encodings. */
enum {
	CC_always     = 0,	/* condition is true all the time */
	CC_equal      = 1,	/* if status32.z flag is set */
	CC_unequal    = 2,	/* if status32.z flag is clear */
	CC_positive   = 3,	/* if status32.n flag is clear */
	CC_negative   = 4,	/* if status32.n flag is set */
	CC_less_u     = 5,	/* less than (unsigned) */
	CC_less_eq_u  = 14,	/* less than or equal (unsigned) */
	CC_great_eq_u = 6,	/* greater than or equal (unsigned) */
	CC_great_u    = 13,	/* greater than (unsigned) */
	CC_less_s     = 11,	/* less than (signed) */
	CC_less_eq_s  = 12,	/* less than or equal (signed) */
	CC_great_eq_s = 10,	/* greater than or equal (signed) */
	CC_great_s    = 9	/* greater than (signed) */
};
154

155
/*
 * Range checks for the immediate widths used by the encodings below.
 * Each macro evaluates its argument twice, so pass side-effect-free values.
 */
#define IN_U6_RANGE(x)	((x) <= (0x40      - 1) && (x) >= 0)
#define IN_S9_RANGE(x)	((x) <= (0x100     - 1) && (x) >= -0x100)
#define IN_S12_RANGE(x)	((x) <= (0x800     - 1) && (x) >= -0x800)
#define IN_S21_RANGE(x)	((x) <= (0x100000  - 1) && (x) >= -0x100000)
#define IN_S25_RANGE(x)	((x) <= (0x1000000 - 1) && (x) >= -0x1000000)
160

161
/*
 * Operands in most of the encodings.
 *
 * OP_A: 6-bit field in bits 5..0.
 * OP_B: 6-bit value split in two: low 3 bits at 26..24, high 3 at 14..12.
 * OP_C: 6-bit field in bits 11..6.
 */
#define OP_A(x)	((x) & 0x03f)
#define OP_B(x)	((((x) & 0x07) << 24) | (((x) & 0x38) <<  9))
#define OP_C(x)	(((x) & 0x03f) << 6)
/* The "c" operand announcing that a 32-bit immediate follows. */
#define OP_IMM	(OP_C(ARC_R_IMM))
/* 5-bit condition code, placed where the "a" operand would be. */
#define COND(x)	(OP_A((x) & 31))
/* The "f" bit (bit 15): request an update of the status flags. */
#define FLAG(x)	(((x) & 1) << 15)
168

169
/*
 * The 4-byte encoding of "mov b,c":
 *
 * 0010_0bbb 0000_1010 0BBB_cccc cc00_0000
 *
 * b:  BBBbbb		destination register
 * c:  cccccc		source register
 */
#define OPC_MOV		0x200a0000
178

179
/*
 * The 4-byte encoding of "mov b,s12" (used for moving small immediates):
 *
 * 0010_0bbb 1000_1010 0BBB_ssss ssSS_SSSS
 *
 * b:  BBBbbb		destination register
 * s:  SSSSSSssssss	source immediate (signed)
 */
#define OPC_MOVI	0x208a0000
/* Upper 6 bits of the s12 go to bits 5..0, lower 6 bits to bits 11..6. */
#define MOVI_S12(x)	((((x) & 0xfc0) >> 6) | (((x) & 0x3f) << 6))
189

190
/*
 * The 4-byte encoding of "mov[.qq] b,u6", used for conditional
 * moving of even smaller immediates:
 *
 * 0010_0bbb 1100_1010 0BBB_cccc cciq_qqqq
 *
 * qq: qqqqq		condition code
 * i:			If set, c is considered a 6-bit immediate, else a reg.
 *
 * b:  BBBbbb		destination register
 * c:  cccccc		source
 */
#define OPC_MOV_CC	0x20ca0000
#define MOV_CC_I	BIT(5)
#define OPC_MOVU_CC	(OPC_MOV_CC | MOV_CC_I)
205

206
/*
 * The 4-byte encoding of "sexb b,c" (8-bit sign extension):
 *
 * 0010_0bbb 0010_1111 0BBB_cccc cc00_0101
 *
 * b:  BBBbbb		destination register
 * c:  cccccc		source register
 */
#define OPC_SEXB	0x202f0005
215

216
/*
 * The 4-byte encoding of "sexh b,c" (16-bit sign extension):
 *
 * 0010_0bbb 0010_1111 0BBB_cccc cc00_0110
 *
 * b:  BBBbbb		destination register
 * c:  cccccc		source register
 */
#define OPC_SEXH	0x202f0006
225

226
/*
 * The 4-byte encoding of "ld[zz][.x][.aa] c,[b,s9]":
 *
 * 0001_0bbb ssss_ssss SBBB_0aaz zxcc_cccc
 *
 * zz:			size mode
 * aa:			address write back mode
 * x:			extension mode
 *
 * s9: S_ssss_ssss	9-bit signed number
 * b:  BBBbbb		source reg for address
 * c:  cccccc		destination register
 */
#define OPC_LOAD	0x10000000
#define LOAD_X(x)	((x) << 6)
#define LOAD_ZZ(x)	((x) << 7)
#define LOAD_AA(x)	((x) << 9)
/* Low 8 bits of the offset go to bits 23..16, its sign bit to bit 15. */
#define LOAD_S9(x)	((((x) & 0x0ff) << 16) | (((x) & 0x100) <<  7))
/* Unlike most encodings, the "c" operand sits in bits 5..0 here. */
#define LOAD_C(x)	((x) & 0x03f)
/* Unsigned and signed loads. */
#define OPC_LDU		(OPC_LOAD | LOAD_X(X_zero))
#define OPC_LDS		(OPC_LOAD | LOAD_X(X_sign))
/* 32-bit load. */
#define OPC_LD32	(OPC_LDU | LOAD_ZZ(ZZ_4_byte))
/* "pop reg" is merely a "ld.ab reg,[sp,4]". */
#define OPC_POP		\
	(OPC_LD32 | LOAD_AA(AA_post) | LOAD_S9(4) | OP_B(ARC_R_SP))
253

254
/*
 * The 4-byte encoding of "st[zz][.aa] c,[b,s9]":
 *
 * 0001_1bbb ssss_ssss SBBB_cccc cc0a_azz0
 *
 * zz: zz		size mode
 * aa: aa		address write back mode
 *
 * s9: S_ssss_ssss	9-bit signed number
 * b:  BBBbbb		source reg for address
 * c:  cccccc		source reg to be stored
 */
#define OPC_STORE	0x18000000
#define STORE_ZZ(x)	((x) << 1)
#define STORE_AA(x)	((x) << 3)
/* Low 8 bits of the offset go to bits 23..16, its sign bit to bit 15. */
#define STORE_S9(x)	((((x) & 0x0ff) << 16) | (((x) & 0x100) <<  7))
/* 32-bit store. */
#define OPC_ST32	(OPC_STORE | STORE_ZZ(ZZ_4_byte))
/* "push reg" is merely a "st.aw reg,[sp,-4]". */
#define OPC_PUSH	\
	(OPC_ST32 | STORE_AA(AA_pre) | STORE_S9(-4) | OP_B(ARC_R_SP))
275

276
/*
 * The 4-byte encoding of "add a,b,c":
 *
 * 0010_0bbb 0i00_0000 fBBB_cccc ccaa_aaaa
 *
 * f:			indicates if flags (carry, etc.) should be updated
 * i:			If set, c is considered a 6-bit immediate, else a reg.
 *
 * a:  aaaaaa		result
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand
 */
#define OPC_ADD		0x20000000
/* Addition with updating the pertinent flags in "status32" register. */
#define OPC_ADDF	(OPC_ADD | FLAG(1))
#define ADDI		BIT(22)
#define ADDI_U6(x)	OP_C(x)
#define OPC_ADDI	(OPC_ADD | ADDI)
#define OPC_ADDIF	(OPC_ADDI | FLAG(1))
/* "add" whose 2nd input operand is a trailing 32-bit immediate. */
#define OPC_ADD_I	(OPC_ADD | OP_IMM)
296

297
/*
 * The 4-byte encoding of "adc a,b,c" (addition with carry):
 *
 * 0010_0bbb 0i00_0001 0BBB_cccc ccaa_aaaa
 *
 * i:			if set, c is considered a 6-bit immediate, else a reg.
 *
 * a:  aaaaaa		result
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand
 */
#define OPC_ADC		0x20010000
#define ADCI		BIT(22)
#define ADCI_U6(x)	OP_C(x)
#define OPC_ADCI	(OPC_ADC | ADCI)
312

313
/*
 * The 4-byte encoding of "sub a,b,c":
 *
 * 0010_0bbb 0i00_0010 fBBB_cccc ccaa_aaaa
 *
 * f:			indicates if flags (carry, etc.) should be updated
 * i:			if set, c is considered a 6-bit immediate, else a reg.
 *
 * a:  aaaaaa		result
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand
 */
#define OPC_SUB		0x20020000
/* Subtraction with updating the pertinent flags in "status32" register. */
#define OPC_SUBF	(OPC_SUB | FLAG(1))
#define SUBI		BIT(22)
#define SUBI_U6(x)	OP_C(x)
#define OPC_SUBI	(OPC_SUB | SUBI)
/* "sub" whose 2nd input operand is a trailing 32-bit immediate. */
#define OPC_SUB_I	(OPC_SUB | OP_IMM)
332

333
/*
 * The 4-byte encoding of "sbc a,b,c" (subtraction with carry):
 *
 * 0010_0bbb 0000_0011 fBBB_cccc ccaa_aaaa
 *
 * f:			indicates if flags (carry, etc.) should be updated
 *
 * a:  aaaaaa		result
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand
 */
#define OPC_SBC		0x20030000
345

346
/*
 * The 4-byte encoding of "cmp[.qq] b,c":
 *
 * 0010_0bbb 1100_1100 1BBB_cccc cc0q_qqqq
 *
 * qq:	qqqqq		condition code
 *
 * b:  BBBbbb		the 1st operand
 * c:  cccccc		the 2nd operand
 */
#define OPC_CMP		0x20cc8000
357

358
/*
 * The 4-byte encoding of "neg a,b":
 *
 * 0010_0bbb 0100_1110 0BBB_0000 00aa_aaaa
 *
 * a:  aaaaaa		result
 * b:  BBBbbb		input
 */
#define OPC_NEG		0x204e0000
367

368
/*
 * The 4-byte encoding of "mpy a,b,c".
 * mpy is the signed 32-bit multiplication with the lower 32-bit
 * of the product as the result.
 *
 * 0010_0bbb 0001_1010 0BBB_cccc ccaa_aaaa
 *
 * a:  aaaaaa		result
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand
 */
#define OPC_MPY		0x201a0000
/* "mpy" whose 2nd input operand is a trailing 32-bit immediate. */
#define OPC_MPYI	(OPC_MPY | OP_IMM)
381

382
/*
 * The 4-byte encoding of "mpydu a,b,c".
 * mpydu is the unsigned 32-bit multiplication with the lower 32-bit of
 * the product in register "a" and the higher 32-bit in register "a+1".
 *
 * 0010_1bbb 0001_1001 0BBB_cccc ccaa_aaaa
 *
 * a:  aaaaaa		64-bit result in registers (R_a+1,R_a)
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand
 */
#define OPC_MPYDU	0x28190000
/* "mpydu" whose 2nd input operand is a trailing 32-bit immediate. */
#define OPC_MPYDUI	(OPC_MPYDU | OP_IMM)
395

396
/*
 * The 4-byte encoding of "divu a,b,c" (unsigned division):
 *
 * 0010_1bbb 0000_0101 0BBB_cccc ccaa_aaaa
 *
 * a:  aaaaaa		result (quotient)
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand (divisor)
 */
#define OPC_DIVU	0x28050000
/* "divu" whose divisor is a trailing 32-bit immediate. */
#define OPC_DIVUI	(OPC_DIVU | OP_IMM)
407

408
/*
 * The 4-byte encoding of "div a,b,c" (signed division):
 *
 * 0010_1bbb 0000_0100 0BBB_cccc ccaa_aaaa
 *
 * a:  aaaaaa		result (quotient)
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand (divisor)
 */
#define OPC_DIVS	0x28040000
/* "div" whose divisor is a trailing 32-bit immediate. */
#define OPC_DIVSI	(OPC_DIVS | OP_IMM)
419

420
/*
 * The 4-byte encoding of "remu a,b,c" (unsigned remainder):
 *
 * 0010_1bbb 0000_1001 0BBB_cccc ccaa_aaaa
 *
 * a:  aaaaaa		result (remainder)
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand (divisor)
 */
#define OPC_REMU	0x28090000
/* "remu" whose divisor is a trailing 32-bit immediate. */
#define OPC_REMUI	(OPC_REMU | OP_IMM)
431

432
/*
 * The 4-byte encoding of "rem a,b,c" (signed remainder):
 *
 * 0010_1bbb 0000_1000 0BBB_cccc ccaa_aaaa
 *
 * a:  aaaaaa		result (remainder)
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand (divisor)
 */
#define OPC_REMS	0x28080000
/* "rem" whose divisor is a trailing 32-bit immediate. */
#define OPC_REMSI	(OPC_REMS | OP_IMM)
443

444
/*
 * The 4-byte encoding of "and a,b,c":
 *
 * 0010_0bbb 0000_0100 fBBB_cccc ccaa_aaaa
 *
 * f:			indicates if zero and negative flags should be updated
 *
 * a:  aaaaaa		result
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand
 */
#define OPC_AND		0x20040000
/* "and" whose 2nd input operand is a trailing 32-bit immediate. */
#define OPC_ANDI	(OPC_AND | OP_IMM)
457

458
/*
 * The 4-byte encoding of "tst[.qq] b,c".
 * Checks if the two input operands have any bit set at the same
 * position.
 *
 * 0010_0bbb 1100_1011 1BBB_cccc cc0q_qqqq
 *
 * qq:	qqqqq		condition code
 *
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand
 */
#define OPC_TST		0x20cb8000
471

472
/*
 * The 4-byte encoding of "or a,b,c":
 *
 * 0010_0bbb 0000_0101 0BBB_cccc ccaa_aaaa
 *
 * a:  aaaaaa		result
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand
 */
#define OPC_OR		0x20050000
/* "or" whose 2nd input operand is a trailing 32-bit immediate. */
#define OPC_ORI		(OPC_OR | OP_IMM)
483

484
/*
 * The 4-byte encoding of "xor a,b,c":
 *
 * 0010_0bbb 0000_0111 0BBB_cccc ccaa_aaaa
 *
 * a:  aaaaaa		result
 * b:  BBBbbb		the 1st input operand
 * c:  cccccc		the 2nd input operand
 */
#define OPC_XOR		0x20070000
/* "xor" whose 2nd input operand is a trailing 32-bit immediate. */
#define OPC_XORI	(OPC_XOR | OP_IMM)
495

496
/*
 * The 4-byte encoding of "not b,c":
 *
 * 0010_0bbb 0010_1111 0BBB_cccc cc00_1010
 *
 * b:  BBBbbb		result
 * c:  cccccc		input
 */
#define OPC_NOT		0x202f000a
505

506
/*
 * The 4-byte encoding of "btst b,u6":
 *
 * 0010_0bbb 0101_0001 1BBB_uuuu uu00_0000
 *
 * b:  BBBbbb		input number to check
 * u6: uuuuuu		6-bit unsigned number specifying bit position to check
 */
#define OPC_BTSTU6	0x20518000
#define BTST_U6(x)	(OP_C((x) & 63))
516

517
/*
 * The 4-byte encoding of "asl[.qq] b,b,c" (arithmetic shift left):
 *
 * 0010_1bbb 0i00_0000 0BBB_cccc ccaa_aaaa
 *
 * i:			if set, c is considered a 5-bit immediate, else a reg.
 *
 * b:  BBBbbb		result and the first operand (number to be shifted)
 * c:  cccccc		amount to be shifted
 */
#define OPC_ASL		0x28000000
#define ASL_I		BIT(22)
/* Only the low 5 bits of the shift amount are kept. */
#define ASLI_U6(x)	OP_C((x) & 31)
#define OPC_ASLI	(OPC_ASL | ASL_I)
531

532
/*
 * The 4-byte encoding of "asr a,b,c" (arithmetic shift right):
 *
 * 0010_1bbb 0i00_0010 0BBB_cccc ccaa_aaaa
 *
 * i:			if set, c is considered a 6-bit immediate, else a reg.
 *
 * a:  aaaaaa		result
 * b:  BBBbbb		first input:  number to be shifted
 * c:  cccccc		second input: amount to be shifted
 */
#define OPC_ASR		0x28020000
#define ASR_I		ASL_I
/* Shares ASLI_U6(), which keeps only the low 5 bits of the amount. */
#define ASRI_U6(x)	ASLI_U6(x)
#define OPC_ASRI	(OPC_ASR | ASR_I)
547

548
/*
 * The 4-byte encoding of "lsr a,b,c" (logical shift right):
 *
 * 0010_1bbb 0i00_0001 0BBB_cccc ccaa_aaaa
 *
 * i:			if set, c is considered a 6-bit immediate, else a reg.
 *
 * a:  aaaaaa		result
 * b:  BBBbbb		first input:  number to be shifted
 * c:  cccccc		second input: amount to be shifted
 */
#define OPC_LSR		0x28010000
#define LSR_I		ASL_I
/* Shares ASLI_U6(), which keeps only the low 5 bits of the amount. */
#define LSRI_U6(x)	ASLI_U6(x)
#define OPC_LSRI	(OPC_LSR | LSR_I)
563

564
/*
 * The 4-byte encoding of "swape b,c":
 *
 * 0010_1bbb 0010_1111 0BBB_cccc cc00_1001
 *
 * b:  BBBbbb		destination register
 * c:  cccccc		source register
 */
#define OPC_SWAPE	0x282f0009
573

574
/*
 * Encoding for jump to an address in register:
 * j reg_c
 *
 * 0010_0000 1110_0000 0000_cccc cc00_0000
 *
 * c:  cccccc		register holding the destination address
 */
#define OPC_JMP		0x20e00000
/* Jump to "branch-and-link" register, which effectively is a "return". */
#define OPC_J_BLINK	(OPC_JMP | OP_C(ARC_R_BLINK))
585

586
/*
 * Encoding for jump-and-link to an address in register:
 * jl reg_c
 *
 * 0010_0000 0010_0010 0000_cccc cc00_0000
 *
 * c:  cccccc		register holding the destination address
 */
#define OPC_JL		0x20220000
595

596
/*
 * Encoding for (conditional) branch to an offset from the current location
 * that is word aligned: (PC & 0xffff_fffc) + s21
 * B[qq] s21
 *
 * 0000_0sss ssss_sss0 SSSS_SSSS SS0q_qqqq
 *
 * qq:	qqqqq				condition code
 * s21:	SSSS SSSS_SSss ssss_ssss	The displacement (21-bit signed)
 *
 * The displacement is supposed to be 16-bit (2-byte) aligned. Therefore,
 * it should be a multiple of 2. Hence, there is an implied '0' bit at its
 * LSB: S_SSSS SSSS_Ssss ssss_sss0
 */
#define OPC_BCC		0x00000000
/* Bits 10..1 of "d" go to bits 26..17; bits 20..11 go to bits 15..6. */
#define BCC_S21(d)	((((d) & 0x7fe) << 16) | (((d) & 0x1ff800) >> 5))
612

613
/*
 * Encoding for unconditional branch to an offset from the current location
 * that is word aligned: (PC & 0xffff_fffc) + s25
 * B s25
 *
 * 0000_0sss ssss_sss1 SSSS_SSSS SS00_TTTT
 *
 * s25:	TTTT SSSS SSSS_SSss ssss_ssss	The displacement (25-bit signed)
 *
 * The displacement is supposed to be 16-bit (2-byte) aligned. Therefore,
 * it should be a multiple of 2. Hence, there is an implied '0' bit at its
 * LSB: T TTTS_SSSS SSSS_Ssss ssss_sss0
 */
#define OPC_B		0x00010000
/* Bits 24..21 of "d" go to bits 3..0; the rest is laid out by BCC_S21(). */
#define B_S25(d)	((((d) & 0x1e00000) >> 21) | BCC_S21(d))
628

629
/*
 * Store the 16-bit value "bytes" at "buf" with a single u16 write.
 * NOTE(review): presumably "buf" is at least 2-byte aligned -- confirm
 * against the callers.
 */
static inline void emit_2_bytes(u8 *buf, u16 bytes)
{
	u16 *slot = (u16 *)buf;

	*slot = bytes;
}
633

634
/*
 * Store the 32-bit value "bytes" at "buf" as two 16-bit halves: the upper
 * half goes first (at "buf"), the lower half second (at "buf + 2").
 */
static inline void emit_4_bytes(u8 *buf, u32 bytes)
{
	const u16 upper = bytes >> 16;
	const u16 lower = bytes & 0xffff;

	emit_2_bytes(buf, upper);
	emit_2_bytes(buf + 2, lower);
}
639

640
/*
 * Translate a BPF access size (BPF_B, BPF_H, BPF_W, BPF_DW) into the
 * matching "ZZ" field value of ARC load/store encodings.
 */
static inline u8 bpf_to_arc_size(u8 size)
{
	if (size == BPF_B)
		return ZZ_1_byte;
	if (size == BPF_H)
		return ZZ_2_byte;
	if (size == BPF_DW)
		return ZZ_8_byte;

	/* BPF_W and any unrecognised size both map to a 4-byte access. */
	return ZZ_4_byte;
}
655

656
/************** Encoders (Deal with ARC regs) ************/
657

658
/* Move an immediate to register with a 4-byte instruction. */
659
static u8 arc_movi_r(u8 *buf, u8 reg, s16 imm)
660
{
661
	const u32 insn = OPC_MOVI | OP_B(reg) | MOVI_S12(imm);
662

663
	if (buf)
664
		emit_4_bytes(buf, insn);
665
	return INSN_len_normal;
666
}
667

668
/* rd <- rs */
669
static u8 arc_mov_r(u8 *buf, u8 rd, u8 rs)
670
{
671
	const u32 insn = OPC_MOV | OP_B(rd) | OP_C(rs);
672

673
	if (buf)
674
		emit_4_bytes(buf, insn);
675
	return INSN_len_normal;
676
}
677

678
/* The emitted code may have different sizes based on "imm". */
679
static u8 arc_mov_i(u8 *buf, u8 rd, s32 imm)
680
{
681
	const u32 insn = OPC_MOV | OP_B(rd) | OP_IMM;
682

683
	if (IN_S12_RANGE(imm))
684
		return arc_movi_r(buf, rd, imm);
685

686
	if (buf) {
687
		emit_4_bytes(buf, insn);
688
		emit_4_bytes(buf + INSN_len_normal, imm);
689
	}
690
	return INSN_len_normal + INSN_len_imm;
691
}
692

693
/* The emitted code will always have the same size (8). */
694
static u8 arc_mov_i_fixed(u8 *buf, u8 rd, s32 imm)
695
{
696
	const u32 insn = OPC_MOV | OP_B(rd) | OP_IMM;
697

698
	if (buf) {
699
		emit_4_bytes(buf, insn);
700
		emit_4_bytes(buf + INSN_len_normal, imm);
701
	}
702
	return INSN_len_normal + INSN_len_imm;
703
}
704

705
/* Conditional move. */
706
static u8 arc_mov_cc_r(u8 *buf, u8 cc, u8 rd, u8 rs)
707
{
708
	const u32 insn = OPC_MOV_CC | OP_B(rd) | OP_C(rs) | COND(cc);
709

710
	if (buf)
711
		emit_4_bytes(buf, insn);
712
	return INSN_len_normal;
713
}
714

715
/* Conditional move of a small immediate to rd. */
716
static u8 arc_movu_cc_r(u8 *buf, u8 cc, u8 rd, u8 imm)
717
{
718
	const u32 insn = OPC_MOVU_CC | OP_B(rd) | OP_C(imm) | COND(cc);
719

720
	if (buf)
721
		emit_4_bytes(buf, insn);
722
	return INSN_len_normal;
723
}
724

725
/* Sign extension from a byte. */
726
static u8 arc_sexb_r(u8 *buf, u8 rd, u8 rs)
727
{
728
	const u32 insn = OPC_SEXB | OP_B(rd) | OP_C(rs);
729

730
	if (buf)
731
		emit_4_bytes(buf, insn);
732
	return INSN_len_normal;
733
}
734

735
/* Sign extension from two bytes. */
736
static u8 arc_sexh_r(u8 *buf, u8 rd, u8 rs)
737
{
738
	const u32 insn = OPC_SEXH | OP_B(rd) | OP_C(rs);
739

740
	if (buf)
741
		emit_4_bytes(buf, insn);
742
	return INSN_len_normal;
743
}
744

745
/* st reg, [reg_mem, off] */
746
static u8 arc_st_r(u8 *buf, u8 reg, u8 reg_mem, s16 off, u8 zz)
747
{
748
	const u32 insn = OPC_STORE | STORE_ZZ(zz) | OP_C(reg) |
749
		OP_B(reg_mem) | STORE_S9(off);
750

751
	if (buf)
752
		emit_4_bytes(buf, insn);
753
	return INSN_len_normal;
754
}
755

756
/* st.aw reg, [sp, -4] */
757
static u8 arc_push_r(u8 *buf, u8 reg)
758
{
759
	const u32 insn = OPC_PUSH | OP_C(reg);
760

761
	if (buf)
762
		emit_4_bytes(buf, insn);
763
	return INSN_len_normal;
764
}
765

766
/* ld reg, [reg_mem, off] (unsigned) */
767
static u8 arc_ld_r(u8 *buf, u8 reg, u8 reg_mem, s16 off, u8 zz)
768
{
769
	const u32 insn = OPC_LDU | LOAD_ZZ(zz) | LOAD_C(reg) |
770
		OP_B(reg_mem) | LOAD_S9(off);
771

772
	if (buf)
773
		emit_4_bytes(buf, insn);
774
	return INSN_len_normal;
775
}
776

777
/* ld.x reg, [reg_mem, off] (sign extend) */
778
static u8 arc_ldx_r(u8 *buf, u8 reg, u8 reg_mem, s16 off, u8 zz)
779
{
780
	const u32 insn = OPC_LDS | LOAD_ZZ(zz) | LOAD_C(reg) |
781
		OP_B(reg_mem) | LOAD_S9(off);
782

783
	if (buf)
784
		emit_4_bytes(buf, insn);
785
	return INSN_len_normal;
786
}
787

788
/* ld.ab reg,[sp,4] */
789
static u8 arc_pop_r(u8 *buf, u8 reg)
790
{
791
	const u32 insn = OPC_POP | LOAD_C(reg);
792

793
	if (buf)
794
		emit_4_bytes(buf, insn);
795
	return INSN_len_normal;
796
}
797

798
/* add Ra,Ra,Rc */
799
static u8 arc_add_r(u8 *buf, u8 ra, u8 rc)
800
{
801
	const u32 insn = OPC_ADD | OP_A(ra) | OP_B(ra) | OP_C(rc);
802

803
	if (buf)
804
		emit_4_bytes(buf, insn);
805
	return INSN_len_normal;
806
}
807

808
/* add.f Ra,Ra,Rc */
809
static u8 arc_addf_r(u8 *buf, u8 ra, u8 rc)
810
{
811
	const u32 insn = OPC_ADDF | OP_A(ra) | OP_B(ra) | OP_C(rc);
812

813
	if (buf)
814
		emit_4_bytes(buf, insn);
815
	return INSN_len_normal;
816
}
817

818
/* add.f Ra,Ra,u6 */
819
static u8 arc_addif_r(u8 *buf, u8 ra, u8 u6)
820
{
821
	const u32 insn = OPC_ADDIF | OP_A(ra) | OP_B(ra) | ADDI_U6(u6);
822

823
	if (buf)
824
		emit_4_bytes(buf, insn);
825
	return INSN_len_normal;
826
}
827

828
/* add Ra,Ra,u6 */
829
static u8 arc_addi_r(u8 *buf, u8 ra, u8 u6)
830
{
831
	const u32 insn = OPC_ADDI | OP_A(ra) | OP_B(ra) | ADDI_U6(u6);
832

833
	if (buf)
834
		emit_4_bytes(buf, insn);
835
	return INSN_len_normal;
836
}
837

838
/* add Ra,Rb,imm */
839
static u8 arc_add_i(u8 *buf, u8 ra, u8 rb, s32 imm)
840
{
841
	const u32 insn = OPC_ADD_I | OP_A(ra) | OP_B(rb);
842

843
	if (buf) {
844
		emit_4_bytes(buf, insn);
845
		emit_4_bytes(buf + INSN_len_normal, imm);
846
	}
847
	return INSN_len_normal + INSN_len_imm;
848
}
849

850
/* adc Ra,Ra,Rc */
851
static u8 arc_adc_r(u8 *buf, u8 ra, u8 rc)
852
{
853
	const u32 insn = OPC_ADC | OP_A(ra) | OP_B(ra) | OP_C(rc);
854

855
	if (buf)
856
		emit_4_bytes(buf, insn);
857
	return INSN_len_normal;
858
}
859

860
/* adc Ra,Ra,u6 */
861
static u8 arc_adci_r(u8 *buf, u8 ra, u8 u6)
862
{
863
	const u32 insn = OPC_ADCI | OP_A(ra) | OP_B(ra) | ADCI_U6(u6);
864

865
	if (buf)
866
		emit_4_bytes(buf, insn);
867
	return INSN_len_normal;
868
}
869

870
/* sub Ra,Ra,Rc */
871
static u8 arc_sub_r(u8 *buf, u8 ra, u8 rc)
872
{
873
	const u32 insn = OPC_SUB | OP_A(ra) | OP_B(ra) | OP_C(rc);
874

875
	if (buf)
876
		emit_4_bytes(buf, insn);
877
	return INSN_len_normal;
878
}
879

880
/* sub.f Ra,Ra,Rc */
881
static u8 arc_subf_r(u8 *buf, u8 ra, u8 rc)
882
{
883
	const u32 insn = OPC_SUBF | OP_A(ra) | OP_B(ra) | OP_C(rc);
884

885
	if (buf)
886
		emit_4_bytes(buf, insn);
887
	return INSN_len_normal;
888
}
889

890
/* sub Ra,Ra,u6 */
891
static u8 arc_subi_r(u8 *buf, u8 ra, u8 u6)
892
{
893
	const u32 insn = OPC_SUBI | OP_A(ra) | OP_B(ra) | SUBI_U6(u6);
894

895
	if (buf)
896
		emit_4_bytes(buf, insn);
897
	return INSN_len_normal;
898
}
899

900
/* sub Ra,Ra,imm */
901
static u8 arc_sub_i(u8 *buf, u8 ra, s32 imm)
902
{
903
	const u32 insn = OPC_SUB_I | OP_A(ra) | OP_B(ra);
904

905
	if (buf) {
906
		emit_4_bytes(buf, insn);
907
		emit_4_bytes(buf + INSN_len_normal, imm);
908
	}
909
	return INSN_len_normal + INSN_len_imm;
910
}
911

912
/* sbc Ra,Ra,Rc */
913
static u8 arc_sbc_r(u8 *buf, u8 ra, u8 rc)
914
{
915
	const u32 insn = OPC_SBC | OP_A(ra) | OP_B(ra) | OP_C(rc);
916

917
	if (buf)
918
		emit_4_bytes(buf, insn);
919
	return INSN_len_normal;
920
}
921

922
/* cmp Rb,Rc */
923
static u8 arc_cmp_r(u8 *buf, u8 rb, u8 rc)
924
{
925
	const u32 insn = OPC_CMP | OP_B(rb) | OP_C(rc);
926

927
	if (buf)
928
		emit_4_bytes(buf, insn);
929
	return INSN_len_normal;
930
}
931

932
/*
933
 * cmp.z Rb,Rc
934
 *
935
 * This "cmp.z" variant of compare instruction is used on lower
936
 * 32-bits of register pairs after "cmp"ing their upper parts. If the
937
 * upper parts are equal (z), then this one will proceed to check the
938
 * rest.
939
 */
940
static u8 arc_cmpz_r(u8 *buf, u8 rb, u8 rc)
941
{
942
	const u32 insn = OPC_CMP | OP_B(rb) | OP_C(rc) | CC_equal;
943

944
	if (buf)
945
		emit_4_bytes(buf, insn);
946
	return INSN_len_normal;
947
}
948

949
/* neg Ra,Rb */
950
static u8 arc_neg_r(u8 *buf, u8 ra, u8 rb)
951
{
952
	const u32 insn = OPC_NEG | OP_A(ra) | OP_B(rb);
953

954
	if (buf)
955
		emit_4_bytes(buf, insn);
956
	return INSN_len_normal;
957
}
958

959
/* mpy Ra,Rb,Rc */
960
static u8 arc_mpy_r(u8 *buf, u8 ra, u8 rb, u8 rc)
961
{
962
	const u32 insn = OPC_MPY | OP_A(ra) | OP_B(rb) | OP_C(rc);
963

964
	if (buf)
965
		emit_4_bytes(buf, insn);
966
	return INSN_len_normal;
967
}
968

969
/* mpy Ra,Rb,imm */
970
static u8 arc_mpy_i(u8 *buf, u8 ra, u8 rb, s32 imm)
971
{
972
	const u32 insn = OPC_MPYI | OP_A(ra) | OP_B(rb);
973

974
	if (buf) {
975
		emit_4_bytes(buf, insn);
976
		emit_4_bytes(buf + INSN_len_normal, imm);
977
	}
978
	return INSN_len_normal + INSN_len_imm;
979
}
980

981
/* mpydu Ra,Ra,Rc */
982
static u8 arc_mpydu_r(u8 *buf, u8 ra, u8 rc)
983
{
984
	const u32 insn = OPC_MPYDU | OP_A(ra) | OP_B(ra) | OP_C(rc);
985

986
	if (buf)
987
		emit_4_bytes(buf, insn);
988
	return INSN_len_normal;
989
}
990

991
/* mpydu Ra,Ra,imm */
992
static u8 arc_mpydu_i(u8 *buf, u8 ra, s32 imm)
993
{
994
	const u32 insn = OPC_MPYDUI | OP_A(ra) | OP_B(ra);
995

996
	if (buf) {
997
		emit_4_bytes(buf, insn);
998
		emit_4_bytes(buf + INSN_len_normal, imm);
999
	}
1000
	return INSN_len_normal + INSN_len_imm;
1001
}
1002

1003
/* divu Rd,Rd,Rs */
1004
static u8 arc_divu_r(u8 *buf, u8 rd, u8 rs)
1005
{
1006
	const u32 insn = OPC_DIVU | OP_A(rd) | OP_B(rd) | OP_C(rs);
1007

1008
	if (buf)
1009
		emit_4_bytes(buf, insn);
1010
	return INSN_len_normal;
1011
}
1012

1013
/* divu Rd,Rd,imm */
1014
static u8 arc_divu_i(u8 *buf, u8 rd, s32 imm)
1015
{
1016
	const u32 insn = OPC_DIVUI | OP_A(rd) | OP_B(rd);
1017

1018
	if (buf) {
1019
		emit_4_bytes(buf, insn);
1020
		emit_4_bytes(buf + INSN_len_normal, imm);
1021
	}
1022
	return INSN_len_normal + INSN_len_imm;
1023
}
1024

1025
/* div Rd,Rd,Rs */
1026
static u8 arc_divs_r(u8 *buf, u8 rd, u8 rs)
1027
{
1028
	const u32 insn = OPC_DIVS | OP_A(rd) | OP_B(rd) | OP_C(rs);
1029

1030
	if (buf)
1031
		emit_4_bytes(buf, insn);
1032
	return INSN_len_normal;
1033
}
1034

1035
/* div Rd,Rd,imm */
1036
static u8 arc_divs_i(u8 *buf, u8 rd, s32 imm)
1037
{
1038
	const u32 insn = OPC_DIVSI | OP_A(rd) | OP_B(rd);
1039

1040
	if (buf) {
1041
		emit_4_bytes(buf, insn);
1042
		emit_4_bytes(buf + INSN_len_normal, imm);
1043
	}
1044
	return INSN_len_normal + INSN_len_imm;
1045
}
1046

1047
/* remu Rd,Rd,Rs */
1048
static u8 arc_remu_r(u8 *buf, u8 rd, u8 rs)
1049
{
1050
	const u32 insn = OPC_REMU | OP_A(rd) | OP_B(rd) | OP_C(rs);
1051

1052
	if (buf)
1053
		emit_4_bytes(buf, insn);
1054
	return INSN_len_normal;
1055
}
1056

1057
/* remu Rd,Rd,imm */
1058
static u8 arc_remu_i(u8 *buf, u8 rd, s32 imm)
1059
{
1060
	const u32 insn = OPC_REMUI | OP_A(rd) | OP_B(rd);
1061

1062
	if (buf) {
1063
		emit_4_bytes(buf, insn);
1064
		emit_4_bytes(buf + INSN_len_normal, imm);
1065
	}
1066
	return INSN_len_normal + INSN_len_imm;
1067
}
1068

1069
/* rem Rd,Rd,Rs */
1070
static u8 arc_rems_r(u8 *buf, u8 rd, u8 rs)
1071
{
1072
	const u32 insn = OPC_REMS | OP_A(rd) | OP_B(rd) | OP_C(rs);
1073

1074
	if (buf)
1075
		emit_4_bytes(buf, insn);
1076
	return INSN_len_normal;
1077
}
1078

1079
/* rem Rd,Rd,imm */
1080
static u8 arc_rems_i(u8 *buf, u8 rd, s32 imm)
1081
{
1082
	const u32 insn = OPC_REMSI | OP_A(rd) | OP_B(rd);
1083

1084
	if (buf) {
1085
		emit_4_bytes(buf, insn);
1086
		emit_4_bytes(buf + INSN_len_normal, imm);
1087
	}
1088
	return INSN_len_normal + INSN_len_imm;
1089
}
1090

1091
/* and Rd,Rd,Rs */
1092
static u8 arc_and_r(u8 *buf, u8 rd, u8 rs)
1093
{
1094
	const u32 insn = OPC_AND | OP_A(rd) | OP_B(rd) | OP_C(rs);
1095

1096
	if (buf)
1097
		emit_4_bytes(buf, insn);
1098
	return INSN_len_normal;
1099
}
1100

1101
/* and Rd,Rd,limm */
1102
static u8 arc_and_i(u8 *buf, u8 rd, s32 imm)
1103
{
1104
	const u32 insn = OPC_ANDI | OP_A(rd) | OP_B(rd);
1105

1106
	if (buf) {
1107
		emit_4_bytes(buf, insn);
1108
		emit_4_bytes(buf + INSN_len_normal, imm);
1109
	}
1110
	return INSN_len_normal + INSN_len_imm;
1111
}
1112

1113
/* tst Rd,Rs */
1114
static u8 arc_tst_r(u8 *buf, u8 rd, u8 rs)
1115
{
1116
	const u32 insn = OPC_TST | OP_B(rd) | OP_C(rs);
1117

1118
	if (buf)
1119
		emit_4_bytes(buf, insn);
1120
	return INSN_len_normal;
1121
}
1122

1123
/*
1124
 * This particular version, "tst.z ...", is meant to be used after a
1125
 * "tst" on the low 32-bit of register pairs. If that "tst" is not
1126
 * zero, then we don't need to test the upper 32-bits lest it sets
1127
 * the zero flag.
1128
 */
1129
static u8 arc_tstz_r(u8 *buf, u8 rd, u8 rs)
1130
{
1131
	const u32 insn = OPC_TST | OP_B(rd) | OP_C(rs) | CC_equal;
1132

1133
	if (buf)
1134
		emit_4_bytes(buf, insn);
1135
	return INSN_len_normal;
1136
}
1137

1138
/* or Rd,Rs1,Rs2 */
static u8 arc_or_r(u8 *buf, u8 rd, u8 rs1, u8 rs2)
{
	if (buf) {
		const u32 word = OPC_OR | OP_A(rd) | OP_B(rs1) | OP_C(rs2);

		emit_4_bytes(buf, word);
	}

	return INSN_len_normal;
}
1146

1147
/* or Rd,Rd,imm -- the immediate follows right after the instruction word. */
static u8 arc_or_i(u8 *buf, u8 rd, s32 imm)
{
	const u32 word = OPC_ORI | OP_A(rd) | OP_B(rd);

	if (!buf)
		return INSN_len_normal + INSN_len_imm;

	emit_4_bytes(buf, word);
	emit_4_bytes(buf + INSN_len_normal, imm);

	return INSN_len_normal + INSN_len_imm;
}
1157

1158
/* xor Rd,Rd,Rs */
static u8 arc_xor_r(u8 *buf, u8 rd, u8 rs)
{
	if (buf) {
		const u32 word = OPC_XOR | OP_A(rd) | OP_B(rd) | OP_C(rs);

		emit_4_bytes(buf, word);
	}

	return INSN_len_normal;
}
1166

1167
/* xor Rd,Rd,imm -- the immediate follows right after the instruction word. */
static u8 arc_xor_i(u8 *buf, u8 rd, s32 imm)
{
	const u32 word = OPC_XORI | OP_A(rd) | OP_B(rd);

	if (!buf)
		return INSN_len_normal + INSN_len_imm;

	emit_4_bytes(buf, word);
	emit_4_bytes(buf + INSN_len_normal, imm);

	return INSN_len_normal + INSN_len_imm;
}
1177

1178
/* not Rd,Rs -- "rd" goes into the B field and "rs" into the C field. */
static u8 arc_not_r(u8 *buf, u8 rd, u8 rs)
{
	if (buf) {
		const u32 word = OPC_NOT | OP_B(rd) | OP_C(rs);

		emit_4_bytes(buf, word);
	}

	return INSN_len_normal;
}
1186

1187
/* btst Rs,u6 -- "imm" is encoded through BTST_U6(). */
static u8 arc_btst_i(u8 *buf, u8 rs, u8 imm)
{
	if (buf) {
		const u32 word = OPC_BTSTU6 | OP_B(rs) | BTST_U6(imm);

		emit_4_bytes(buf, word);
	}

	return INSN_len_normal;
}
1195

1196
/* asl Rd,Rs1,Rs2 */
static u8 arc_asl_r(u8 *buf, u8 rd, u8 rs1, u8 rs2)
{
	if (buf) {
		const u32 word = OPC_ASL | OP_A(rd) | OP_B(rs1) | OP_C(rs2);

		emit_4_bytes(buf, word);
	}

	return INSN_len_normal;
}
1204

1205
/* asl Rd,Rs,u6 -- "imm" is encoded through ASLI_U6(). */
static u8 arc_asli_r(u8 *buf, u8 rd, u8 rs, u8 imm)
{
	if (buf) {
		const u32 word = OPC_ASLI | OP_A(rd) | OP_B(rs) | ASLI_U6(imm);

		emit_4_bytes(buf, word);
	}

	return INSN_len_normal;
}
1213

1214
/* asr Rd,Rs1,Rs2 */
static u8 arc_asr_r(u8 *buf, u8 rd, u8 rs1, u8 rs2)
{
	if (buf) {
		const u32 word = OPC_ASR | OP_A(rd) | OP_B(rs1) | OP_C(rs2);

		emit_4_bytes(buf, word);
	}

	return INSN_len_normal;
}
1222

1223
/* asr Rd,Rs,u6 -- "imm" is encoded through ASRI_U6(). */
static u8 arc_asri_r(u8 *buf, u8 rd, u8 rs, u8 imm)
{
	if (buf) {
		const u32 word = OPC_ASRI | OP_A(rd) | OP_B(rs) | ASRI_U6(imm);

		emit_4_bytes(buf, word);
	}

	return INSN_len_normal;
}
1231

1232
/* lsr Rd,Rs1,Rs2 */
static u8 arc_lsr_r(u8 *buf, u8 rd, u8 rs1, u8 rs2)
{
	if (buf) {
		const u32 word = OPC_LSR | OP_A(rd) | OP_B(rs1) | OP_C(rs2);

		emit_4_bytes(buf, word);
	}

	return INSN_len_normal;
}
1240

1241
/* lsr Rd,Rs,u6 -- "imm" is encoded through LSRI_U6(). */
static u8 arc_lsri_r(u8 *buf, u8 rd, u8 rs, u8 imm)
{
	if (buf) {
		const u32 word = OPC_LSRI | OP_A(rd) | OP_B(rs) | LSRI_U6(imm);

		emit_4_bytes(buf, word);
	}

	return INSN_len_normal;
}
1249

1250
/* swape R,R -- the same register is placed in both the B and C fields. */
static u8 arc_swape_r(u8 *buf, u8 r)
{
	if (buf) {
		const u32 word = OPC_SWAPE | OP_B(r) | OP_C(r);

		emit_4_bytes(buf, word);
	}

	return INSN_len_normal;
}
1258

1259
/* Emit the OPC_J_BLINK instruction word; no operands are encoded. */
static u8 arc_jmp_return(u8 *buf)
{
	if (!buf)
		return INSN_len_normal;

	emit_4_bytes(buf, OPC_J_BLINK);

	return INSN_len_normal;
}
1265

1266
/* Emit OPC_JL with "reg" placed in the C operand field. */
static u8 arc_jl(u8 *buf, u8 reg)
{
	if (buf) {
		const u32 word = OPC_JL | OP_C(reg);

		emit_4_bytes(buf, word);
	}

	return INSN_len_normal;
}
1274

1275
/*
1276
 * Conditional jump to an address that is max 21 bits away (signed).
1277
 *
1278
 * b<cc> s21
1279
 */
1280
static u8 arc_bcc(u8 *buf, u8 cc, int offset)
1281
{
1282
	const u32 insn = OPC_BCC | BCC_S21(offset) | COND(cc);
1283

1284
	if (buf)
1285
		emit_4_bytes(buf, insn);
1286
	return INSN_len_normal;
1287
}
1288

1289
/*
1290
 * Unconditional jump to an address that is max 25 bits away (signed).
1291
 *
1292
 * b     s25
1293
 */
1294
static u8 arc_b(u8 *buf, s32 offset)
1295
{
1296
	const u32 insn = OPC_B | B_S25(offset);
1297

1298
	if (buf)
1299
		emit_4_bytes(buf, insn);
1300
	return INSN_len_normal;
1301
}
1302

1303
/************* Packers (Deal with BPF_REGs) **************/
1304

1305
/* Clear the HI half of "rd"; nothing is emitted for BPF_REG_FP. */
u8 zext(u8 *buf, u8 rd)
{
	if (rd == BPF_REG_FP)
		return 0;

	return arc_movi_r(buf, REG_HI(rd), 0);
}
1312

1313
/*
 * Move the LO half of "rs" into the LO half of "rd".
 *
 * sign_ext == 8 or 16 emits "sexb"/"sexh". 0 (unsigned) and 32 emit a
 * plain move, which is skipped when rd and rs are the same register.
 * Any other value emits nothing.
 */
u8 mov_r32(u8 *buf, u8 rd, u8 rs, u8 sign_ext)
{
	switch (sign_ext) {
	case 8:
		return arc_sexb_r(buf, REG_LO(rd), REG_LO(rs));
	case 16:
		return arc_sexh_r(buf, REG_LO(rd), REG_LO(rs));
	case 0:
	case 32:
		if (rd == rs)
			return 0;
		return arc_mov_r(buf, REG_LO(rd), REG_LO(rs));
	default:
		return 0;
	}
}
1335

1336
/* Load "imm" into the LO half of "reg"; the HI half is not touched. */
u8 mov_r32_i32(u8 *buf, u8 reg, s32 imm)
{
	const u8 dst = REG_LO(reg);

	return arc_mov_i(buf, dst, imm);
}
1340

1341
/*
 * 64-bit register move, optionally sign-extending from 8, 16 or 32 bits.
 * An unsigned move between identical registers emits nothing.
 */
u8 mov_r64(u8 *buf, u8 rd, u8 rs, u8 sign_ext)
{
	u8 len;

	if (!sign_ext) {
		/* Unsigned move. */
		if (rd == rs)
			return 0;

		len = arc_mov_r(buf, REG_LO(rd), REG_LO(rs));

		/* BPF_REG_FP is mapped to 32-bit "fp" register: HI is 0. */
		if (rs == BPF_REG_FP)
			len += arc_movi_r(BUF(buf, len), REG_HI(rd), 0);
		else
			len += arc_mov_r(BUF(buf, len), REG_HI(rd), REG_HI(rs));

		return len;
	}

	/* The low 32-bit part is handled first. */
	len = mov_r32(buf, rd, rs, sign_ext);

	/* Then the sign bit of LO is propagated to HI. */
	switch (sign_ext) {
	case 8:
	case 16:
	case 32:
		len += arc_asri_r(BUF(buf, len), REG_HI(rd), REG_LO(rd), 31);
		break;
	default:
		break;
	}

	return len;
}
1373

1374
/* Sign extend the 32-bit immediate into 64-bit register pair. */
1375
u8 mov_r64_i32(u8 *buf, u8 reg, s32 imm)
1376
{
1377
	u8 len = 0;
1378

1379
	len = arc_mov_i(buf, REG_LO(reg), imm);
1380

1381
	/* BPF_REG_FP is mapped to 32-bit "fp" register. */
1382
	if (reg != BPF_REG_FP) {
1383
		if (imm >= 0)
1384
			len += arc_movi_r(BUF(buf, len), REG_HI(reg), 0);
1385
		else
1386
			len += arc_movi_r(BUF(buf, len), REG_HI(reg), -1);
1387
	}
1388

1389
	return len;
1390
}
1391

1392
/*
1393
 * This is merely used for translation of "LD R, IMM64" instructions
1394
 * of the BPF. These sorts of instructions are sometimes used for
1395
 * relocations. If during the normal pass, the relocation value is
1396
 * not known, the BPF instruction may look something like:
1397
 *
1398
 * LD R <- 0x0000_0001_0000_0001
1399
 *
1400
 * Which will nicely translate to two 4-byte ARC instructions:
1401
 *
1402
 * mov R_lo, 1               # imm is small enough to be s12
1403
 * mov R_hi, 1               # same
1404
 *
1405
 * However, during the extra pass, the IMM64 will have changed
1406
 * to the resolved address and looks something like:
1407
 *
1408
 * LD R <- 0x0000_0000_1234_5678
1409
 *
1410
 * Now, the translated code will require 12 bytes:
1411
 *
1412
 * mov R_lo, 0x12345678      # this is an 8-byte instruction
1413
 * mov R_hi, 0               # still 4 bytes
1414
 *
1415
 * Which in practice will result in overwriting the following
1416
 * instruction. To avoid such cases, we will always emit codes
1417
 * with fixed sizes.
1418
 */
1419
u8 mov_r64_i64(u8 *buf, u8 reg, u32 lo, u32 hi)
{
	/* Both halves use the fixed-size move, whatever the values are. */
	const u8 lo_len = arc_mov_i_fixed(buf, REG_LO(reg), lo);
	const u8 hi_len = arc_mov_i_fixed(BUF(buf, lo_len), REG_HI(reg), hi);

	return lo_len + hi_len;
}
1428

1429
/*
1430
 * If the "off"set is too big (doesn't encode as S9) for:
1431
 *
1432
 *   {ld,st}  r, [rm, off]
1433
 *
1434
 * Then emit:
1435
 *
1436
 *   add r10, REG_LO(rm), off
1437
 *
1438
 * and make sure that r10 becomes the effective address:
1439
 *
1440
 *   {ld,st}  r, [r10, 0]
1441
 */
1442
static u8 adjust_mem_access(u8 *buf, s16 *off, u8 size,
			    u8 rm, u8 *arc_reg_mem)
{
	/* A BPF_DW access also touches "off + 4", which must fit too. */
	const bool encodable = IN_S9_RANGE(*off) &&
			       (size != BPF_DW || IN_S9_RANGE(*off + 4));
	u8 len = 0;

	if (encodable) {
		*arc_reg_mem = REG_LO(rm);
		return len;
	}

	/* Fold the offset into the temp register and use that as base. */
	len += arc_add_i(BUF(buf, len),
			 REG_LO(JIT_REG_TMP), REG_LO(rm), (u32)(*off));
	*arc_reg_mem = REG_LO(JIT_REG_TMP);
	*off = 0;

	return len;
}
1458

1459
/* store rs, [rd, off] */
1460
u8 store_r(u8 *buf, u8 rs, u8 rd, s16 off, u8 size)
1461
{
1462
	u8 len, arc_reg_mem;
1463

1464
	len = adjust_mem_access(buf, &off, size, rd, &arc_reg_mem);
1465

1466
	if (size == BPF_DW) {
1467
		len += arc_st_r(BUF(buf, len), REG_LO(rs), arc_reg_mem,
1468
				off, ZZ_4_byte);
1469
		len += arc_st_r(BUF(buf, len), REG_HI(rs), arc_reg_mem,
1470
				off + 4, ZZ_4_byte);
1471
	} else {
1472
		u8 zz = bpf_to_arc_size(size);
1473

1474
		len += arc_st_r(BUF(buf, len), REG_LO(rs), arc_reg_mem,
1475
				off, zz);
1476
	}
1477

1478
	return len;
1479
}
1480

1481
/*
1482
 * For {8,16,32}-bit stores:
1483
 *   mov r21, imm
1484
 *   st  r21, [...]
1485
 * For 64-bit stores:
1486
 *   mov r21, imm
1487
 *   st  r21, [...]
1488
 *   mov r21, {0,-1}
1489
 *   st  r21, [...+4]
1490
 */
1491
u8 store_i(u8 *buf, s32 imm, u8 rd, s16 off, u8 size)
{
	/*
	 * The immediate is staged in the HI half of the temp pair, because
	 * "adjust_mem_access()" might use REG_LO(JIT_REG_TMP) as the base.
	 */
	const u8 scratch = REG_HI(JIT_REG_TMP);
	u8 base;
	u8 len = adjust_mem_access(buf, &off, size, rd, &base);

	if (size != BPF_DW) {
		const u8 zz = bpf_to_arc_size(size);

		len += arc_mov_i(BUF(buf, len), scratch, imm);
		len += arc_st_r(BUF(buf, len), scratch, base, off, zz);
		return len;
	}

	/* Low word: the immediate itself. */
	len += arc_mov_i(BUF(buf, len), scratch, imm);
	len += arc_st_r(BUF(buf, len), scratch, base, off, ZZ_4_byte);

	/* High word: the sign extension of the immediate (0 or -1). */
	imm = (imm < 0) ? -1 : 0;
	len += arc_mov_i(BUF(buf, len), scratch, imm);
	len += arc_st_r(BUF(buf, len), scratch, base, off + 4, ZZ_4_byte);

	return len;
}
1516

1517
/*
1518
 * For the calling convention of a little endian machine, the LO part
1519
 * must be on top of the stack.
1520
 */
1521
static u8 push_r64(u8 *buf, u8 reg)
{
	/* BPF_REG_FP is mapped to 32-bit "fp" register: no HI half to push. */
	const bool push_hi = (reg != BPF_REG_FP);
	u8 len = 0;

#ifdef __LITTLE_ENDIAN
	/* HI is pushed first, LO last. */
	if (push_hi)
		len += arc_push_r(BUF(buf, len), REG_HI(reg));
	len += arc_push_r(BUF(buf, len), REG_LO(reg));
#else
	/* LO is pushed first, HI last. */
	len += arc_push_r(BUF(buf, len), REG_LO(reg));
	if (push_hi)
		len += arc_push_r(BUF(buf, len), REG_HI(reg));
#endif

	return len;
}
1538

1539
/* load rd, [rs, off] */
1540
u8 load_r(u8 *buf, u8 rd, u8 rs, s16 off, u8 size, bool sign_ext)
1541
{
1542
	u8 len, arc_reg_mem;
1543

1544
	len = adjust_mem_access(buf, &off, size, rs, &arc_reg_mem);
1545

1546
	if (size == BPF_B || size == BPF_H || size == BPF_W) {
1547
		const u8 zz = bpf_to_arc_size(size);
1548

1549
		/* Use LD.X only if the data size is less than 32-bit. */
1550
		if (sign_ext && (zz == ZZ_1_byte || zz == ZZ_2_byte)) {
1551
			len += arc_ldx_r(BUF(buf, len), REG_LO(rd),
1552
					 arc_reg_mem, off, zz);
1553
		} else {
1554
			len += arc_ld_r(BUF(buf, len), REG_LO(rd),
1555
					arc_reg_mem, off, zz);
1556
		}
1557

1558
		if (sign_ext) {
1559
			/* Propagate the sign bit to the higher reg. */
1560
			len += arc_asri_r(BUF(buf, len),
1561
					  REG_HI(rd), REG_LO(rd), 31);
1562
		} else {
1563
			len += arc_movi_r(BUF(buf, len), REG_HI(rd), 0);
1564
		}
1565
	} else if (size == BPF_DW) {
1566
		/*
1567
		 * We are about to issue 2 consecutive loads:
1568
		 *
1569
		 *   ld rx, [rb, off+0]
1570
		 *   ld ry, [rb, off+4]
1571
		 *
1572
		 * If "rx" and "rb" are the same registers, then the order
1573
		 * should change to guarantee that "rb" remains intact
1574
		 * during these 2 operations:
1575
		 *
1576
		 *   ld ry, [rb, off+4]
1577
		 *   ld rx, [rb, off+0]
1578
		 */
1579
		if (REG_LO(rd) != arc_reg_mem) {
1580
			len += arc_ld_r(BUF(buf, len), REG_LO(rd), arc_reg_mem,
1581
					off, ZZ_4_byte);
1582
			len += arc_ld_r(BUF(buf, len), REG_HI(rd), arc_reg_mem,
1583
					off + 4, ZZ_4_byte);
1584
		} else {
1585
			len += arc_ld_r(BUF(buf, len), REG_HI(rd), arc_reg_mem,
1586
					off + 4, ZZ_4_byte);
1587
			len += arc_ld_r(BUF(buf, len), REG_LO(rd), arc_reg_mem,
1588
					off, ZZ_4_byte);
1589
		}
1590
	}
1591

1592
	return len;
1593
}
1594

1595
/* 32-bit add on the LO halves: rd += rs. */
u8 add_r32(u8 *buf, u8 rd, u8 rs)
{
	const u8 dst = REG_LO(rd);
	const u8 src = REG_LO(rs);

	return arc_add_r(buf, dst, src);
}
1599

1600
/* 32-bit add of an immediate; the short form is used when "imm" is a u6. */
u8 add_r32_i32(u8 *buf, u8 rd, s32 imm)
{
	if (!IN_U6_RANGE(imm))
		return arc_add_i(buf, REG_LO(rd), REG_LO(rd), imm);

	return arc_addi_r(buf, REG_LO(rd), imm);
}
1607

1608
/* 64-bit add: "addf" on the LO halves followed by "adc" on the HI halves. */
u8 add_r64(u8 *buf, u8 rd, u8 rs)
{
	u8 len = arc_addf_r(buf, REG_LO(rd), REG_LO(rs));

	len += arc_adc_r(BUF(buf, len), REG_HI(rd), REG_HI(rs));

	return len;
}
1616

1617
/* 64-bit add of a 32-bit immediate. */
u8 add_r64_i32(u8 *buf, u8 rd, s32 imm)
{
	u8 len;

	if (!IN_U6_RANGE(imm)) {
		/* Put the immediate in the temp pair, then add the pairs. */
		len  = mov_r64_i32(buf, JIT_REG_TMP, imm);
		len += add_r64(BUF(buf, len), rd, JIT_REG_TMP);
		return len;
	}

	/* u6 immediate: "addif" on LO followed by "adci" of 0 on HI. */
	len  = arc_addif_r(buf, REG_LO(rd), imm);
	len += arc_adci_r(BUF(buf, len), REG_HI(rd), 0);

	return len;
}
1630

1631
/* 32-bit subtract on the LO halves: rd -= rs. */
u8 sub_r32(u8 *buf, u8 rd, u8 rs)
{
	const u8 dst = REG_LO(rd);
	const u8 src = REG_LO(rs);

	return arc_sub_r(buf, dst, src);
}
1635

1636
/* 32-bit subtract of an immediate; short form when "imm" is a u6. */
u8 sub_r32_i32(u8 *buf, u8 rd, s32 imm)
{
	if (!IN_U6_RANGE(imm))
		return arc_sub_i(buf, REG_LO(rd), imm);

	return arc_subi_r(buf, REG_LO(rd), imm);
}
1643

1644
/* 64-bit subtract: "subf" on the LO halves, then "sbc" on the HI halves. */
u8 sub_r64(u8 *buf, u8 rd, u8 rs)
{
	u8 len = arc_subf_r(buf, REG_LO(rd), REG_LO(rs));

	len += arc_sbc_r(BUF(buf, len), REG_HI(rd), REG_HI(rs));

	return len;
}
1652

1653
/* 64-bit subtract of an immediate, staged through the temp pair. */
u8 sub_r64_i32(u8 *buf, u8 rd, s32 imm)
{
	u8 len = mov_r64_i32(buf, JIT_REG_TMP, imm);

	len += sub_r64(BUF(buf, len), rd, JIT_REG_TMP);

	return len;
}
1661

1662
/* Compare the LO halves of "rd" and "rs". */
static u8 cmp_r32(u8 *buf, u8 rd, u8 rs)
{
	const u8 lhs = REG_LO(rd);
	const u8 rhs = REG_LO(rs);

	return arc_cmp_r(buf, lhs, rhs);
}
1666

1667
/* Negate the LO half of "r" in place. */
u8 neg_r32(u8 *buf, u8 r)
{
	const u8 lo = REG_LO(r);

	return arc_neg_r(buf, lo, lo);
}
1671

1672
/* In a two's complement system, -r is (~r + 1). */
1673
u8 neg_r64(u8 *buf, u8 r)
1674
{
1675
	u8 len;
1676

1677
	len  = arc_not_r(buf, REG_LO(r), REG_LO(r));
1678
	len += arc_not_r(BUF(buf, len), REG_HI(r), REG_HI(r));
1679
	len += add_r64_i32(BUF(buf, len), r, 1);
1680
	return len;
1681
}
1682

1683
/* 32-bit multiply on the LO halves: rd *= rs. */
u8 mul_r32(u8 *buf, u8 rd, u8 rs)
{
	const u8 dst = REG_LO(rd);

	return arc_mpy_r(buf, dst, dst, REG_LO(rs));
}
1687

1688
/* 32-bit multiply of the LO half by an immediate. */
u8 mul_r32_i32(u8 *buf, u8 rd, s32 imm)
{
	const u8 dst = REG_LO(rd);

	return arc_mpy_i(buf, dst, dst, imm);
}
1692

1693
/*
1694
 * MUL B, C
1695
 * --------
1696
 * mpy       t0, B_hi, C_lo
1697
 * mpy       t1, B_lo, C_hi
1698
 * mpydu   B_lo, B_lo, C_lo
1699
 * add     B_hi, B_hi,   t0
1700
 * add     B_hi, B_hi,   t1
1701
 */
1702
u8 mul_r64(u8 *buf, u8 rd, u8 rs)
{
	const u8 dst_lo = REG_LO(rd);
	const u8 dst_hi = REG_HI(rd);
	const u8 src_lo = REG_LO(rs);
	const u8 src_hi = REG_HI(rs);
	const u8 cross0 = REG_LO(JIT_REG_TMP);
	const u8 cross1 = REG_HI(JIT_REG_TMP);
	u8 len;

	/* The two cross products go to the temporaries first. */
	len  = arc_mpy_r(buf, cross0, dst_hi, src_lo);
	len += arc_mpy_r(BUF(buf, len), cross1, dst_lo, src_hi);

	/* Then the LO x LO product, and both cross products added to HI. */
	len += arc_mpydu_r(BUF(buf, len), dst_lo, src_lo);
	len += arc_add_r(BUF(buf, len), dst_hi, cross0);
	len += arc_add_r(BUF(buf, len), dst_hi, cross1);

	return len;
}
1720

1721
/*
1722
 * MUL B, imm
1723
 * ----------
1724
 *
1725
 *  To get a 64-bit result from a signed 64x32 multiplication:
1726
 *
1727
 *         B_hi             B_lo   *
1728
 *         sign             imm
1729
 *  -----------------------------
1730
 *  HI(B_lo*imm)     LO(B_lo*imm)  +
1731
 *     B_hi*imm                    +
1732
 *     B_lo*sign
1733
 *  -----------------------------
1734
 *        res_hi           res_lo
1735
 *
1736
 * mpy     t1, B_lo, sign(imm)
1737
 * mpy     t0, B_hi, imm
1738
 * mpydu B_lo, B_lo, imm
1739
 * add   B_hi, B_hi,  t0
1740
 * add   B_hi, B_hi,  t1
1741
 *
1742
 * Note: We can't use signed double multiplication, "mpyd", instead of an
1743
 * unsigned version, "mpydu", and then get rid of the sign adjustments
1744
 * calculated in "t1". The signed multiplication, "mpyd", will consider
1745
 * both operands, "B_lo" and "imm", as signed inputs. However, for this
1746
 * 64x32 multiplication, "B_lo" must be treated as an unsigned number.
1747
 */
1748
u8 mul_r64_i32(u8 *buf, u8 rd, s32 imm)
{
	const u8 dst_lo = REG_LO(rd);
	const u8 dst_hi = REG_HI(rd);
	const u8 prod   = REG_LO(JIT_REG_TMP);
	const u8 adjust = REG_HI(JIT_REG_TMP);
	/* A negative immediate sign-extends to "-1" in its upper word. */
	const bool negative = (imm < 0);
	u8 len = 0;

	/* Multiplying by one changes nothing, so nothing is emitted. */
	if (imm == 1)
		return 0;

	/* "sign * B_lo" with sign == -1 is simply the negation of B_lo. */
	if (negative)
		len += arc_neg_r(BUF(buf, len), adjust, dst_lo);

	len += arc_mpy_i(BUF(buf, len), prod, dst_hi, imm);
	len += arc_mpydu_i(BUF(buf, len), dst_lo, imm);
	len += arc_add_r(BUF(buf, len), dst_hi, prod);

	/* The "sign * B_lo" part is added only if it was computed. */
	if (negative)
		len += arc_add_r(BUF(buf, len), dst_hi, adjust);

	return len;
}
1773

1774
/* 32-bit divide on the LO halves; "sign_ext" picks the signed variant. */
u8 div_r32(u8 *buf, u8 rd, u8 rs, bool sign_ext)
{
	return sign_ext ? arc_divs_r(buf, REG_LO(rd), REG_LO(rs))
			: arc_divu_r(buf, REG_LO(rd), REG_LO(rs));
}
1781

1782
/*
 * 32-bit divide of the LO half by an immediate; "sign_ext" picks the
 * signed variant. Nothing is emitted for a zero immediate.
 */
u8 div_r32_i32(u8 *buf, u8 rd, s32 imm, bool sign_ext)
{
	if (!imm)
		return 0;

	return sign_ext ? arc_divs_i(buf, REG_LO(rd), imm)
			: arc_divu_i(buf, REG_LO(rd), imm);
}
1792

1793
/* 32-bit remainder on the LO halves; "sign_ext" picks the signed variant. */
u8 mod_r32(u8 *buf, u8 rd, u8 rs, bool sign_ext)
{
	return sign_ext ? arc_rems_r(buf, REG_LO(rd), REG_LO(rs))
			: arc_remu_r(buf, REG_LO(rd), REG_LO(rs));
}
1800

1801
/*
 * 32-bit remainder of the LO half by an immediate; "sign_ext" picks the
 * signed variant. Nothing is emitted for a zero immediate.
 */
u8 mod_r32_i32(u8 *buf, u8 rd, s32 imm, bool sign_ext)
{
	if (!imm)
		return 0;

	return sign_ext ? arc_rems_i(buf, REG_LO(rd), imm)
			: arc_remu_i(buf, REG_LO(rd), imm);
}
1811

1812
/* 32-bit AND on the LO halves: rd &= rs. */
u8 and_r32(u8 *buf, u8 rd, u8 rs)
{
	const u8 dst = REG_LO(rd);
	const u8 src = REG_LO(rs);

	return arc_and_r(buf, dst, src);
}
1816

1817
/* 32-bit AND of the LO half with an immediate. */
u8 and_r32_i32(u8 *buf, u8 rd, s32 imm)
{
	const u8 dst = REG_LO(rd);

	return arc_and_i(buf, dst, imm);
}
1821

1822
/* 64-bit AND: the LO and HI halves are handled independently. */
u8 and_r64(u8 *buf, u8 rd, u8 rs)
{
	u8 len = arc_and_r(buf, REG_LO(rd), REG_LO(rs));

	len += arc_and_r(BUF(buf, len), REG_HI(rd), REG_HI(rs));

	return len;
}
1830

1831
/* 64-bit AND with an immediate, staged through the temp pair. */
u8 and_r64_i32(u8 *buf, u8 rd, s32 imm)
{
	u8 len = mov_r64_i32(buf, JIT_REG_TMP, imm);

	len += and_r64(BUF(buf, len), rd, JIT_REG_TMP);

	return len;
}
1839

1840
/* "tst" on the LO halves of "rd" and "rs". */
static u8 tst_r32(u8 *buf, u8 rd, u8 rs)
{
	const u8 lhs = REG_LO(rd);
	const u8 rhs = REG_LO(rs);

	return arc_tst_r(buf, lhs, rhs);
}
1844

1845
/* 32-bit OR on the LO halves: rd |= rs. */
u8 or_r32(u8 *buf, u8 rd, u8 rs)
{
	const u8 dst = REG_LO(rd);

	return arc_or_r(buf, dst, dst, REG_LO(rs));
}
1849

1850
/* 32-bit OR of the LO half with an immediate. */
u8 or_r32_i32(u8 *buf, u8 rd, s32 imm)
{
	const u8 dst = REG_LO(rd);

	return arc_or_i(buf, dst, imm);
}
1854

1855
/* 64-bit OR: the LO and HI halves are handled independently. */
u8 or_r64(u8 *buf, u8 rd, u8 rs)
{
	const u8 dst_lo = REG_LO(rd);
	const u8 dst_hi = REG_HI(rd);
	u8 len = arc_or_r(buf, dst_lo, dst_lo, REG_LO(rs));

	len += arc_or_r(BUF(buf, len), dst_hi, dst_hi, REG_HI(rs));

	return len;
}
1863

1864
/* 64-bit OR with an immediate, staged through the temp pair. */
u8 or_r64_i32(u8 *buf, u8 rd, s32 imm)
{
	u8 len = mov_r64_i32(buf, JIT_REG_TMP, imm);

	len += or_r64(BUF(buf, len), rd, JIT_REG_TMP);

	return len;
}
1872

1873
/* 32-bit XOR on the LO halves: rd ^= rs. */
u8 xor_r32(u8 *buf, u8 rd, u8 rs)
{
	const u8 dst = REG_LO(rd);
	const u8 src = REG_LO(rs);

	return arc_xor_r(buf, dst, src);
}
1877

1878
/* 32-bit XOR of the LO half with an immediate. */
u8 xor_r32_i32(u8 *buf, u8 rd, s32 imm)
{
	const u8 dst = REG_LO(rd);

	return arc_xor_i(buf, dst, imm);
}
1882

1883
/* 64-bit XOR: the LO and HI halves are handled independently. */
u8 xor_r64(u8 *buf, u8 rd, u8 rs)
{
	u8 len = arc_xor_r(buf, REG_LO(rd), REG_LO(rs));

	len += arc_xor_r(BUF(buf, len), REG_HI(rd), REG_HI(rs));

	return len;
}
1891

1892
/* 64-bit XOR with an immediate, staged through the temp pair. */
u8 xor_r64_i32(u8 *buf, u8 rd, s32 imm)
{
	u8 len = mov_r64_i32(buf, JIT_REG_TMP, imm);

	len += xor_r64(BUF(buf, len), rd, JIT_REG_TMP);

	return len;
}
1900

1901
/* "asl a,b,c" --> "a = (b << (c & 31))". */
1902
u8 lsh_r32(u8 *buf, u8 rd, u8 rs)
1903
{
1904
	return arc_asl_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs));
1905
}
1906

1907
/* 32-bit left shift of the LO half by an immediate amount. */
u8 lsh_r32_i32(u8 *buf, u8 rd, u8 imm)
{
	const u8 dst = REG_LO(rd);

	return arc_asli_r(buf, dst, dst, imm);
}
1911

1912
/*
1913
 * algorithm
1914
 * ---------
1915
 * if (n <= 32)
1916
 *   to_hi = lo >> (32-n)   # (32-n) is the negate of "n" in a 5-bit width.
1917
 *   lo <<= n
1918
 *   hi <<= n
1919
 *   hi |= to_hi
1920
 * else
1921
 *   hi = lo << (n-32)
1922
 *   lo = 0
1923
 *
1924
 * assembly translation for "LSH B, C"
1925
 * (heavily influenced by ARC gcc)
1926
 * -----------------------------------
1927
 * not    t0, C_lo            # The first 3 lines are almost the same as:
1928
 * lsr    t1, B_lo, 1         #   neg   t0, C_lo
1929
 * lsr    t1, t1, t0          #   lsr   t1, B_lo, t0   --> t1 is "to_hi"
1930
 * mov    t0, C_lo*           # with one important difference. In "neg"
1931
 * asl    B_lo, B_lo, t0      # version, when C_lo=0, t1 becomes B_lo while
1932
 * asl    B_hi, B_hi, t0      # it should be 0. The "not" approach instead,
1933
 * or     B_hi, B_hi, t1      # "shift"s t1 once and 31 times, practically
1934
 * btst   t0, 5               # setting it to 0 when C_lo=0.
1935
 * mov.ne B_hi, B_lo**
1936
 * mov.ne B_lo, 0
1937
 *
1938
 * *The "mov t0, C_lo" is necessary to cover the cases that C is the same
1939
 * register as B.
1940
 *
1941
 * **ARC performs a shift in this manner: B <<= (C & 31)
1942
 * For 32<=n<64, "n-32" and "n&31" are the same. Therefore, "B << n" and
1943
 * "B << (n-32)" yield the same results. e.g. the results of "B << 35" and
1944
 * "B << 3" are the same.
1945
 *
1946
 * The behaviour is undefined for n >= 64.
1947
 */
1948
u8 lsh_r64(u8 *buf, u8 rd, u8 rs)
{
	const u8 dst_lo = REG_LO(rd);
	const u8 dst_hi = REG_HI(rd);
	const u8 amount = REG_LO(rs);
	const u8 tmp0   = REG_LO(JIT_REG_TMP);
	const u8 tmp1   = REG_HI(JIT_REG_TMP);
	u8 len;

	/* tmp1 = (dst_lo >> 1) >> ~amount: the bits that spill into HI. */
	len  = arc_not_r(buf, tmp0, amount);
	len += arc_lsri_r(BUF(buf, len), tmp1, dst_lo, 1);
	len += arc_lsr_r(BUF(buf, len), tmp1, tmp1, tmp0);

	/* The amount is copied so that rs being the same as rd is fine. */
	len += arc_mov_r(BUF(buf, len), tmp0, amount);

	/* Both halves are shifted and the spilled bits merged into HI. */
	len += arc_asl_r(BUF(buf, len), dst_lo, dst_lo, tmp0);
	len += arc_asl_r(BUF(buf, len), dst_hi, dst_hi, tmp0);
	len += arc_or_r(BUF(buf, len), dst_hi, dst_hi, tmp1);

	/* Bit 5 of the amount set: HI takes the shifted LO, LO becomes 0. */
	len += arc_btst_i(BUF(buf, len), tmp0, 5);
	len += arc_mov_cc_r(BUF(buf, len), CC_unequal, dst_hi, dst_lo);
	len += arc_movu_cc_r(BUF(buf, len), CC_unequal, dst_lo, 0);

	return len;
}
1970

1971
/*
1972
 * if (n < 32)
1973
 *   to_hi = B_lo >> 32-n          # extract upper n bits
1974
 *   lo <<= n
1975
 *   hi <<=n
1976
 *   hi |= to_hi
1977
 * else if (n < 64)
1978
 *   hi = lo << n-32
1979
 *   lo = 0
1980
 */
1981
u8 lsh_r64_i32(u8 *buf, u8 rd, s32 imm)
{
	const u8 dst_lo = REG_LO(rd);
	const u8 dst_hi = REG_HI(rd);
	const u8 spill  = REG_LO(JIT_REG_TMP);
	const u8 n      = (u8)imm;
	u8 len;

	/* Nothing to do for 0; n >= 64 is undefined behaviour. */
	if (n == 0 || n > 63)
		return 0;

	if (n > 31) {
		/* Only LO contributes: it lands in HI, LO becomes 0. */
		len  = arc_asli_r(buf, dst_hi, dst_lo, n - 32);
		len += arc_movi_r(BUF(buf, len), dst_lo, 0);
		return len;
	}

	/* The upper n bits of LO are saved before both halves shift. */
	len  = arc_lsri_r(buf, spill, dst_lo, 32 - n);
	len += arc_asli_r(BUF(buf, len), dst_lo, dst_lo, n);
	len += arc_asli_r(BUF(buf, len), dst_hi, dst_hi, n);
	len += arc_or_r(BUF(buf, len), dst_hi, dst_hi, spill);

	return len;
}
2004

2005
/* "lsr a,b,c" --> "a = (b >> (c & 31))". */
2006
u8 rsh_r32(u8 *buf, u8 rd, u8 rs)
2007
{
2008
	return arc_lsr_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs));
2009
}
2010

2011
/* 32-bit logical right shift of the LO half by an immediate amount. */
u8 rsh_r32_i32(u8 *buf, u8 rd, u8 imm)
{
	const u8 dst = REG_LO(rd);

	return arc_lsri_r(buf, dst, dst, imm);
}
2015

2016
/*
2017
 * For better commentary, see lsh_r64().
2018
 *
2019
 * algorithm
2020
 * ---------
2021
 * if (n <= 32)
2022
 *   to_lo = hi << (32-n)
2023
 *   hi >>= n
2024
 *   lo >>= n
2025
 *   lo |= to_lo
2026
 * else
2027
 *   lo = hi >> (n-32)
2028
 *   hi = 0
2029
 *
2030
 * RSH    B,C
2031
 * ----------
2032
 * not    t0, C_lo
2033
 * asl    t1, B_hi, 1
2034
 * asl    t1, t1, t0
2035
 * mov    t0, C_lo
2036
 * lsr    B_hi, B_hi, t0
2037
 * lsr    B_lo, B_lo, t0
2038
 * or     B_lo, B_lo, t1
2039
 * btst   t0, 5
2040
 * mov.ne B_lo, B_hi
2041
 * mov.ne B_hi, 0
2042
 */
2043
u8 rsh_r64(u8 *buf, u8 rd, u8 rs)
{
	const u8 dst_lo = REG_LO(rd);
	const u8 dst_hi = REG_HI(rd);
	const u8 amount = REG_LO(rs);
	const u8 tmp0   = REG_LO(JIT_REG_TMP);
	const u8 tmp1   = REG_HI(JIT_REG_TMP);
	u8 len;

	/* tmp1 = (dst_hi << 1) << ~amount: the bits that spill into LO. */
	len  = arc_not_r(buf, tmp0, amount);
	len += arc_asli_r(BUF(buf, len), tmp1, dst_hi, 1);
	len += arc_asl_r(BUF(buf, len), tmp1, tmp1, tmp0);

	/* The amount is copied into a temporary before rd is modified. */
	len += arc_mov_r(BUF(buf, len), tmp0, amount);

	/* Both halves are shifted and the spilled bits merged into LO. */
	len += arc_lsr_r(BUF(buf, len), dst_hi, dst_hi, tmp0);
	len += arc_lsr_r(BUF(buf, len), dst_lo, dst_lo, tmp0);
	len += arc_or_r(BUF(buf, len), dst_lo, dst_lo, tmp1);

	/* Bit 5 of the amount set: LO takes the shifted HI, HI becomes 0. */
	len += arc_btst_i(BUF(buf, len), tmp0, 5);
	len += arc_mov_cc_r(BUF(buf, len), CC_unequal, dst_lo, dst_hi);
	len += arc_movu_cc_r(BUF(buf, len), CC_unequal, dst_hi, 0);

	return len;
}
2065

2066
/*
2067
 * if (n < 32)
2068
 *   to_lo = B_lo << 32-n     # extract lower n bits, right-padded with 32-n 0s
2069
 *   lo >>=n
2070
 *   hi >>=n
2071
 *   hi |= to_lo
2072
 * else if (n < 64)
2073
 *   lo = hi >> n-32
2074
 *   hi = 0
2075
 */
2076
u8 rsh_r64_i32(u8 *buf, u8 rd, s32 imm)
2077
{
2078
	const u8 t0   = REG_LO(JIT_REG_TMP);
2079
	const u8 B_lo = REG_LO(rd);
2080
	const u8 B_hi = REG_HI(rd);
2081
	const u8 n    = (u8)imm;
2082
	u8 len = 0;
2083

2084
	if (n == 0) {
2085
		return 0;
2086
	} else if (n <= 31) {
2087
		len  = arc_asli_r(buf, t0, B_hi, 32 - n);
2088
		len += arc_lsri_r(BUF(buf, len), B_lo, B_lo, n);
2089
		len += arc_lsri_r(BUF(buf, len), B_hi, B_hi, n);
2090
		len += arc_or_r(BUF(buf, len), B_lo, B_lo, t0);
2091
	} else if (n <= 63) {
2092
		len  = arc_lsri_r(buf, B_lo, B_hi, n - 32);
2093
		len += arc_movi_r(BUF(buf, len), B_hi, 0);
2094
	}
2095
	/* n >= 64 is undefined behaviour. */
2096

2097
	return len;
2098
}
2099

2100
/* "asr a,b,c" --> "a = (b s>> (c & 31))". */
2101
u8 arsh_r32(u8 *buf, u8 rd, u8 rs)
2102
{
2103
	return arc_asr_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs));
2104
}
2105

2106
/* 32-bit arithmetic right shift of the LO half by an immediate amount. */
u8 arsh_r32_i32(u8 *buf, u8 rd, u8 imm)
{
	const u8 dst = REG_LO(rd);

	return arc_asri_r(buf, dst, dst, imm);
}
2110

2111
/*
2112
 * For comparison, see rsh_r64().
2113
 *
2114
 * algorithm
2115
 * ---------
2116
 * if (n <= 32)
2117
 *   to_lo = hi << (32-n)
2118
 *   hi s>>= n
2119
 *   lo  >>= n
2120
 *   lo |= to_lo
2121
 * else
2122
 *   hi_sign = hi s>>31
2123
 *   lo = hi s>> (n-32)
2124
 *   hi = hi_sign
2125
 *
2126
 * ARSH   B,C
2127
 * ----------
2128
 * not    t0, C_lo
2129
 * asl    t1, B_hi, 1
2130
 * asl    t1, t1, t0
2131
 * mov    t0, C_lo
2132
 * asr    B_hi, B_hi, t0
2133
 * lsr    B_lo, B_lo, t0
2134
 * or     B_lo, B_lo, t1
2135
 * btst   t0, 5
2136
 * asr    t0, B_hi, 31        # now, t0 = 0 or -1 based on B_hi's sign
2137
 * mov.ne B_lo, B_hi
2138
 * mov.ne B_hi, t0
2139
 */
2140
u8 arsh_r64(u8 *buf, u8 rd, u8 rs)
{
	const u8 dst_lo = REG_LO(rd);
	const u8 dst_hi = REG_HI(rd);
	const u8 amount = REG_LO(rs);
	const u8 tmp0   = REG_LO(JIT_REG_TMP);
	const u8 tmp1   = REG_HI(JIT_REG_TMP);
	u8 len;

	/* tmp1 = (dst_hi << 1) << ~amount: the bits that spill into LO. */
	len  = arc_not_r(buf, tmp0, amount);
	len += arc_asli_r(BUF(buf, len), tmp1, dst_hi, 1);
	len += arc_asl_r(BUF(buf, len), tmp1, tmp1, tmp0);

	/* The amount is copied into a temporary before rd is modified. */
	len += arc_mov_r(BUF(buf, len), tmp0, amount);

	/* HI shifts arithmetically, LO logically; spilled bits go to LO. */
	len += arc_asr_r(BUF(buf, len), dst_hi, dst_hi, tmp0);
	len += arc_lsr_r(BUF(buf, len), dst_lo, dst_lo, tmp0);
	len += arc_or_r(BUF(buf, len), dst_lo, dst_lo, tmp1);

	/*
	 * Bit 5 of the amount is tested while tmp0 still holds it. After
	 * that, tmp0 is reused: 0 or -1 based on dst_hi's sign.
	 */
	len += arc_btst_i(BUF(buf, len), tmp0, 5);
	len += arc_asri_r(BUF(buf, len), tmp0, dst_hi, 31);

	/* Bit 5 set: LO takes the shifted HI, HI takes the sign word. */
	len += arc_mov_cc_r(BUF(buf, len), CC_unequal, dst_lo, dst_hi);
	len += arc_mov_cc_r(BUF(buf, len), CC_unequal, dst_hi, tmp0);

	return len;
}
2163

2164
/*
2165
 * if (n < 32)
2166
 *   to_lo = lo << 32-n     # extract lower n bits, right-padded with 32-n 0s
2167
 *   lo >>=n
2168
 *   hi s>>=n
2169
 *   hi |= to_lo
2170
 * else if (n < 64)
2171
 *   lo = hi s>> n-32
2172
 *   hi = (lo[msb] ? -1 : 0)
2173
 */
2174
u8 arsh_r64_i32(u8 *buf, u8 rd, s32 imm)
2175
{
2176
	const u8 t0   = REG_LO(JIT_REG_TMP);
2177
	const u8 B_lo = REG_LO(rd);
2178
	const u8 B_hi = REG_HI(rd);
2179
	const u8 n    = (u8)imm;
2180
	u8 len = 0;
2181

2182
	if (n == 0) {
2183
		return 0;
2184
	} else if (n <= 31) {
2185
		len  = arc_asli_r(buf, t0, B_hi, 32 - n);
2186
		len += arc_lsri_r(BUF(buf, len), B_lo, B_lo, n);
2187
		len += arc_asri_r(BUF(buf, len), B_hi, B_hi, n);
2188
		len += arc_or_r(BUF(buf, len), B_lo, B_lo, t0);
2189
	} else if (n <= 63) {
2190
		len  = arc_asri_r(buf, B_lo, B_hi, n - 32);
2191
		len += arc_movi_r(BUF(buf, len), B_hi, -1);
2192
		len += arc_btst_i(BUF(buf, len), B_lo, 31);
2193
		len += arc_movu_cc_r(BUF(buf, len), CC_equal, B_hi, 0);
2194
	}
2195
	/* n >= 64 is undefined behaviour. */
2196

2197
	return len;
2198
}
2199

2200
/*
 * Endianness conversion of "rd" for a 16, 32 or 64-bit "size".
 *
 * Bytes are swapped when "endian" differs from the host's or when
 * "force" is set. Otherwise only the upper bytes are zeroed out, based
 * on "size". Other sizes emit nothing: the caller must have handled them.
 */
u8 gen_swap(u8 *buf, u8 rd, u8 size, u8 endian, bool force, bool do_zext)
{
#ifdef __BIG_ENDIAN
	const u8 native = BPF_FROM_BE;
#else
	const u8 native = BPF_FROM_LE;
#endif
	const bool must_swap = force || native != endian;
	u8 len = 0;

	if (size == 64) {
		/* Same endianness: a 64-bit value needs no adjustment. */
		if (!must_swap)
			return 0;

		/*
		 * "hi" and "lo" are exchanged with three XORs:
		 *   hi ^= lo;
		 *   lo ^= hi;
		 *   hi ^= lo;
		 * and then the bytes within each of them are swapped.
		 */
		len  = arc_xor_r(buf, REG_HI(rd), REG_LO(rd));
		len += arc_xor_r(BUF(buf, len), REG_LO(rd), REG_HI(rd));
		len += arc_xor_r(BUF(buf, len), REG_HI(rd), REG_LO(rd));
		len += arc_swape_r(BUF(buf, len), REG_LO(rd));
		len += arc_swape_r(BUF(buf, len), REG_HI(rd));
		return len;
	}

	if (size != 16 && size != 32)
		return 0;

	if (size == 16) {
		if (must_swap) {
			/*
			 * r = B4B3_B2B1 << 16 --> r = B2B1_0000
			 * then, swape(r) would become the desired 0000_B1B2
			 */
			len = arc_asli_r(buf, REG_LO(rd), REG_LO(rd), 16);
		} else {
			/* Only the low 16 bits are kept. */
			len = arc_and_i(buf, REG_LO(rd), 0xffff);
		}
	}

	if (must_swap)
		len += arc_swape_r(BUF(buf, len), REG_LO(rd));

	if (do_zext)
		len += zext(BUF(buf, len), rd);

	return len;
}
2263

2264
/*
2265
 * To create a frame, all that is needed is:
2266
 *
2267
 *  push fp
2268
 *  mov  fp, sp
2269
 *  sub  sp, <frame_size>
2270
 *
2271
 * "push fp" is taken care of separately while saving the clobbered registers.
2272
 * All that remains is copying SP value to FP and shrinking SP's address space
2273
 * for any possible function call to come.
2274
 */
2275
static inline u8 frame_create(u8 *buf, u16 size)
2276
{
2277
	u8 len;
2278

2279
	len = arc_mov_r(buf, ARC_R_FP, ARC_R_SP);
2280
	if (IN_U6_RANGE(size))
2281
		len += arc_subi_r(BUF(buf, len), ARC_R_SP, size);
2282
	else
2283
		len += arc_sub_i(BUF(buf, len), ARC_R_SP, size);
2284
	return len;
2285
}
2286

2287
/*
2288
 * mov sp, fp
2289
 *
2290
 * The value of SP upon entering was copied to FP.
2291
 */
2292
static inline u8 frame_restore(u8 *buf)
2293
{
2294
	return arc_mov_r(buf, ARC_R_SP, ARC_R_FP);
2295
}
2296

2297
/*
2298
 * Going from a JITed code to the native caller:
2299
 *
2300
 * mov ARC_ABI_RET_lo, BPF_REG_0_lo     # r0 <- r8
2301
 * mov ARC_ABI_RET_hi, BPF_REG_0_hi     # r1 <- r9
2302
 */
2303
static u8 bpf_to_arc_return(u8 *buf)
2304
{
2305
	u8 len;
2306

2307
	len  = arc_mov_r(buf, ARC_R_0, REG_LO(BPF_REG_0));
2308
	len += arc_mov_r(BUF(buf, len), ARC_R_1, REG_HI(BPF_REG_0));
2309
	return len;
2310
}
2311

2312
/*
2313
 * Coming back from an external (in-kernel) function to the JITed code:
2314
 *
2315
 * mov ARC_ABI_RET_lo, BPF_REG_0_lo     # r8 <- r0
2316
 * mov ARC_ABI_RET_hi, BPF_REG_0_hi     # r9 <- r1
2317
 */
2318
u8 arc_to_bpf_return(u8 *buf)
2319
{
2320
	u8 len;
2321

2322
	len  = arc_mov_r(buf, REG_LO(BPF_REG_0), ARC_R_0);
2323
	len += arc_mov_r(BUF(buf, len), REG_HI(BPF_REG_0), ARC_R_1);
2324
	return len;
2325
}
2326

2327
/*
2328
 * This translation leads to:
2329
 *
2330
 *   mov r10, addr                # always an 8-byte instruction
2331
 *   jl  [r10]
2332
 *
2333
 * The length of the "mov" must be fixed (8), otherwise it may diverge
2334
 * during the normal and extra passes:
2335
 *
2336
 *          normal pass                  extra pass
2337
 *
2338
 *   180:  mov     r10,0        |   180:  mov     r10,0x700578d8
2339
 *   184:  jl      [r10]        |   188:  jl      [r10]
2340
 *   188:  add.f   r16,r16,0x1  |   18c:  adc     r17,r17,0
2341
 *   18c:  adc     r17,r17,0    |
2342
 *
2343
 * In the above example, the change from "r10 <- 0" to "r10 <- 0x700578d8"
2344
 * has led to an increase in the length of the "mov" instruction.
2345
 * Inadvertently, that caused the loss of the "add.f" instruction.
2346
 */
2347
static u8 jump_and_link(u8 *buf, u32 addr)
2348
{
2349
	u8 len;
2350

2351
	len  = arc_mov_i_fixed(buf, REG_LO(JIT_REG_TMP), addr);
2352
	len += arc_jl(BUF(buf, len), REG_LO(JIT_REG_TMP));
2353
	return len;
2354
}
2355

2356
/*
2357
 * This function determines which ARC registers must be saved and restored.
2358
 * It does so by looking into:
2359
 *
2360
 * "bpf_reg": The clobbered (destination) BPF register
2361
 * "is_call": Indicator if the current instruction is a call
2362
 *
2363
 * When a register of interest is clobbered, its corresponding bit position
2364
 * in return value, "usage", is set to true.
2365
 */
2366
u32 mask_for_used_regs(u8 bpf_reg, bool is_call)
2367
{
2368
	u32 usage = 0;
2369

2370
	/* BPF registers that must be saved. */
2371
	if (bpf_reg >= BPF_REG_6 && bpf_reg <= BPF_REG_9) {
2372
		usage |= BIT(REG_LO(bpf_reg));
2373
		usage |= BIT(REG_HI(bpf_reg));
2374
	/*
2375
	 * Using the frame pointer register implies that it should
2376
	 * be saved and reinitialised with the current frame data.
2377
	 */
2378
	} else if (bpf_reg == BPF_REG_FP) {
2379
		usage |= BIT(REG_LO(BPF_REG_FP));
2380
	/* Could there be some ARC registers that must to be saved? */
2381
	} else {
2382
		if (REG_LO(bpf_reg) >= ARC_CALLEE_SAVED_REG_FIRST &&
2383
		    REG_LO(bpf_reg) <= ARC_CALLEE_SAVED_REG_LAST)
2384
			usage |= BIT(REG_LO(bpf_reg));
2385

2386
		if (REG_HI(bpf_reg) >= ARC_CALLEE_SAVED_REG_FIRST &&
2387
		    REG_HI(bpf_reg) <= ARC_CALLEE_SAVED_REG_LAST)
2388
			usage |= BIT(REG_HI(bpf_reg));
2389
	}
2390

2391
	/* A "call" indicates that ARC's "blink" reg must be saved. */
2392
	usage |= is_call ? BIT(ARC_R_BLINK) : 0;
2393

2394
	return usage;
2395
}
2396

2397
/*
2398
 * push blink             # if blink is marked as clobbered
2399
 * push r[0-n]            # if r[i] is marked as clobbered
2400
 * push fp                # if fp is marked as clobbered
2401
 * mov  fp, sp            # if frame_size > 0 (clobbers fp)
2402
 * sub  sp, <frame_size>  # same as above
2403
 */
2404
u8 arc_prologue(u8 *buf, u32 usage, u16 frame_size)
2405
{
2406
	u8 len = 0;
2407
	u32 gp_regs = 0;
2408

2409
	/* Deal with blink first. */
2410
	if (usage & BIT(ARC_R_BLINK))
2411
		len += arc_push_r(BUF(buf, len), ARC_R_BLINK);
2412

2413
	gp_regs = usage & ~(BIT(ARC_R_BLINK) | BIT(ARC_R_FP));
2414
	while (gp_regs) {
2415
		u8 reg = __builtin_ffs(gp_regs) - 1;
2416

2417
		len += arc_push_r(BUF(buf, len), reg);
2418
		gp_regs &= ~BIT(reg);
2419
	}
2420

2421
	/* Deal with fp last. */
2422
	if ((usage & BIT(ARC_R_FP)) || frame_size > 0)
2423
		len += arc_push_r(BUF(buf, len), ARC_R_FP);
2424

2425
	if (frame_size > 0)
2426
		len += frame_create(BUF(buf, len), frame_size);
2427

2428
#ifdef ARC_BPF_JIT_DEBUG
2429
	if ((usage & BIT(ARC_R_FP)) && frame_size == 0) {
2430
		pr_err("FP is being saved while there is no frame.");
2431
		BUG();
2432
	}
2433
#endif
2434

2435
	return len;
2436
}
2437

2438
/*
2439
 * mov  sp, fp            # if frame_size > 0
2440
 * pop  fp                # if fp is marked as clobbered
2441
 * pop  r[n-0]            # if r[i] is marked as clobbered
2442
 * pop  blink             # if blink is marked as clobbered
2443
 * mov  r0, r8            # always: ABI_return <- BPF_return
2444
 * mov  r1, r9            # continuation of above
2445
 * j    [blink]           # always
2446
 *
2447
 * "fp being marked as clobbered" and "frame_size > 0" are the two sides of
2448
 * the same coin.
2449
 */
2450
u8 arc_epilogue(u8 *buf, u32 usage, u16 frame_size)
2451
{
2452
	u32 len = 0;
2453
	u32 gp_regs = 0;
2454

2455
#ifdef ARC_BPF_JIT_DEBUG
2456
	if ((usage & BIT(ARC_R_FP)) && frame_size == 0) {
2457
		pr_err("FP is being saved while there is no frame.");
2458
		BUG();
2459
	}
2460
#endif
2461

2462
	if (frame_size > 0)
2463
		len += frame_restore(BUF(buf, len));
2464

2465
	/* Deal with fp first. */
2466
	if ((usage & BIT(ARC_R_FP)) || frame_size > 0)
2467
		len += arc_pop_r(BUF(buf, len), ARC_R_FP);
2468

2469
	gp_regs = usage & ~(BIT(ARC_R_BLINK) | BIT(ARC_R_FP));
2470
	while (gp_regs) {
2471
		/* "usage" is 32-bit, each bit indicating an ARC register. */
2472
		u8 reg = 31 - __builtin_clz(gp_regs);
2473

2474
		len += arc_pop_r(BUF(buf, len), reg);
2475
		gp_regs &= ~BIT(reg);
2476
	}
2477

2478
	/* Deal with blink last. */
2479
	if (usage & BIT(ARC_R_BLINK))
2480
		len += arc_pop_r(BUF(buf, len), ARC_R_BLINK);
2481

2482
	/* Wrap up the return value and jump back to the caller. */
2483
	len += bpf_to_arc_return(BUF(buf, len));
2484
	len += arc_jmp_return(BUF(buf, len));
2485

2486
	return len;
2487
}
2488

2489
/*
2490
 * For details on the algorithm, see the comments of "gen_jcc_64()".
2491
 *
2492
 * This data structure is holding information for jump translations.
2493
 *
2494
 * jit_off: How many bytes into the current JIT address, "b"ranch insn. occurs
2495
 * cond: The condition that the ARC branch instruction must use
2496
 *
2497
 * e.g.:
2498
 *
2499
 * BPF_JGE  R1, R0, @target
2500
 * ------------------------
2501
 *            |
2502
 *            v
2503
 * 0x1000: cmp  r3, r1     # 0x1000 is the JIT address for "BPF_JGE ..." insn
2504
 * 0x1004: bhi  @target    # first jump (branch higher)
2505
 * 0x1008: blo  @end       # second jump acting as a skip (end is 0x1014)
2506
 * 0x100C: cmp  r2, r0     # the lower 32 bits are evaluated
2507
 * 0x1010: bhs  @target    # third jump (branch higher or same)
2508
 * 0x1014: ...
2509
 *
2510
 * The jit_off(set) of the "bhi" is 4 bytes.
2511
 * The cond(ition) for the "bhi" is "CC_great_u".
2512
 *
2513
 * The jit_off(set) is necessary for calculating the exact displacement
2514
 * to the "target" address:
2515
 *
2516
 * jit_address + jit_off(set) - @target
2517
 * 0x1000      + 4            - @target
2518
 */
2519
#define JCC64_NR_OF_JMPS 3	/* Number of jumps in jcc64 template. */
2520
#define JCC64_INSNS_TO_END 3	/* Number of insn. inclusive the 2nd jmp to end. */
2521
#define JCC64_SKIP_JMP 1	/* Index of the "skip" jump to "end". */
2522
static const struct {
2523
	/*
2524
	 * "jit_off" is common between all "jmp[]" and is coupled with
2525
	 * "cond" of each "jmp[]" instance. e.g.:
2526
	 *
2527
	 * arcv2_64_jccs.jit_off[1]
2528
	 * arcv2_64_jccs.jmp[ARC_CC_UGT].cond[1]
2529
	 *
2530
	 * Are indicating that the second jump in JITed code of "UGT"
2531
	 * is at offset "jit_off[1]" while its condition is "cond[1]".
2532
	 */
2533
	u8 jit_off[JCC64_NR_OF_JMPS];
2534

2535
	struct {
2536
		u8 cond[JCC64_NR_OF_JMPS];
2537
	} jmp[ARC_CC_SLE + 1];
2538
} arcv2_64_jccs = {
2539
	.jit_off = {
2540
		INSN_len_normal * 1,
2541
		INSN_len_normal * 2,
2542
		INSN_len_normal * 4
2543
	},
2544
	/*
2545
	 *   cmp  rd_hi, rs_hi
2546
	 *   bhi  @target         # 1: u>
2547
	 *   blo  @end            # 2: u<
2548
	 *   cmp  rd_lo, rs_lo
2549
	 *   bhi  @target         # 3: u>
2550
	 * end:
2551
	 */
2552
	.jmp[ARC_CC_UGT] = {
2553
		.cond = {CC_great_u, CC_less_u, CC_great_u}
2554
	},
2555
	/*
2556
	 *   cmp  rd_hi, rs_hi
2557
	 *   bhi  @target         # 1: u>
2558
	 *   blo  @end            # 2: u<
2559
	 *   cmp  rd_lo, rs_lo
2560
	 *   bhs  @target         # 3: u>=
2561
	 * end:
2562
	 */
2563
	.jmp[ARC_CC_UGE] = {
2564
		.cond = {CC_great_u, CC_less_u, CC_great_eq_u}
2565
	},
2566
	/*
2567
	 *   cmp  rd_hi, rs_hi
2568
	 *   blo  @target         # 1: u<
2569
	 *   bhi  @end            # 2: u>
2570
	 *   cmp  rd_lo, rs_lo
2571
	 *   blo  @target         # 3: u<
2572
	 * end:
2573
	 */
2574
	.jmp[ARC_CC_ULT] = {
2575
		.cond = {CC_less_u, CC_great_u, CC_less_u}
2576
	},
2577
	/*
2578
	 *   cmp  rd_hi, rs_hi
2579
	 *   blo  @target         # 1: u<
2580
	 *   bhi  @end            # 2: u>
2581
	 *   cmp  rd_lo, rs_lo
2582
	 *   bls  @target         # 3: u<=
2583
	 * end:
2584
	 */
2585
	.jmp[ARC_CC_ULE] = {
2586
		.cond = {CC_less_u, CC_great_u, CC_less_eq_u}
2587
	},
2588
	/*
2589
	 *   cmp  rd_hi, rs_hi
2590
	 *   bgt  @target         # 1: s>
2591
	 *   blt  @end            # 2: s<
2592
	 *   cmp  rd_lo, rs_lo
2593
	 *   bhi  @target         # 3: u>
2594
	 * end:
2595
	 */
2596
	.jmp[ARC_CC_SGT] = {
2597
		.cond = {CC_great_s, CC_less_s, CC_great_u}
2598
	},
2599
	/*
2600
	 *   cmp  rd_hi, rs_hi
2601
	 *   bgt  @target         # 1: s>
2602
	 *   blt  @end            # 2: s<
2603
	 *   cmp  rd_lo, rs_lo
2604
	 *   bhs  @target         # 3: u>=
2605
	 * end:
2606
	 */
2607
	.jmp[ARC_CC_SGE] = {
2608
		.cond = {CC_great_s, CC_less_s, CC_great_eq_u}
2609
	},
2610
	/*
2611
	 *   cmp  rd_hi, rs_hi
2612
	 *   blt  @target         # 1: s<
2613
	 *   bgt  @end            # 2: s>
2614
	 *   cmp  rd_lo, rs_lo
2615
	 *   blo  @target         # 3: u<
2616
	 * end:
2617
	 */
2618
	.jmp[ARC_CC_SLT] = {
2619
		.cond = {CC_less_s, CC_great_s, CC_less_u}
2620
	},
2621
	/*
2622
	 *   cmp  rd_hi, rs_hi
2623
	 *   blt  @target         # 1: s<
2624
	 *   bgt  @end            # 2: s>
2625
	 *   cmp  rd_lo, rs_lo
2626
	 *   bls  @target         # 3: u<=
2627
	 * end:
2628
	 */
2629
	.jmp[ARC_CC_SLE] = {
2630
		.cond = {CC_less_s, CC_great_s, CC_less_eq_u}
2631
	}
2632
};
2633

2634
/*
2635
 * The displacement (offset) for ARC's "b"ranch instruction is the distance
2636
 * from the aligned version of _current_ instruction (PCL) to the target
2637
 * instruction:
2638
 *
2639
 * DISP = TARGET - PCL          # PCL is the word aligned PC
2640
 */
2641
static inline s32 get_displacement(u32 curr_off, u32 targ_off)
2642
{
2643
	return (s32)(targ_off - (curr_off & ~3L));
2644
}
2645

2646
/*
2647
 * "disp"lacement should be:
2648
 *
2649
 * 1. 16-bit aligned.
2650
 * 2. fit in S25, because no "condition code" is supposed to be encoded.
2651
 */
2652
static inline bool is_valid_far_disp(s32 disp)
2653
{
2654
	return (!(disp & 1) && IN_S25_RANGE(disp));
2655
}
2656

2657
/*
2658
 * "disp"lacement should be:
2659
 *
2660
 * 1. 16-bit aligned.
2661
 * 2. fit in S21, because "condition code" is supposed to be encoded too.
2662
 */
2663
static inline bool is_valid_near_disp(s32 disp)
2664
{
2665
	return (!(disp & 1) && IN_S21_RANGE(disp));
2666
}
2667

2668
/*
2669
 * cmp        rd_hi, rs_hi
2670
 * cmp.z      rd_lo, rs_lo
2671
 * b{eq,ne}   @target
2672
 *   |  |
2673
 *   |  `-->  "eq" param is false (JNE)
2674
 *   `----->  "eq" param is true  (JEQ)
2675
 */
2676
static int gen_j_eq_64(u8 *buf, u8 rd, u8 rs, bool eq,
2677
		       u32 curr_off, u32 targ_off)
2678
{
2679
	s32 disp;
2680
	u8 len = 0;
2681

2682
	len += arc_cmp_r(BUF(buf, len), REG_HI(rd), REG_HI(rs));
2683
	len += arc_cmpz_r(BUF(buf, len), REG_LO(rd), REG_LO(rs));
2684
	disp = get_displacement(curr_off + len, targ_off);
2685
	len += arc_bcc(BUF(buf, len), eq ? CC_equal : CC_unequal, disp);
2686

2687
	return len;
2688
}
2689

2690
/*
2691
 * tst   rd_hi, rs_hi
2692
 * tst.z rd_lo, rs_lo
2693
 * bne   @target
2694
 */
2695
static u8 gen_jset_64(u8 *buf, u8 rd, u8 rs, u32 curr_off, u32 targ_off)
2696
{
2697
	u8 len = 0;
2698
	s32 disp;
2699

2700
	len += arc_tst_r(BUF(buf, len), REG_HI(rd), REG_HI(rs));
2701
	len += arc_tstz_r(BUF(buf, len), REG_LO(rd), REG_LO(rs));
2702
	disp = get_displacement(curr_off + len, targ_off);
2703
	len += arc_bcc(BUF(buf, len), CC_unequal, disp);
2704

2705
	return len;
2706
}
2707

2708
/*
2709
 * Verify if all the jumps for a JITed jcc64 operation are valid,
2710
 * by consulting the data stored at "arcv2_64_jccs".
2711
 */
2712
static bool check_jcc_64(u32 curr_off, u32 targ_off, u8 cond)
2713
{
2714
	size_t i;
2715

2716
	if (cond >= ARC_CC_LAST)
2717
		return false;
2718

2719
	for (i = 0; i < JCC64_NR_OF_JMPS; i++) {
2720
		u32 from, to;
2721

2722
		from = curr_off + arcv2_64_jccs.jit_off[i];
2723
		/* for the 2nd jump, we jump to the end of block. */
2724
		if (i != JCC64_SKIP_JMP)
2725
			to = targ_off;
2726
		else
2727
			to = from + (JCC64_INSNS_TO_END * INSN_len_normal);
2728
		/* There is a "cc" in the instruction, so a "near" jump. */
2729
		if (!is_valid_near_disp(get_displacement(from, to)))
2730
			return false;
2731
	}
2732

2733
	return true;
2734
}
2735

2736
/* Can the jump from "curr_off" to "targ_off" actually happen? */
2737
bool check_jmp_64(u32 curr_off, u32 targ_off, u8 cond)
2738
{
2739
	s32 disp;
2740

2741
	switch (cond) {
2742
	case ARC_CC_UGT:
2743
	case ARC_CC_UGE:
2744
	case ARC_CC_ULT:
2745
	case ARC_CC_ULE:
2746
	case ARC_CC_SGT:
2747
	case ARC_CC_SGE:
2748
	case ARC_CC_SLT:
2749
	case ARC_CC_SLE:
2750
		return check_jcc_64(curr_off, targ_off, cond);
2751
	case ARC_CC_EQ:
2752
	case ARC_CC_NE:
2753
	case ARC_CC_SET:
2754
		/*
2755
		 * The "jump" for the JITed BPF_J{SET,EQ,NE} is actually the
2756
		 * 3rd instruction. See comments of "gen_j{set,_eq}_64()".
2757
		 */
2758
		curr_off += 2 * INSN_len_normal;
2759
		disp = get_displacement(curr_off, targ_off);
2760
		/* There is a "cc" field in the issued instruction. */
2761
		return is_valid_near_disp(disp);
2762
	case ARC_CC_AL:
2763
		disp = get_displacement(curr_off, targ_off);
2764
		return is_valid_far_disp(disp);
2765
	default:
2766
		return false;
2767
	}
2768
}
2769

2770
/*
2771
 * The template for the 64-bit jumps with the following BPF conditions
2772
 *
2773
 * u< u<= u> u>= s< s<= s> s>=
2774
 *
2775
 * Looks like below:
2776
 *
2777
 *   cmp   rd_hi, rs_hi
2778
 *   b<c1> @target
2779
 *   b<c2> @end
2780
 *   cmp   rd_lo, rs_lo   # if execution reaches here, r{d,s}_hi are equal
2781
 *   b<c3> @target
2782
 * end:
2783
 *
2784
 * "c1" is the condition that JIT is handling minus the equality part.
2785
 * For instance if we have to translate an "unsigned greater or equal",
2786
 * then "c1" will be "unsigned greater". We won't know about equality
2787
 * until all 64-bits of data (higeher and lower registers) are processed.
2788
 *
2789
 * "c2" is the counter logic of "c1". For instance, if "c1" is originated
2790
 * from "s>", then "c2" would be "s<". Notice that equality doesn't play
2791
 * a role here either, because the lower 32 bits are not processed yet.
2792
 *
2793
 * "c3" is the unsigned version of "c1", no matter if the BPF condition
2794
 * was signed or unsigned. An unsigned version is necessary, because the
2795
 * MSB of the lower 32 bits does not reflect a sign in the whole 64-bit
2796
 * scheme. Otherwise, 64-bit comparisons like
2797
 * (0x0000_0000,0x8000_0000) s>= (0x0000_0000,0x0000_0000)
2798
 * would yield an incorrect result. Finally, if there is an equality
2799
 * check in the BPF condition, it will be reflected in "c3".
2800
 *
2801
 * You can find all the instances of this template where the
2802
 * "arcv2_64_jccs" is getting initialised.
2803
 */
2804
static u8 gen_jcc_64(u8 *buf, u8 rd, u8 rs, u8 cond,
2805
		     u32 curr_off, u32 targ_off)
2806
{
2807
	s32 disp;
2808
	u32 end_off;
2809
	const u8 *cc = arcv2_64_jccs.jmp[cond].cond;
2810
	u8 len = 0;
2811

2812
	/* cmp rd_hi, rs_hi */
2813
	len += arc_cmp_r(buf, REG_HI(rd), REG_HI(rs));
2814

2815
	/* b<c1> @target */
2816
	disp = get_displacement(curr_off + len, targ_off);
2817
	len += arc_bcc(BUF(buf, len), cc[0], disp);
2818

2819
	/* b<c2> @end */
2820
	end_off = curr_off + len + (JCC64_INSNS_TO_END * INSN_len_normal);
2821
	disp = get_displacement(curr_off + len, end_off);
2822
	len += arc_bcc(BUF(buf, len), cc[1], disp);
2823

2824
	/* cmp rd_lo, rs_lo */
2825
	len += arc_cmp_r(BUF(buf, len), REG_LO(rd), REG_LO(rs));
2826

2827
	/* b<c3> @target */
2828
	disp = get_displacement(curr_off + len, targ_off);
2829
	len += arc_bcc(BUF(buf, len), cc[2], disp);
2830

2831
	return len;
2832
}
2833

2834
/*
2835
 * This function only applies the necessary logic to make the proper
2836
 * translations. All the sanity checks must have already been done
2837
 * by calling the check_jmp_64().
2838
 */
2839
u8 gen_jmp_64(u8 *buf, u8 rd, u8 rs, u8 cond, u32 curr_off, u32 targ_off)
2840
{
2841
	u8 len = 0;
2842
	bool eq = false;
2843
	s32 disp;
2844

2845
	switch (cond) {
2846
	case ARC_CC_AL:
2847
		disp = get_displacement(curr_off, targ_off);
2848
		len = arc_b(buf, disp);
2849
		break;
2850
	case ARC_CC_UGT:
2851
	case ARC_CC_UGE:
2852
	case ARC_CC_ULT:
2853
	case ARC_CC_ULE:
2854
	case ARC_CC_SGT:
2855
	case ARC_CC_SGE:
2856
	case ARC_CC_SLT:
2857
	case ARC_CC_SLE:
2858
		len = gen_jcc_64(buf, rd, rs, cond, curr_off, targ_off);
2859
		break;
2860
	case ARC_CC_EQ:
2861
		eq = true;
2862
		fallthrough;
2863
	case ARC_CC_NE:
2864
		len = gen_j_eq_64(buf, rd, rs, eq, curr_off, targ_off);
2865
		break;
2866
	case ARC_CC_SET:
2867
		len = gen_jset_64(buf, rd, rs, curr_off, targ_off);
2868
		break;
2869
	default:
2870
#ifdef ARC_BPF_JIT_DEBUG
2871
		pr_err("64-bit jump condition is not known.");
2872
		BUG();
2873
#endif
2874
	}
2875
	return len;
2876
}
2877

2878
/*
2879
 * The condition codes to use when generating JIT instructions
2880
 * for 32-bit jumps.
2881
 *
2882
 * The "ARC_CC_AL" index is not really used by the code, but it
2883
 * is here for the sake of completeness.
2884
 *
2885
 * The "ARC_CC_SET" becomes "CC_unequal" because of the "tst"
2886
 * instruction that precedes the conditional branch.
2887
 */
2888
static const u8 arcv2_32_jmps[ARC_CC_LAST] = {
2889
	[ARC_CC_UGT] = CC_great_u,
2890
	[ARC_CC_UGE] = CC_great_eq_u,
2891
	[ARC_CC_ULT] = CC_less_u,
2892
	[ARC_CC_ULE] = CC_less_eq_u,
2893
	[ARC_CC_SGT] = CC_great_s,
2894
	[ARC_CC_SGE] = CC_great_eq_s,
2895
	[ARC_CC_SLT] = CC_less_s,
2896
	[ARC_CC_SLE] = CC_less_eq_s,
2897
	[ARC_CC_AL]  = CC_always,
2898
	[ARC_CC_EQ]  = CC_equal,
2899
	[ARC_CC_NE]  = CC_unequal,
2900
	[ARC_CC_SET] = CC_unequal
2901
};
2902

2903
/* Can the jump from "curr_off" to "targ_off" actually happen? */
2904
bool check_jmp_32(u32 curr_off, u32 targ_off, u8 cond)
2905
{
2906
	u8 addendum;
2907
	s32 disp;
2908

2909
	if (cond >= ARC_CC_LAST)
2910
		return false;
2911

2912
	/*
2913
	 * The unconditional jump happens immediately, while the rest
2914
	 * are either preceded by a "cmp" or "tst" instruction.
2915
	 */
2916
	addendum = (cond == ARC_CC_AL) ? 0 : INSN_len_normal;
2917
	disp = get_displacement(curr_off + addendum, targ_off);
2918

2919
	if (cond == ARC_CC_AL)
2920
		return is_valid_far_disp(disp);
2921
	else
2922
		return is_valid_near_disp(disp);
2923
}
2924

2925
/*
2926
 * The JITed code for 32-bit (conditional) branches:
2927
 *
2928
 * ARC_CC_AL  @target
2929
 *   b @jit_targ_addr
2930
 *
2931
 * ARC_CC_SET rd, rs, @target
2932
 *   tst rd, rs
2933
 *   bnz @jit_targ_addr
2934
 *
2935
 * ARC_CC_xx  rd, rs, @target
2936
 *   cmp rd, rs
2937
 *   b<cc> @jit_targ_addr            # cc = arcv2_32_jmps[xx]
2938
 */
2939
u8 gen_jmp_32(u8 *buf, u8 rd, u8 rs, u8 cond, u32 curr_off, u32 targ_off)
2940
{
2941
	s32 disp;
2942
	u8 len = 0;
2943

2944
	/*
2945
	 * Although this must have already been checked by "check_jmp_32()",
2946
	 * we're not going to risk accessing "arcv2_32_jmps" array without
2947
	 * the boundary check.
2948
	 */
2949
	if (cond >= ARC_CC_LAST) {
2950
#ifdef ARC_BPF_JIT_DEBUG
2951
		pr_err("32-bit jump condition is not known.");
2952
		BUG();
2953
#endif
2954
		return 0;
2955
	}
2956

2957
	/* If there is a "condition", issue the "cmp" or "tst" first. */
2958
	if (cond != ARC_CC_AL) {
2959
		if (cond == ARC_CC_SET)
2960
			len = tst_r32(buf, rd, rs);
2961
		else
2962
			len = cmp_r32(buf, rd, rs);
2963
		/*
2964
		 * The issued instruction affects the "disp"lacement as
2965
		 * it alters the "curr_off" by its "len"gth. The "curr_off"
2966
		 * should always point to the jump instruction.
2967
		 */
2968
		disp = get_displacement(curr_off + len, targ_off);
2969
		len += arc_bcc(BUF(buf, len), arcv2_32_jmps[cond], disp);
2970
	} else {
2971
		/* The straight forward unconditional jump. */
2972
		disp = get_displacement(curr_off, targ_off);
2973
		len = arc_b(buf, disp);
2974
	}
2975

2976
	return len;
2977
}
2978

2979
/*
2980
 * Generate code for functions calls. There can be two types of calls:
2981
 *
2982
 * - Calling another BPF function
2983
 * - Calling an in-kernel function which is compiled by ARC gcc
2984
 *
2985
 * In the later case, we must comply to ARCv2 ABI and handle arguments
2986
 * and return values accordingly.
2987
 */
2988
u8 gen_func_call(u8 *buf, ARC_ADDR func_addr, bool external_func)
2989
{
2990
	u8 len = 0;
2991

2992
	/*
2993
	 * In case of an in-kernel function call, always push the 5th
2994
	 * argument onto the stack, because that's where the ABI dictates
2995
	 * it should be found. If the callee doesn't really use it, no harm
2996
	 * is done. The stack is readjusted either way after the call.
2997
	 */
2998
	if (external_func)
2999
		len += push_r64(BUF(buf, len), BPF_REG_5);
3000

3001
	len += jump_and_link(BUF(buf, len), func_addr);
3002

3003
	if (external_func)
3004
		len += arc_add_i(BUF(buf, len), ARC_R_SP, ARC_R_SP, ARG5_SIZE);
3005

3006
	return len;
3007
}
3008

3009
Product

Resources

Company