Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/zstd/lib/decompress/huf_decompress_amd64.S
48375 views
1
/*
2
* Copyright (c) Facebook, Inc.
3
* All rights reserved.
4
*
5
* This source code is licensed under both the BSD-style license (found in the
6
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
* in the COPYING file in the root directory of this source tree).
8
* You may select, at your option, one of the above-listed licenses.
9
*/
10
11
#include "../common/portability_macros.h"
12
13
/* Stack marking
14
* ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
15
*/
16
#if defined(__ELF__) && defined(__GNUC__)
17
.section .note.GNU-stack,"",%progbits
18
#endif
19
20
#if ZSTD_ENABLE_ASM_X86_64_BMI2
21
22
/* Calling convention:
23
*
24
* %rdi contains the first argument: HUF_DecompressAsmArgs*.
25
* %rbp isn't maintained (no frame pointer).
26
* %rsp contains the stack pointer that grows down.
27
* No red-zone is assumed, only addresses >= %rsp are used.
28
* All register contents are preserved.
29
*
30
* TODO: Support Windows calling convention.
31
*/
32
33
ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
34
ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
35
ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
36
ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
37
.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
38
.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
39
.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
40
.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
41
.text
42
43
/* Sets up register mappings for clarity.
44
* op[], bits[], dtable & ip[0] each get their own register.
45
* ip[1,2,3] & olimit alias var[].
46
* %rax is a scratch register.
47
*/
48
49
#define op0 rsi
50
#define op1 rbx
51
#define op2 rcx
52
#define op3 rdi
53
54
#define ip0 r8
55
#define ip1 r9
56
#define ip2 r10
57
#define ip3 r11
58
59
#define bits0 rbp
60
#define bits1 rdx
61
#define bits2 r12
62
#define bits3 r13
63
#define dtable r14
64
#define olimit r15
65
66
/* var[] aliases ip[1,2,3] & olimit
67
* ip[1,2,3] are saved every iteration.
68
* olimit is only used in compute_olimit.
69
*/
70
#define var0 r15
71
#define var1 r9
72
#define var2 r10
73
#define var3 r11
74
75
/* 32-bit var registers */
76
#define vard0 r15d
77
#define vard1 r9d
78
#define vard2 r10d
79
#define vard3 r11d
80
81
/* Calls X(N) for each stream 0, 1, 2, 3. */
82
#define FOR_EACH_STREAM(X) \
83
X(0); \
84
X(1); \
85
X(2); \
86
X(3)
87
88
/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
89
#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
90
X(0, idx); \
91
X(1, idx); \
92
X(2, idx); \
93
X(3, idx)
94
95
/* Define both _HUF_* & HUF_* symbols because MacOS
96
* C symbols are prefixed with '_' & Linux symbols aren't.
97
*/
98
_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
99
HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
100
/* Save all registers - even if they are callee saved for simplicity. */
101
push %rax
102
push %rbx
103
push %rcx
104
push %rdx
105
push %rbp
106
push %rsi
107
push %rdi
108
push %r8
109
push %r9
110
push %r10
111
push %r11
112
push %r12
113
push %r13
114
push %r14
115
push %r15
116
117
/* Read HUF_DecompressAsmArgs* args from %rax */
118
movq %rdi, %rax
119
movq 0(%rax), %ip0
120
movq 8(%rax), %ip1
121
movq 16(%rax), %ip2
122
movq 24(%rax), %ip3
123
movq 32(%rax), %op0
124
movq 40(%rax), %op1
125
movq 48(%rax), %op2
126
movq 56(%rax), %op3
127
movq 64(%rax), %bits0
128
movq 72(%rax), %bits1
129
movq 80(%rax), %bits2
130
movq 88(%rax), %bits3
131
movq 96(%rax), %dtable
132
push %rax /* argument */
133
push 104(%rax) /* ilimit */
134
push 112(%rax) /* oend */
135
push %olimit /* olimit space */
136
137
subq $24, %rsp
138
139
.L_4X1_compute_olimit:
140
/* Computes how many iterations we can do safely
141
* %r15, %rax may be clobbered
142
* rbx, rdx must be saved
143
* op3 & ip0 mustn't be clobbered
144
*/
145
movq %rbx, 0(%rsp)
146
movq %rdx, 8(%rsp)
147
148
movq 32(%rsp), %rax /* rax = oend */
149
subq %op3, %rax /* rax = oend - op3 */
150
151
/* r15 = (oend - op3) / 5 */
152
movabsq $-3689348814741910323, %rdx
153
mulq %rdx
154
movq %rdx, %r15
155
shrq $2, %r15
156
157
movq %ip0, %rax /* rax = ip0 */
158
movq 40(%rsp), %rdx /* rdx = ilimit */
159
subq %rdx, %rax /* rax = ip0 - ilimit */
160
movq %rax, %rbx /* rbx = ip0 - ilimit */
161
162
/* rdx = (ip0 - ilimit) / 7 */
163
movabsq $2635249153387078803, %rdx
164
mulq %rdx
165
subq %rdx, %rbx
166
shrq %rbx
167
addq %rbx, %rdx
168
shrq $2, %rdx
169
170
/* r15 = min(%rdx, %r15) */
171
cmpq %rdx, %r15
172
cmova %rdx, %r15
173
174
/* r15 = r15 * 5 */
175
leaq (%r15, %r15, 4), %r15
176
177
/* olimit = op3 + r15 */
178
addq %op3, %olimit
179
180
movq 8(%rsp), %rdx
181
movq 0(%rsp), %rbx
182
183
/* If (op3 + 20 > olimit) */
184
movq %op3, %rax /* rax = op3 */
185
addq $20, %rax /* rax = op3 + 20 */
186
cmpq %rax, %olimit /* op3 + 20 > olimit */
187
jb .L_4X1_exit
188
189
/* If (ip1 < ip0) go to exit */
190
cmpq %ip0, %ip1
191
jb .L_4X1_exit
192
193
/* If (ip2 < ip1) go to exit */
194
cmpq %ip1, %ip2
195
jb .L_4X1_exit
196
197
/* If (ip3 < ip2) go to exit */
198
cmpq %ip2, %ip3
199
jb .L_4X1_exit
200
201
/* Reads top 11 bits from bits[n]
202
* Loads dt[bits[n]] into var[n]
203
*/
204
#define GET_NEXT_DELT(n) \
205
movq $53, %var##n; \
206
shrxq %var##n, %bits##n, %var##n; \
207
movzwl (%dtable,%var##n,2),%vard##n
208
209
/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
210
* Moves var[n] to %rax
211
* bits[n] <<= var[n] & 63
212
* op[n][idx] = %rax >> 8
213
* %ah is a way to access bits [8, 16) of %rax
214
*/
215
#define DECODE_FROM_DELT(n, idx) \
216
movq %var##n, %rax; \
217
shlxq %var##n, %bits##n, %bits##n; \
218
movb %ah, idx(%op##n)
219
220
/* Assumes GET_NEXT_DELT has been called.
221
* Calls DECODE_FROM_DELT then GET_NEXT_DELT
222
*/
223
#define DECODE_AND_GET_NEXT(n, idx) \
224
DECODE_FROM_DELT(n, idx); \
225
GET_NEXT_DELT(n) \
226
227
/* // ctz & nbBytes is stored in bits[n]
228
* // nbBits is stored in %rax
229
* ctz = CTZ[bits[n]]
230
* nbBits = ctz & 7
231
* nbBytes = ctz >> 3
232
* op[n] += 5
233
* ip[n] -= nbBytes
234
* // Note: x86-64 is little-endian ==> no bswap
235
* bits[n] = MEM_readST(ip[n]) | 1
236
* bits[n] <<= nbBits
237
*/
238
#define RELOAD_BITS(n) \
239
bsfq %bits##n, %bits##n; \
240
movq %bits##n, %rax; \
241
andq $7, %rax; \
242
shrq $3, %bits##n; \
243
leaq 5(%op##n), %op##n; \
244
subq %bits##n, %ip##n; \
245
movq (%ip##n), %bits##n; \
246
orq $1, %bits##n; \
247
shlx %rax, %bits##n, %bits##n
248
249
/* Store clobbered variables on the stack */
250
movq %olimit, 24(%rsp)
251
movq %ip1, 0(%rsp)
252
movq %ip2, 8(%rsp)
253
movq %ip3, 16(%rsp)
254
255
/* Call GET_NEXT_DELT for each stream */
256
FOR_EACH_STREAM(GET_NEXT_DELT)
257
258
.p2align 6
259
260
.L_4X1_loop_body:
261
/* Decode 5 symbols in each of the 4 streams (20 total)
262
* Must have called GET_NEXT_DELT for each stream
263
*/
264
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
265
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
266
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
267
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
268
FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
269
270
/* Load ip[1,2,3] from stack (var[] aliases them)
271
* ip[] is needed for RELOAD_BITS
272
* Each will be stored back to the stack after RELOAD
273
*/
274
movq 0(%rsp), %ip1
275
movq 8(%rsp), %ip2
276
movq 16(%rsp), %ip3
277
278
/* Reload each stream & fetch the next table entry
279
* to prepare for the next iteration
280
*/
281
RELOAD_BITS(0)
282
GET_NEXT_DELT(0)
283
284
RELOAD_BITS(1)
285
movq %ip1, 0(%rsp)
286
GET_NEXT_DELT(1)
287
288
RELOAD_BITS(2)
289
movq %ip2, 8(%rsp)
290
GET_NEXT_DELT(2)
291
292
RELOAD_BITS(3)
293
movq %ip3, 16(%rsp)
294
GET_NEXT_DELT(3)
295
296
/* If op3 < olimit: continue the loop */
297
cmp %op3, 24(%rsp)
298
ja .L_4X1_loop_body
299
300
/* Reload ip[1,2,3] from stack */
301
movq 0(%rsp), %ip1
302
movq 8(%rsp), %ip2
303
movq 16(%rsp), %ip3
304
305
/* Re-compute olimit */
306
jmp .L_4X1_compute_olimit
307
308
#undef GET_NEXT_DELT
309
#undef DECODE_FROM_DELT
310
#undef DECODE
311
#undef RELOAD_BITS
312
.L_4X1_exit:
313
addq $24, %rsp
314
315
/* Restore stack (oend & olimit) */
316
pop %rax /* olimit */
317
pop %rax /* oend */
318
pop %rax /* ilimit */
319
pop %rax /* arg */
320
321
/* Save ip / op / bits */
322
movq %ip0, 0(%rax)
323
movq %ip1, 8(%rax)
324
movq %ip2, 16(%rax)
325
movq %ip3, 24(%rax)
326
movq %op0, 32(%rax)
327
movq %op1, 40(%rax)
328
movq %op2, 48(%rax)
329
movq %op3, 56(%rax)
330
movq %bits0, 64(%rax)
331
movq %bits1, 72(%rax)
332
movq %bits2, 80(%rax)
333
movq %bits3, 88(%rax)
334
335
/* Restore registers */
336
pop %r15
337
pop %r14
338
pop %r13
339
pop %r12
340
pop %r11
341
pop %r10
342
pop %r9
343
pop %r8
344
pop %rdi
345
pop %rsi
346
pop %rbp
347
pop %rdx
348
pop %rcx
349
pop %rbx
350
pop %rax
351
ret
352
353
_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
354
HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
355
/* Save all registers - even if they are callee saved for simplicity. */
356
push %rax
357
push %rbx
358
push %rcx
359
push %rdx
360
push %rbp
361
push %rsi
362
push %rdi
363
push %r8
364
push %r9
365
push %r10
366
push %r11
367
push %r12
368
push %r13
369
push %r14
370
push %r15
371
372
movq %rdi, %rax
373
movq 0(%rax), %ip0
374
movq 8(%rax), %ip1
375
movq 16(%rax), %ip2
376
movq 24(%rax), %ip3
377
movq 32(%rax), %op0
378
movq 40(%rax), %op1
379
movq 48(%rax), %op2
380
movq 56(%rax), %op3
381
movq 64(%rax), %bits0
382
movq 72(%rax), %bits1
383
movq 80(%rax), %bits2
384
movq 88(%rax), %bits3
385
movq 96(%rax), %dtable
386
push %rax /* argument */
387
push %rax /* olimit */
388
push 104(%rax) /* ilimit */
389
390
movq 112(%rax), %rax
391
push %rax /* oend3 */
392
393
movq %op3, %rax
394
push %rax /* oend2 */
395
396
movq %op2, %rax
397
push %rax /* oend1 */
398
399
movq %op1, %rax
400
push %rax /* oend0 */
401
402
/* Scratch space */
403
subq $8, %rsp
404
405
.L_4X2_compute_olimit:
406
/* Computes how many iterations we can do safely
407
* %r15, %rax may be clobbered
408
* rdx must be saved
409
* op[1,2,3,4] & ip0 mustn't be clobbered
410
*/
411
movq %rdx, 0(%rsp)
412
413
/* We can consume up to 7 input bytes each iteration. */
414
movq %ip0, %rax /* rax = ip0 */
415
movq 40(%rsp), %rdx /* rdx = ilimit */
416
subq %rdx, %rax /* rax = ip0 - ilimit */
417
movq %rax, %r15 /* r15 = ip0 - ilimit */
418
419
/* rdx = rax / 7 */
420
movabsq $2635249153387078803, %rdx
421
mulq %rdx
422
subq %rdx, %r15
423
shrq %r15
424
addq %r15, %rdx
425
shrq $2, %rdx
426
427
/* r15 = (ip0 - ilimit) / 7 */
428
movq %rdx, %r15
429
430
movabsq $-3689348814741910323, %rdx
431
movq 8(%rsp), %rax /* rax = oend0 */
432
subq %op0, %rax /* rax = oend0 - op0 */
433
mulq %rdx
434
shrq $3, %rdx /* rdx = rax / 10 */
435
436
/* r15 = min(%rdx, %r15) */
437
cmpq %rdx, %r15
438
cmova %rdx, %r15
439
440
movabsq $-3689348814741910323, %rdx
441
movq 16(%rsp), %rax /* rax = oend1 */
442
subq %op1, %rax /* rax = oend1 - op1 */
443
mulq %rdx
444
shrq $3, %rdx /* rdx = rax / 10 */
445
446
/* r15 = min(%rdx, %r15) */
447
cmpq %rdx, %r15
448
cmova %rdx, %r15
449
450
movabsq $-3689348814741910323, %rdx
451
movq 24(%rsp), %rax /* rax = oend2 */
452
subq %op2, %rax /* rax = oend2 - op2 */
453
mulq %rdx
454
shrq $3, %rdx /* rdx = rax / 10 */
455
456
/* r15 = min(%rdx, %r15) */
457
cmpq %rdx, %r15
458
cmova %rdx, %r15
459
460
movabsq $-3689348814741910323, %rdx
461
movq 32(%rsp), %rax /* rax = oend3 */
462
subq %op3, %rax /* rax = oend3 - op3 */
463
mulq %rdx
464
shrq $3, %rdx /* rdx = rax / 10 */
465
466
/* r15 = min(%rdx, %r15) */
467
cmpq %rdx, %r15
468
cmova %rdx, %r15
469
470
/* olimit = op3 + 5 * r15 */
471
movq %r15, %rax
472
leaq (%op3, %rax, 4), %olimit
473
addq %rax, %olimit
474
475
movq 0(%rsp), %rdx
476
477
/* If (op3 + 10 > olimit) */
478
movq %op3, %rax /* rax = op3 */
479
addq $10, %rax /* rax = op3 + 10 */
480
cmpq %rax, %olimit /* op3 + 10 > olimit */
481
jb .L_4X2_exit
482
483
/* If (ip1 < ip0) go to exit */
484
cmpq %ip0, %ip1
485
jb .L_4X2_exit
486
487
/* If (ip2 < ip1) go to exit */
488
cmpq %ip1, %ip2
489
jb .L_4X2_exit
490
491
/* If (ip3 < ip2) go to exit */
492
cmpq %ip2, %ip3
493
jb .L_4X2_exit
494
495
#define DECODE(n, idx) \
496
movq %bits##n, %rax; \
497
shrq $53, %rax; \
498
movzwl 0(%dtable,%rax,4),%r8d; \
499
movzbl 2(%dtable,%rax,4),%r15d; \
500
movzbl 3(%dtable,%rax,4),%eax; \
501
movw %r8w, (%op##n); \
502
shlxq %r15, %bits##n, %bits##n; \
503
addq %rax, %op##n
504
505
#define RELOAD_BITS(n) \
506
bsfq %bits##n, %bits##n; \
507
movq %bits##n, %rax; \
508
shrq $3, %bits##n; \
509
andq $7, %rax; \
510
subq %bits##n, %ip##n; \
511
movq (%ip##n), %bits##n; \
512
orq $1, %bits##n; \
513
shlxq %rax, %bits##n, %bits##n
514
515
516
movq %olimit, 48(%rsp)
517
518
.p2align 6
519
520
.L_4X2_loop_body:
521
/* We clobber r8, so store it on the stack */
522
movq %r8, 0(%rsp)
523
524
/* Decode 5 symbols from each of the 4 streams (20 symbols total). */
525
FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
526
FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
527
FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
528
FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
529
FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
530
531
/* Reload r8 */
532
movq 0(%rsp), %r8
533
534
FOR_EACH_STREAM(RELOAD_BITS)
535
536
cmp %op3, 48(%rsp)
537
ja .L_4X2_loop_body
538
jmp .L_4X2_compute_olimit
539
540
#undef DECODE
541
#undef RELOAD_BITS
542
.L_4X2_exit:
543
addq $8, %rsp
544
/* Restore stack (oend & olimit) */
545
pop %rax /* oend0 */
546
pop %rax /* oend1 */
547
pop %rax /* oend2 */
548
pop %rax /* oend3 */
549
pop %rax /* ilimit */
550
pop %rax /* olimit */
551
pop %rax /* arg */
552
553
/* Save ip / op / bits */
554
movq %ip0, 0(%rax)
555
movq %ip1, 8(%rax)
556
movq %ip2, 16(%rax)
557
movq %ip3, 24(%rax)
558
movq %op0, 32(%rax)
559
movq %op1, 40(%rax)
560
movq %op2, 48(%rax)
561
movq %op3, 56(%rax)
562
movq %bits0, 64(%rax)
563
movq %bits1, 72(%rax)
564
movq %bits2, 80(%rax)
565
movq %bits3, 88(%rax)
566
567
/* Restore registers */
568
pop %r15
569
pop %r14
570
pop %r13
571
pop %r12
572
pop %r11
573
pop %r10
574
pop %r9
575
pop %r8
576
pop %rdi
577
pop %rsi
578
pop %rbp
579
pop %rdx
580
pop %rcx
581
pop %rbx
582
pop %rax
583
ret
584
585
#endif
586
587