Path: blob/main/sys/contrib/zstd/lib/decompress/huf_decompress_amd64.S
48375 views
/*1* Copyright (c) Facebook, Inc.2* All rights reserved.3*4* This source code is licensed under both the BSD-style license (found in the5* LICENSE file in the root directory of this source tree) and the GPLv2 (found6* in the COPYING file in the root directory of this source tree).7* You may select, at your option, one of the above-listed licenses.8*/910#include "../common/portability_macros.h"1112/* Stack marking13* ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart14*/15#if defined(__ELF__) && defined(__GNUC__)16.section .note.GNU-stack,"",%progbits17#endif1819#if ZSTD_ENABLE_ASM_X86_64_BMI22021/* Calling convention:22*23* %rdi contains the first argument: HUF_DecompressAsmArgs*.24* %rbp isn't maintained (no frame pointer).25* %rsp contains the stack pointer that grows down.26* No red-zone is assumed, only addresses >= %rsp are used.27* All register contents are preserved.28*29* TODO: Support Windows calling convention.30*/3132ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)33ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)34ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)35ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)36.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop37.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop38.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop39.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop40.text4142/* Sets up register mappings for clarity.43* op[], bits[], dtable & ip[0] each get their own register.44* ip[1,2,3] & olimit alias var[].45* %rax is a scratch register.46*/4748#define op0 rsi49#define op1 rbx50#define op2 rcx51#define op3 rdi5253#define ip0 r854#define ip1 r955#define ip2 r1056#define ip3 r115758#define bits0 rbp59#define bits1 rdx60#define bits2 r1261#define bits3 r1362#define dtable r1463#define olimit r156465/* var[] aliases ip[1,2,3] & olimit66* ip[1,2,3] are saved every iteration.67* olimit is only used in compute_olimit.68*/69#define var0 r1570#define var1 r971#define var2 r1072#define var3 r117374/* 32-bit var registers */75#define vard0 r15d76#define vard1 r9d77#define vard2 r10d78#define vard3 r11d7980/* Calls X(N) for each stream 0, 1, 2, 3. */81#define FOR_EACH_STREAM(X) \82X(0); \83X(1); \84X(2); \85X(3)8687/* Calls X(N, idx) for each stream 0, 1, 2, 3. */88#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \89X(0, idx); \90X(1, idx); \91X(2, idx); \92X(3, idx)9394/* Define both _HUF_* & HUF_* symbols because MacOS95* C symbols are prefixed with '_' & Linux symbols aren't.96*/97_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:98HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:99/* Save all registers - even if they are callee saved for simplicity. */100push %rax101push %rbx102push %rcx103push %rdx104push %rbp105push %rsi106push %rdi107push %r8108push %r9109push %r10110push %r11111push %r12112push %r13113push %r14114push %r15115116/* Read HUF_DecompressAsmArgs* args from %rax */117movq %rdi, %rax118movq 0(%rax), %ip0119movq 8(%rax), %ip1120movq 16(%rax), %ip2121movq 24(%rax), %ip3122movq 32(%rax), %op0123movq 40(%rax), %op1124movq 48(%rax), %op2125movq 56(%rax), %op3126movq 64(%rax), %bits0127movq 72(%rax), %bits1128movq 80(%rax), %bits2129movq 88(%rax), %bits3130movq 96(%rax), %dtable131push %rax /* argument */132push 104(%rax) /* ilimit */133push 112(%rax) /* oend */134push %olimit /* olimit space */135136subq $24, %rsp137138.L_4X1_compute_olimit:139/* Computes how many iterations we can do safely140* %r15, %rax may be clobbered141* rbx, rdx must be saved142* op3 & ip0 mustn't be clobbered143*/144movq %rbx, 0(%rsp)145movq %rdx, 8(%rsp)146147movq 32(%rsp), %rax /* rax = oend */148subq %op3, %rax /* rax = oend - op3 */149150/* r15 = (oend - op3) / 5 */151movabsq $-3689348814741910323, %rdx152mulq %rdx153movq %rdx, %r15154shrq $2, %r15155156movq %ip0, %rax /* rax = ip0 */157movq 40(%rsp), %rdx /* rdx = ilimit */158subq %rdx, %rax /* rax = ip0 - ilimit */159movq %rax, %rbx /* rbx = ip0 - ilimit */160161/* rdx = (ip0 - ilimit) / 7 */162movabsq $2635249153387078803, %rdx163mulq %rdx164subq %rdx, %rbx165shrq %rbx166addq %rbx, %rdx167shrq $2, %rdx168169/* r15 = min(%rdx, %r15) */170cmpq %rdx, %r15171cmova %rdx, %r15172173/* r15 = r15 * 5 */174leaq (%r15, %r15, 4), %r15175176/* olimit = op3 + r15 */177addq %op3, %olimit178179movq 8(%rsp), %rdx180movq 0(%rsp), %rbx181182/* If (op3 + 20 > olimit) */183movq %op3, %rax /* rax = op3 */184addq $20, %rax /* rax = op3 + 20 */185cmpq %rax, %olimit /* op3 + 20 > olimit */186jb .L_4X1_exit187188/* If (ip1 < ip0) go to exit */189cmpq %ip0, %ip1190jb .L_4X1_exit191192/* If (ip2 < ip1) go to exit */193cmpq %ip1, %ip2194jb .L_4X1_exit195196/* If (ip3 < ip2) go to exit */197cmpq %ip2, %ip3198jb .L_4X1_exit199200/* Reads top 11 bits from bits[n]201* Loads dt[bits[n]] into var[n]202*/203#define GET_NEXT_DELT(n) \204movq $53, %var##n; \205shrxq %var##n, %bits##n, %var##n; \206movzwl (%dtable,%var##n,2),%vard##n207208/* var[n] must contain the DTable entry computed with GET_NEXT_DELT209* Moves var[n] to %rax210* bits[n] <<= var[n] & 63211* op[n][idx] = %rax >> 8212* %ah is a way to access bits [8, 16) of %rax213*/214#define DECODE_FROM_DELT(n, idx) \215movq %var##n, %rax; \216shlxq %var##n, %bits##n, %bits##n; \217movb %ah, idx(%op##n)218219/* Assumes GET_NEXT_DELT has been called.220* Calls DECODE_FROM_DELT then GET_NEXT_DELT221*/222#define DECODE_AND_GET_NEXT(n, idx) \223DECODE_FROM_DELT(n, idx); \224GET_NEXT_DELT(n) \225226/* // ctz & nbBytes is stored in bits[n]227* // nbBits is stored in %rax228* ctz = CTZ[bits[n]]229* nbBits = ctz & 7230* nbBytes = ctz >> 3231* op[n] += 5232* ip[n] -= nbBytes233* // Note: x86-64 is little-endian ==> no bswap234* bits[n] = MEM_readST(ip[n]) | 1235* bits[n] <<= nbBits236*/237#define RELOAD_BITS(n) \238bsfq %bits##n, %bits##n; \239movq %bits##n, %rax; \240andq $7, %rax; \241shrq $3, %bits##n; \242leaq 5(%op##n), %op##n; \243subq %bits##n, %ip##n; \244movq (%ip##n), %bits##n; \245orq $1, %bits##n; \246shlx %rax, %bits##n, %bits##n247248/* Store clobbered variables on the stack */249movq %olimit, 24(%rsp)250movq %ip1, 0(%rsp)251movq %ip2, 8(%rsp)252movq %ip3, 16(%rsp)253254/* Call GET_NEXT_DELT for each stream */255FOR_EACH_STREAM(GET_NEXT_DELT)256257.p2align 6258259.L_4X1_loop_body:260/* Decode 5 symbols in each of the 4 streams (20 total)261* Must have called GET_NEXT_DELT for each stream262*/263FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)264FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)265FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)266FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)267FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)268269/* Load ip[1,2,3] from stack (var[] aliases them)270* ip[] is needed for RELOAD_BITS271* Each will be stored back to the stack after RELOAD272*/273movq 0(%rsp), %ip1274movq 8(%rsp), %ip2275movq 16(%rsp), %ip3276277/* Reload each stream & fetch the next table entry278* to prepare for the next iteration279*/280RELOAD_BITS(0)281GET_NEXT_DELT(0)282283RELOAD_BITS(1)284movq %ip1, 0(%rsp)285GET_NEXT_DELT(1)286287RELOAD_BITS(2)288movq %ip2, 8(%rsp)289GET_NEXT_DELT(2)290291RELOAD_BITS(3)292movq %ip3, 16(%rsp)293GET_NEXT_DELT(3)294295/* If op3 < olimit: continue the loop */296cmp %op3, 24(%rsp)297ja .L_4X1_loop_body298299/* Reload ip[1,2,3] from stack */300movq 0(%rsp), %ip1301movq 8(%rsp), %ip2302movq 16(%rsp), %ip3303304/* Re-compute olimit */305jmp .L_4X1_compute_olimit306307#undef GET_NEXT_DELT308#undef DECODE_FROM_DELT309#undef DECODE310#undef RELOAD_BITS311.L_4X1_exit:312addq $24, %rsp313314/* Restore stack (oend & olimit) */315pop %rax /* olimit */316pop %rax /* oend */317pop %rax /* ilimit */318pop %rax /* arg */319320/* Save ip / op / bits */321movq %ip0, 0(%rax)322movq %ip1, 8(%rax)323movq %ip2, 16(%rax)324movq %ip3, 24(%rax)325movq %op0, 32(%rax)326movq %op1, 40(%rax)327movq %op2, 48(%rax)328movq %op3, 56(%rax)329movq %bits0, 64(%rax)330movq %bits1, 72(%rax)331movq %bits2, 80(%rax)332movq %bits3, 88(%rax)333334/* Restore registers */335pop %r15336pop %r14337pop %r13338pop %r12339pop %r11340pop %r10341pop %r9342pop %r8343pop %rdi344pop %rsi345pop %rbp346pop %rdx347pop %rcx348pop %rbx349pop %rax350ret351352_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:353HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:354/* Save all registers - even if they are callee saved for simplicity. */355push %rax356push %rbx357push %rcx358push %rdx359push %rbp360push %rsi361push %rdi362push %r8363push %r9364push %r10365push %r11366push %r12367push %r13368push %r14369push %r15370371movq %rdi, %rax372movq 0(%rax), %ip0373movq 8(%rax), %ip1374movq 16(%rax), %ip2375movq 24(%rax), %ip3376movq 32(%rax), %op0377movq 40(%rax), %op1378movq 48(%rax), %op2379movq 56(%rax), %op3380movq 64(%rax), %bits0381movq 72(%rax), %bits1382movq 80(%rax), %bits2383movq 88(%rax), %bits3384movq 96(%rax), %dtable385push %rax /* argument */386push %rax /* olimit */387push 104(%rax) /* ilimit */388389movq 112(%rax), %rax390push %rax /* oend3 */391392movq %op3, %rax393push %rax /* oend2 */394395movq %op2, %rax396push %rax /* oend1 */397398movq %op1, %rax399push %rax /* oend0 */400401/* Scratch space */402subq $8, %rsp403404.L_4X2_compute_olimit:405/* Computes how many iterations we can do safely406* %r15, %rax may be clobbered407* rdx must be saved408* op[1,2,3,4] & ip0 mustn't be clobbered409*/410movq %rdx, 0(%rsp)411412/* We can consume up to 7 input bytes each iteration. */413movq %ip0, %rax /* rax = ip0 */414movq 40(%rsp), %rdx /* rdx = ilimit */415subq %rdx, %rax /* rax = ip0 - ilimit */416movq %rax, %r15 /* r15 = ip0 - ilimit */417418/* rdx = rax / 7 */419movabsq $2635249153387078803, %rdx420mulq %rdx421subq %rdx, %r15422shrq %r15423addq %r15, %rdx424shrq $2, %rdx425426/* r15 = (ip0 - ilimit) / 7 */427movq %rdx, %r15428429movabsq $-3689348814741910323, %rdx430movq 8(%rsp), %rax /* rax = oend0 */431subq %op0, %rax /* rax = oend0 - op0 */432mulq %rdx433shrq $3, %rdx /* rdx = rax / 10 */434435/* r15 = min(%rdx, %r15) */436cmpq %rdx, %r15437cmova %rdx, %r15438439movabsq $-3689348814741910323, %rdx440movq 16(%rsp), %rax /* rax = oend1 */441subq %op1, %rax /* rax = oend1 - op1 */442mulq %rdx443shrq $3, %rdx /* rdx = rax / 10 */444445/* r15 = min(%rdx, %r15) */446cmpq %rdx, %r15447cmova %rdx, %r15448449movabsq $-3689348814741910323, %rdx450movq 24(%rsp), %rax /* rax = oend2 */451subq %op2, %rax /* rax = oend2 - op2 */452mulq %rdx453shrq $3, %rdx /* rdx = rax / 10 */454455/* r15 = min(%rdx, %r15) */456cmpq %rdx, %r15457cmova %rdx, %r15458459movabsq $-3689348814741910323, %rdx460movq 32(%rsp), %rax /* rax = oend3 */461subq %op3, %rax /* rax = oend3 - op3 */462mulq %rdx463shrq $3, %rdx /* rdx = rax / 10 */464465/* r15 = min(%rdx, %r15) */466cmpq %rdx, %r15467cmova %rdx, %r15468469/* olimit = op3 + 5 * r15 */470movq %r15, %rax471leaq (%op3, %rax, 4), %olimit472addq %rax, %olimit473474movq 0(%rsp), %rdx475476/* If (op3 + 10 > olimit) */477movq %op3, %rax /* rax = op3 */478addq $10, %rax /* rax = op3 + 10 */479cmpq %rax, %olimit /* op3 + 10 > olimit */480jb .L_4X2_exit481482/* If (ip1 < ip0) go to exit */483cmpq %ip0, %ip1484jb .L_4X2_exit485486/* If (ip2 < ip1) go to exit */487cmpq %ip1, %ip2488jb .L_4X2_exit489490/* If (ip3 < ip2) go to exit */491cmpq %ip2, %ip3492jb .L_4X2_exit493494#define DECODE(n, idx) \495movq %bits##n, %rax; \496shrq $53, %rax; \497movzwl 0(%dtable,%rax,4),%r8d; \498movzbl 2(%dtable,%rax,4),%r15d; \499movzbl 3(%dtable,%rax,4),%eax; \500movw %r8w, (%op##n); \501shlxq %r15, %bits##n, %bits##n; \502addq %rax, %op##n503504#define RELOAD_BITS(n) \505bsfq %bits##n, %bits##n; \506movq %bits##n, %rax; \507shrq $3, %bits##n; \508andq $7, %rax; \509subq %bits##n, %ip##n; \510movq (%ip##n), %bits##n; \511orq $1, %bits##n; \512shlxq %rax, %bits##n, %bits##n513514515movq %olimit, 48(%rsp)516517.p2align 6518519.L_4X2_loop_body:520/* We clobber r8, so store it on the stack */521movq %r8, 0(%rsp)522523/* Decode 5 symbols from each of the 4 streams (20 symbols total). */524FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)525FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)526FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)527FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)528FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)529530/* Reload r8 */531movq 0(%rsp), %r8532533FOR_EACH_STREAM(RELOAD_BITS)534535cmp %op3, 48(%rsp)536ja .L_4X2_loop_body537jmp .L_4X2_compute_olimit538539#undef DECODE540#undef RELOAD_BITS541.L_4X2_exit:542addq $8, %rsp543/* Restore stack (oend & olimit) */544pop %rax /* oend0 */545pop %rax /* oend1 */546pop %rax /* oend2 */547pop %rax /* oend3 */548pop %rax /* ilimit */549pop %rax /* olimit */550pop %rax /* arg */551552/* Save ip / op / bits */553movq %ip0, 0(%rax)554movq %ip1, 8(%rax)555movq %ip2, 16(%rax)556movq %ip3, 24(%rax)557movq %op0, 32(%rax)558movq %op1, 40(%rax)559movq %op2, 48(%rax)560movq %op3, 56(%rax)561movq %bits0, 64(%rax)562movq %bits1, 72(%rax)563movq %bits2, 80(%rax)564movq %bits3, 88(%rax)565566/* Restore registers */567pop %r15568pop %r14569pop %r13570pop %r12571pop %r11572pop %r10573pop %r9574pop %r8575pop %rdi576pop %rsi577pop %rbp578pop %rdx579pop %rcx580pop %rbx581pop %rax582ret583584#endif585586587