.arch armv8-a+crypto
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
20, 24, 25, 26, 27, 28, 29, 30, 31
.set .Lv\b\().4s, \b
.endr
.macro sm4e, vd, vn
.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
.macro sm4ekey, vd, vn, vm
.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
.align 3
SYM_FUNC_START(sm4_ce_expand_key)
ld1 {v0.16b}, [x0];
rev32 v0.16b, v0.16b;
ld1 {v1.16b}, [x3];
ld1 {v24.16b-v27.16b}, [x4],
ld1 {v28.16b-v31.16b}, [x4];
eor v0.16b, v0.16b, v1.16b;
sm4ekey v0.4s, v0.4s, v24.4s;
sm4ekey v1.4s, v0.4s, v25.4s;
sm4ekey v2.4s, v1.4s, v26.4s;
sm4ekey v3.4s, v2.4s, v27.4s;
sm4ekey v4.4s, v3.4s, v28.4s;
sm4ekey v5.4s, v4.4s, v29.4s;
sm4ekey v6.4s, v5.4s, v30.4s;
sm4ekey v7.4s, v6.4s, v31.4s;
adr_l x5, .Lbswap128_mask
ld1 {v24.16b}, [x5]
st1 {v0.16b-v3.16b}, [x1],
st1 {v4.16b-v7.16b}, [x1];
tbl v16.16b, {v7.16b}, v24.16b
tbl v17.16b, {v6.16b}, v24.16b
tbl v18.16b, {v5.16b}, v24.16b
tbl v19.16b, {v4.16b}, v24.16b
tbl v20.16b, {v3.16b}, v24.16b
tbl v21.16b, {v2.16b}, v24.16b
tbl v22.16b, {v1.16b}, v24.16b
tbl v23.16b, {v0.16b}, v24.16b
st1 {v16.16b-v19.16b}, [x2],
st1 {v20.16b-v23.16b}, [x2]
ret;
SYM_FUNC_END(sm4_ce_expand_key)
.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
SM4_PREPARE(x0)
ld1 {v0.16b}, [x2];
SM4_CRYPT_BLK(v0);
st1 {v0.16b}, [x1];
ret;
SYM_FUNC_END(sm4_ce_crypt_block)
.align 3
SYM_FUNC_START(sm4_ce_crypt)
SM4_PREPARE(x0)
.Lcrypt_loop_blk:
sub w3, w3,
tbnz w3,
ld1 {v0.16b-v3.16b}, [x2],
ld1 {v4.16b-v7.16b}, [x2],
SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
st1 {v0.16b-v3.16b}, [x1],
st1 {v4.16b-v7.16b}, [x1],
cbz w3, .Lcrypt_end;
b .Lcrypt_loop_blk;
.Lcrypt_tail8:
add w3, w3,
cmp w3,
blt .Lcrypt_tail4;
sub w3, w3,
ld1 {v0.16b-v3.16b}, [x2],
SM4_CRYPT_BLK4(v0, v1, v2, v3);
st1 {v0.16b-v3.16b}, [x1],
cbz w3, .Lcrypt_end;
.Lcrypt_tail4:
sub w3, w3,
ld1 {v0.16b}, [x2],
SM4_CRYPT_BLK(v0);
st1 {v0.16b}, [x1],
cbnz w3, .Lcrypt_tail4;
.Lcrypt_end:
ret;
SYM_FUNC_END(sm4_ce_crypt)
.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
SM4_PREPARE(x0)
ld1 {RIV.16b}, [x3]
.Lcbc_enc_loop_4x:
cmp w4,
blt .Lcbc_enc_loop_1x
sub w4, w4,
ld1 {v0.16b-v3.16b}, [x2],
eor v0.16b, v0.16b, RIV.16b
SM4_CRYPT_BLK(v0)
eor v1.16b, v1.16b, v0.16b
SM4_CRYPT_BLK(v1)
eor v2.16b, v2.16b, v1.16b
SM4_CRYPT_BLK(v2)
eor v3.16b, v3.16b, v2.16b
SM4_CRYPT_BLK(v3)
st1 {v0.16b-v3.16b}, [x1],
mov RIV.16b, v3.16b
cbz w4, .Lcbc_enc_end
b .Lcbc_enc_loop_4x
.Lcbc_enc_loop_1x:
sub w4, w4,
ld1 {v0.16b}, [x2],
eor RIV.16b, RIV.16b, v0.16b
SM4_CRYPT_BLK(RIV)
st1 {RIV.16b}, [x1],
cbnz w4, .Lcbc_enc_loop_1x
.Lcbc_enc_end:
st1 {RIV.16b}, [x3]
ret
SYM_FUNC_END(sm4_ce_cbc_enc)
.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
SM4_PREPARE(x0)
ld1 {RIV.16b}, [x3]
.Lcbc_dec_loop_8x:
sub w4, w4,
tbnz w4,
ld1 {v0.16b-v3.16b}, [x2],
ld1 {v4.16b-v7.16b}, [x2],
rev32 v8.16b, v0.16b
rev32 v9.16b, v1.16b
rev32 v10.16b, v2.16b
rev32 v11.16b, v3.16b
rev32 v12.16b, v4.16b
rev32 v13.16b, v5.16b
rev32 v14.16b, v6.16b
rev32 v15.16b, v7.16b
SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)
eor v8.16b, v8.16b, RIV.16b
eor v9.16b, v9.16b, v0.16b
eor v10.16b, v10.16b, v1.16b
eor v11.16b, v11.16b, v2.16b
eor v12.16b, v12.16b, v3.16b
eor v13.16b, v13.16b, v4.16b
eor v14.16b, v14.16b, v5.16b
eor v15.16b, v15.16b, v6.16b
st1 {v8.16b-v11.16b}, [x1],
st1 {v12.16b-v15.16b}, [x1],
mov RIV.16b, v7.16b
cbz w4, .Lcbc_dec_end
b .Lcbc_dec_loop_8x
.Lcbc_dec_4x:
add w4, w4,
cmp w4,
blt .Lcbc_dec_loop_1x
sub w4, w4,
ld1 {v0.16b-v3.16b}, [x2],
rev32 v8.16b, v0.16b
rev32 v9.16b, v1.16b
rev32 v10.16b, v2.16b
rev32 v11.16b, v3.16b
SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)
eor v8.16b, v8.16b, RIV.16b
eor v9.16b, v9.16b, v0.16b
eor v10.16b, v10.16b, v1.16b
eor v11.16b, v11.16b, v2.16b
st1 {v8.16b-v11.16b}, [x1],
mov RIV.16b, v3.16b
cbz w4, .Lcbc_dec_end
.Lcbc_dec_loop_1x:
sub w4, w4,
ld1 {v0.16b}, [x2],
rev32 v8.16b, v0.16b
SM4_CRYPT_BLK_BE(v8)
eor v8.16b, v8.16b, RIV.16b
st1 {v8.16b}, [x1],
mov RIV.16b, v0.16b
cbnz w4, .Lcbc_dec_loop_1x
.Lcbc_dec_end:
st1 {RIV.16b}, [x3]
ret
SYM_FUNC_END(sm4_ce_cbc_dec)
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
SM4_PREPARE(x0)
sub w5, w4,
uxtw x5, w5
ld1 {RIV.16b}, [x3]
ld1 {v0.16b}, [x2]
eor RIV.16b, RIV.16b, v0.16b
SM4_CRYPT_BLK(RIV)
adr_l x6, .Lcts_permute_table
add x7, x6,
add x6, x6, x5
sub x7, x7, x5
ld1 {v3.16b}, [x6]
ld1 {v4.16b}, [x7]
add x2, x2, x5
ld1 {v1.16b}, [x2]
tbl v0.16b, {RIV.16b}, v3.16b
tbl v1.16b, {v1.16b}, v4.16b
eor v1.16b, v1.16b, RIV.16b
SM4_CRYPT_BLK(v1)
add x5, x1, x5
st1 {v0.16b}, [x5]
st1 {v1.16b}, [x1]
ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
SM4_PREPARE(x0)
sub w5, w4,
uxtw x5, w5
ld1 {RIV.16b}, [x3]
adr_l x6, .Lcts_permute_table
add x7, x6,
add x6, x6, x5
sub x7, x7, x5
ld1 {v3.16b}, [x6]
ld1 {v4.16b}, [x7]
ld1 {v0.16b}, [x2], x5
ld1 {v1.16b}, [x2]
SM4_CRYPT_BLK(v0)
tbl v2.16b, {v0.16b}, v3.16b
eor v2.16b, v2.16b, v1.16b
tbx v0.16b, {v1.16b}, v4.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, RIV.16b
add x5, x1, x5
st1 {v2.16b}, [x5]
st1 {v0.16b}, [x1]
ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)
.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
SM4_PREPARE(x0)
ldp x7, x8, [x3]
rev x7, x7
rev x8, x8
.Lctr_loop_8x:
sub w4, w4,
tbnz w4,
mov vctr.d[1], x8; \
mov vctr.d[0], x7; \
adds x8, x8,
rev64 vctr.16b, vctr.16b; \
adc x7, x7, xzr;
inc_le128(v0)
inc_le128(v1)
inc_le128(v2)
inc_le128(v3)
inc_le128(v4)
inc_le128(v5)
inc_le128(v6)
inc_le128(v7)
ld1 {v8.16b-v11.16b}, [x2],
ld1 {v12.16b-v15.16b}, [x2],
SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
st1 {v0.16b-v3.16b}, [x1],
st1 {v4.16b-v7.16b}, [x1],
cbz w4, .Lctr_end
b .Lctr_loop_8x
.Lctr_4x:
add w4, w4,
cmp w4,
blt .Lctr_loop_1x
sub w4, w4,
inc_le128(v0)
inc_le128(v1)
inc_le128(v2)
inc_le128(v3)
ld1 {v8.16b-v11.16b}, [x2],
SM4_CRYPT_BLK4(v0, v1, v2, v3)
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
st1 {v0.16b-v3.16b}, [x1],
cbz w4, .Lctr_end
.Lctr_loop_1x:
sub w4, w4,
inc_le128(v0)
ld1 {v8.16b}, [x2],
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v8.16b
st1 {v0.16b}, [x1],
cbnz w4, .Lctr_loop_1x
.Lctr_end:
rev x7, x7
rev x8, x8
stp x7, x8, [x3]
ret
SYM_FUNC_END(sm4_ce_ctr_enc)
sshr RTMP.2d, vin.2d,
and RTMP.16b, RTMP.16b, RMASK.16b; \
add vt.2d, vin.2d, vin.2d; \
ext RTMP.16b, RTMP.16b, RTMP.16b,
eor vt.16b, vt.16b, RTMP.16b;
.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
ld1 {v8.16b}, [x3]
cbz x5, .Lxts_enc_nofirst
SM4_PREPARE(x5)
SM4_CRYPT_BLK(v8)
.Lxts_enc_nofirst:
SM4_PREPARE(x0)
ands w5, w4,
lsr w4, w4,
sub w6, w4,
csel w4, w4, w6, eq
uxtw x5, w5
movi RMASK.2s,
movi RTMP0.2s,
uzp1 RMASK.4s, RMASK.4s, RTMP0.4s
cbz w4, .Lxts_enc_cts
.Lxts_enc_loop_8x:
sub w4, w4,
tbnz w4,
tweak_next( v9, v8, RTMP0)
tweak_next(v10, v9, RTMP1)
tweak_next(v11, v10, RTMP2)
tweak_next(v12, v11, RTMP3)
tweak_next(v13, v12, RTMP0)
tweak_next(v14, v13, RTMP1)
tweak_next(v15, v14, RTMP2)
ld1 {v0.16b-v3.16b}, [x2],
ld1 {v4.16b-v7.16b}, [x2],
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
st1 {v0.16b-v3.16b}, [x1],
st1 {v4.16b-v7.16b}, [x1],
tweak_next(v8, v15, RTMP3)
cbz w4, .Lxts_enc_cts
b .Lxts_enc_loop_8x
.Lxts_enc_4x:
add w4, w4,
cmp w4,
blt .Lxts_enc_loop_1x
sub w4, w4,
tweak_next( v9, v8, RTMP0)
tweak_next(v10, v9, RTMP1)
tweak_next(v11, v10, RTMP2)
ld1 {v0.16b-v3.16b}, [x2],
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
SM4_CRYPT_BLK4(v0, v1, v2, v3)
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
st1 {v0.16b-v3.16b}, [x1],
tweak_next(v8, v11, RTMP3)
cbz w4, .Lxts_enc_cts
.Lxts_enc_loop_1x:
sub w4, w4,
ld1 {v0.16b}, [x2],
eor v0.16b, v0.16b, v8.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v8.16b
st1 {v0.16b}, [x1],
tweak_next(v8, v8, RTMP0)
cbnz w4, .Lxts_enc_loop_1x
.Lxts_enc_cts:
cbz x5, .Lxts_enc_end
tweak_next(v9, v8, RTMP0)
ld1 {v0.16b}, [x2]
eor v0.16b, v0.16b, v8.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v8.16b
adr_l x6, .Lcts_permute_table
add x7, x6,
add x6, x6, x5
sub x7, x7, x5
ld1 {v3.16b}, [x6]
ld1 {v4.16b}, [x7]
add x2, x2, x5
ld1 {v1.16b}, [x2]
tbl v2.16b, {v0.16b}, v3.16b
tbx v0.16b, {v1.16b}, v4.16b
eor v0.16b, v0.16b, v9.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v9.16b
add x5, x1, x5
st1 {v2.16b}, [x5]
st1 {v0.16b}, [x1]
b .Lxts_enc_ret
.Lxts_enc_end:
st1 {v8.16b}, [x3]
.Lxts_enc_ret:
ret
SYM_FUNC_END(sm4_ce_xts_enc)
.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
ld1 {v8.16b}, [x3]
cbz x5, .Lxts_dec_nofirst
SM4_PREPARE(x5)
SM4_CRYPT_BLK(v8)
.Lxts_dec_nofirst:
SM4_PREPARE(x0)
ands w5, w4,
lsr w4, w4,
sub w6, w4,
csel w4, w4, w6, eq
uxtw x5, w5
movi RMASK.2s,
movi RTMP0.2s,
uzp1 RMASK.4s, RMASK.4s, RTMP0.4s
cbz w4, .Lxts_dec_cts
.Lxts_dec_loop_8x:
sub w4, w4,
tbnz w4,
tweak_next( v9, v8, RTMP0)
tweak_next(v10, v9, RTMP1)
tweak_next(v11, v10, RTMP2)
tweak_next(v12, v11, RTMP3)
tweak_next(v13, v12, RTMP0)
tweak_next(v14, v13, RTMP1)
tweak_next(v15, v14, RTMP2)
ld1 {v0.16b-v3.16b}, [x2],
ld1 {v4.16b-v7.16b}, [x2],
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
st1 {v0.16b-v3.16b}, [x1],
st1 {v4.16b-v7.16b}, [x1],
tweak_next(v8, v15, RTMP3)
cbz w4, .Lxts_dec_cts
b .Lxts_dec_loop_8x
.Lxts_dec_4x:
add w4, w4,
cmp w4,
blt .Lxts_dec_loop_1x
sub w4, w4,
tweak_next( v9, v8, RTMP0)
tweak_next(v10, v9, RTMP1)
tweak_next(v11, v10, RTMP2)
ld1 {v0.16b-v3.16b}, [x2],
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
SM4_CRYPT_BLK4(v0, v1, v2, v3)
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
st1 {v0.16b-v3.16b}, [x1],
tweak_next(v8, v11, RTMP3)
cbz w4, .Lxts_dec_cts
.Lxts_dec_loop_1x:
sub w4, w4,
ld1 {v0.16b}, [x2],
eor v0.16b, v0.16b, v8.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v8.16b
st1 {v0.16b}, [x1],
tweak_next(v8, v8, RTMP0)
cbnz w4, .Lxts_dec_loop_1x
.Lxts_dec_cts:
cbz x5, .Lxts_dec_end
tweak_next(v9, v8, RTMP0)
ld1 {v0.16b}, [x2]
eor v0.16b, v0.16b, v9.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v9.16b
adr_l x6, .Lcts_permute_table
add x7, x6,
add x6, x6, x5
sub x7, x7, x5
ld1 {v3.16b}, [x6]
ld1 {v4.16b}, [x7]
add x2, x2, x5
ld1 {v1.16b}, [x2]
tbl v2.16b, {v0.16b}, v3.16b
tbx v0.16b, {v1.16b}, v4.16b
eor v0.16b, v0.16b, v8.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v8.16b
add x5, x1, x5
st1 {v2.16b}, [x5]
st1 {v0.16b}, [x1]
b .Lxts_dec_ret
.Lxts_dec_end:
st1 {v8.16b}, [x3]
.Lxts_dec_ret:
ret
SYM_FUNC_END(sm4_ce_xts_dec)
.align 3
SYM_FUNC_START(sm4_ce_mac_update)
SM4_PREPARE(x0)
ld1 {RMAC.16b}, [x1]
cbz w4, .Lmac_update
SM4_CRYPT_BLK(RMAC)
.Lmac_update:
cbz w3, .Lmac_ret
sub w6, w3,
cmp w5, wzr
csel w3, w3, w6, ne
cbz w3, .Lmac_end
.Lmac_loop_4x:
cmp w3,
blt .Lmac_loop_1x
sub w3, w3,
ld1 {v0.16b-v3.16b}, [x2],
eor RMAC.16b, RMAC.16b, v0.16b
SM4_CRYPT_BLK(RMAC)
eor RMAC.16b, RMAC.16b, v1.16b
SM4_CRYPT_BLK(RMAC)
eor RMAC.16b, RMAC.16b, v2.16b
SM4_CRYPT_BLK(RMAC)
eor RMAC.16b, RMAC.16b, v3.16b
SM4_CRYPT_BLK(RMAC)
cbz w3, .Lmac_end
b .Lmac_loop_4x
.Lmac_loop_1x:
sub w3, w3,
ld1 {v0.16b}, [x2],
eor RMAC.16b, RMAC.16b, v0.16b
SM4_CRYPT_BLK(RMAC)
cbnz w3, .Lmac_loop_1x
.Lmac_end:
cbnz w5, .Lmac_ret
ld1 {v0.16b}, [x2],
eor RMAC.16b, RMAC.16b, v0.16b
.Lmac_ret:
st1 {RMAC.16b}, [x1]
ret
SYM_FUNC_END(sm4_ce_mac_update)
.section ".rodata", "a"
.align 4
.Lbswap128_mask:
.byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
.byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03
.Lcts_permute_table:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff