GitHub Repository: torvalds/linux
Path: blob/master/lib/crc/x86/crc32.h

// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86-optimized CRC32 functions
 *
 * Copyright (C) 2008 Intel Corporation
 * Copyright 2012 Xyratex Technology Limited
 * Copyright 2024 Google LLC
 */

#include "crc-pclmul-template.h"

static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);

DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);

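/*
 * CRC-32 ("le", generator polynomial 0xedb88320, bit-reflected).
 * CRC_PCLMUL() (defined in crc-pclmul-template.h) returns early via the
 * PCLMULQDQ-based code when that path applies; otherwise execution falls
 * through to the generic table-based crc32_le_base().
 */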
static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
{
	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
		   have_pclmulqdq);
	return crc32_le_base(crc, p, len);
}

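/*
 * CRC32_INST consumes one unsigned long per instruction: crc32q eats
 * 8 bytes on x86_64, while 32-bit kernels use crc32l, which eats 4.
 */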
#ifdef CONFIG_X86_64
#define CRC32_INST "crc32q %1, %q0"
#else
#define CRC32_INST "crc32l %1, %0"
#endif

/*
 * Use the carryless multiply version of crc32c only when the buffer
 * size is >= 512 bytes, so that the FPU state save/restore overhead of
 * kernel_fpu_begin()/kernel_fpu_end() is amortized over enough data.
 */
#define CRC32C_PCLMUL_BREAKEVEN 512

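/* Assembly routine that CRCs 3 independent streams and combines them */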
asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);

static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
{
	size_t num_longs;

	if (!static_branch_likely(&have_crc32))
		return crc32c_base(crc, p, len);

	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
	    static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
		/*
		 * Long length, the vector registers are usable, and the CPU is
		 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
		 * It is worthwhile to divide the data into multiple streams,
		 * CRC them independently, and combine them using PCLMULQDQ.
		 * crc32c_x86_3way() does this using 3 streams, which is the
		 * most that x86_64 CPUs have traditionally been capable of.
		 *
		 * However, due to improved VPCLMULQDQ performance on newer
		 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
		 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
		 * "good" implementation of AVX-512.
		 *
		 * Future work: the optimal strategy on Zen 3-5 is actually to
		 * use both crc32q and VPCLMULQDQ in parallel. Unfortunately,
		 * different numbers of streams and vector lengths are optimal
		 * on each CPU microarchitecture, making it challenging to take
		 * advantage of this. (Zen 5 even supports 7 parallel crc32q, a
		 * major upgrade.) For now, just choose between
		 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter
		 * is needed anyway for crc32_le(), so we just reuse it here.
		 */
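		/*
		 * For intuition (a sketch, not the kernel's exact code): CRC
		 * is linear over GF(2), so for a message split as A || B,
		 * ignoring the init/final XOR conventions,
		 *
		 *	crc(A || B) = (crc(A) * x^(8*len(B)) mod G) ^ crc(B)
		 *
		 * where G is the bit-reflected CRC-32C generator 0x82f63b78.
		 * The multiply-by-x^N-mod-G step is exactly what PCLMULQDQ
		 * accelerates, which is why the per-stream CRCs can be merged
		 * with a few carryless multiplies instead of rescanning the
		 * data.
		 */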
		kernel_fpu_begin();
		if (static_branch_likely(&have_vpclmul_avx512))
			crc = crc32_lsb_vpclmul_avx512(crc, p, len,
					crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
		else
			crc = crc32c_x86_3way(crc, p, len);
		kernel_fpu_end();
		return crc;
	}

	/*
	 * Short length, XMM registers unusable, or the CPU is 32-bit; but the
	 * CPU supports CRC32 instructions. Just issue a single stream of CRC32
	 * instructions inline. While this doesn't use the CPU's CRC32
	 * throughput very well, it avoids the need to combine streams. Stream
	 * combination would be inefficient here.
	 */

	/* Bulk: one unsigned long (8 bytes on x86_64, 4 on x86_32) per crc32 */
	for (num_longs = len / sizeof(unsigned long);
	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));

	/* Tail: at most 4 + 2 + 1 leftover bytes; the 4-byte step is
	 * compiled out on 32-bit kernels. */
	if (sizeof(unsigned long) > 4 && (len & 4)) {
		asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
		p += 4;
	}
	if (len & 2) {
		asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
		p += 2;
	}
	if (len & 1)
		asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));

	return crc;
}

#define crc32_be_arch crc32_be_base /* not implemented on this arch */

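/*
 * One-time init: probe CPU features and flip the static keys (and the
 * crc32_lsb_pclmul static call) so the fast paths above get patched in.
 */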
#define crc32_mod_init_arch crc32_mod_init_arch
static inline void crc32_mod_init_arch(void)
{
	if (boot_cpu_has(X86_FEATURE_XMM4_2))
		static_branch_enable(&have_crc32);
	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
		static_branch_enable(&have_pclmulqdq);
		if (have_vpclmul()) {
			if (have_avx512()) {
				static_call_update(crc32_lsb_pclmul,
						   crc32_lsb_vpclmul_avx512);
				static_branch_enable(&have_vpclmul_avx512);
			} else {
				static_call_update(crc32_lsb_pclmul,
						   crc32_lsb_vpclmul_avx2);
			}
		}
	}
}

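/* Report which CRC variants have an accelerated implementation enabled */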
static inline u32 crc32_optimizations_arch(void)
{
	u32 optimizations = 0;

	if (static_key_enabled(&have_crc32))
		optimizations |= CRC32C_OPTIMIZATION;
	if (static_key_enabled(&have_pclmulqdq))
		optimizations |= CRC32_LE_OPTIMIZATION;
	return optimizations;
}
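
/*
 * Usage sketch (hypothetical caller, not part of this file): these _arch
 * helpers back the generic lib/crc32 entry points, so kernel code just
 * calls e.g.
 *
 *	u32 crc = crc32c(~0, buf, len);
 *
 * and dispatch between crc32c_base(), the inline CRC32 instructions, and
 * the PCLMULQDQ/VPCLMULQDQ paths happens via the static keys set in
 * crc32_mod_init_arch().
 */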