/*
 * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation. (This ARM port uses the equivalent 64x64->128 polynomial
 * multiply instruction, vmull.p64.)
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * https://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:     Gregory Prestas <[email protected]>
 *              Alexander Boyko <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        .text
        .align          6
        .arch           armv8-a
        .arch_extension crc
        .fpu            crypto-neon-fp-armv8

.Lcrc32_constants:
        /*
         * [(x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
         * #define CONSTANT_R1  0x154442bd4LL
         *
         * [(x4*128-32 mod P(x) << 32)]'  << 1   = 0x1c6e41596
         * #define CONSTANT_R2  0x1c6e41596LL
         */
        .quad           0x0000000154442bd4
        .quad           0x00000001c6e41596

        /*
         * [(x128+32 mod P(x) << 32)]'    << 1   = 0x1751997d0
         * #define CONSTANT_R3  0x1751997d0LL
         *
         * [(x128-32 mod P(x) << 32)]'    << 1   = 0x0ccaa009e
         * #define CONSTANT_R4  0x0ccaa009eLL
         */
        .quad           0x00000001751997d0
        .quad           0x00000000ccaa009e

        /*
         * [(x64 mod P(x) << 32)]'        << 1   = 0x163cd6124
         * #define CONSTANT_R5  0x163cd6124LL
         */
        .quad           0x0000000163cd6124
        .quad           0x00000000FFFFFFFF

        /*
         * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
         *
         * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
         *                                        = 0x1F7011641LL
         * #define CONSTANT_RU  0x1F7011641LL
         */
        .quad           0x00000001DB710641
        .quad           0x00000001F7011641

        /*
         * The same constants, recomputed for the CRC-32C (Castagnoli)
         * polynomial 0x1edc6f41 (BE) / 0x82f63b78 (LE).
         */
.Lcrc32c_constants:
        .quad           0x00000000740eef02
        .quad           0x000000009e4addf8
        .quad           0x00000000f20c0dfe
        .quad           0x000000014cd00bd6
        .quad           0x00000000dd45aab8
        .quad           0x00000000FFFFFFFF
        .quad           0x0000000105ec76f0
        .quad           0x00000000dea713f1
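/*
 * For reference: all entry points below compute a standard bit-reflected
 * CRC from a caller-supplied initial value; the usual ~0 pre-/post-
 * inversion is left to the caller. A minimal bitwise C sketch of the same
 * computation (illustrative only, not part of this file; the name
 * crc32_bitwise_ref and the poly parameter are hypothetical) would be:
 *
 *      static u32 crc32_bitwise_ref(u32 crc, const u8 *p, size_t len,
 *                                   u32 poly)
 *      {
 *              // poly = 0xEDB88320 for CRC-32, 0x82F63B78 for CRC-32C
 *              while (len--) {
 *                      int i;
 *
 *                      crc ^= *p++;
 *                      for (i = 0; i < 8; i++)
 *                              crc = (crc >> 1) ^ ((crc & 1) ? poly : 0);
 *              }
 *              return crc;
 *      }
 */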
        dCONSTANTl      .req    d0
        dCONSTANTh      .req    d1
        qCONSTANT       .req    q0

        BUF             .req    r0
        LEN             .req    r1
        CRC             .req    r2

        qzr             .req    q9

        /**
         * Calculate crc32
         * BUF - buffer
         * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
         * CRC - initial crc32
         * return crc32 (in r0)
         * uint crc32_pmull_le(unsigned char const *buffer,
         *                     size_t len, uint crc32)
         */
SYM_FUNC_START(crc32_pmull_le)
        adr             r3, .Lcrc32_constants
        b               0f
SYM_FUNC_END(crc32_pmull_le)

SYM_FUNC_START(crc32c_pmull_le)
        adr             r3, .Lcrc32c_constants

0:      bic             LEN, LEN, #15
        vld1.8          {q1-q2}, [BUF, :128]!
        vld1.8          {q3-q4}, [BUF, :128]!
        vmov.i8         qzr, #0
        vmov.i8         qCONSTANT, #0
        vmov.32         dCONSTANTl[0], CRC
        veor.8          d2, d2, dCONSTANTl
        sub             LEN, LEN, #0x40
        cmp             LEN, #0x40
        blt             less_64

        vld1.64         {qCONSTANT}, [r3]

loop_64:                /* 64 bytes Full cache line folding */
        sub             LEN, LEN, #0x40

        vmull.p64       q5, d3, dCONSTANTh
        vmull.p64       q6, d5, dCONSTANTh
        vmull.p64       q7, d7, dCONSTANTh
        vmull.p64       q8, d9, dCONSTANTh

        vmull.p64       q1, d2, dCONSTANTl
        vmull.p64       q2, d4, dCONSTANTl
        vmull.p64       q3, d6, dCONSTANTl
        vmull.p64       q4, d8, dCONSTANTl

        veor.8          q1, q1, q5
        vld1.8          {q5}, [BUF, :128]!
        veor.8          q2, q2, q6
        vld1.8          {q6}, [BUF, :128]!
        veor.8          q3, q3, q7
        vld1.8          {q7}, [BUF, :128]!
        veor.8          q4, q4, q8
        vld1.8          {q8}, [BUF, :128]!

        veor.8          q1, q1, q5
        veor.8          q2, q2, q6
        veor.8          q3, q3, q7
        veor.8          q4, q4, q8

        cmp             LEN, #0x40
        bge             loop_64

less_64:                /* Folding cache line into 128bit */
        vldr            dCONSTANTl, [r3, #16]
        vldr            dCONSTANTh, [r3, #24]

        vmull.p64       q5, d3, dCONSTANTh
        vmull.p64       q1, d2, dCONSTANTl
        veor.8          q1, q1, q5
        veor.8          q1, q1, q2

        vmull.p64       q5, d3, dCONSTANTh
        vmull.p64       q1, d2, dCONSTANTl
        veor.8          q1, q1, q5
        veor.8          q1, q1, q3

        vmull.p64       q5, d3, dCONSTANTh
        vmull.p64       q1, d2, dCONSTANTl
        veor.8          q1, q1, q5
        veor.8          q1, q1, q4

        teq             LEN, #0
        beq             fold_64

loop_16:                /* Folding rest buffer into 128bit */
        subs            LEN, LEN, #0x10

        vld1.8          {q2}, [BUF, :128]!
        vmull.p64       q5, d3, dCONSTANTh
        vmull.p64       q1, d2, dCONSTANTl
        veor.8          q1, q1, q5
        veor.8          q1, q1, q2

        bne             loop_16

fold_64:
        /* perform the last 64 bit fold, also adds 32 zeroes
         * to the input stream */
        vmull.p64       q2, d2, dCONSTANTh
        vext.8          q1, q1, qzr, #8
        veor.8          q1, q1, q2

        /* final 32-bit fold */
        vldr            dCONSTANTl, [r3, #32]
        vldr            d6, [r3, #40]
        vmov.i8         d7, #0

        vext.8          q2, q1, qzr, #4
        vand.8          d2, d2, d6
        vmull.p64       q1, d2, dCONSTANTl
        veor.8          q1, q1, q2

        /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
        vldr            dCONSTANTl, [r3, #48]
        vldr            dCONSTANTh, [r3, #56]

        vand.8          q2, q1, q3
        vext.8          q2, qzr, q2, #8
        vmull.p64       q2, d5, dCONSTANTh
        vand.8          q2, q2, q3
        vmull.p64       q2, d4, dCONSTANTl
        veor.8          q1, q1, q2
        vmov            r0, s5

        bx              lr
SYM_FUNC_END(crc32c_pmull_le)

        /*
         * Scalar fallback using the ARMv8 CRC32 instructions:
         * u32 crc32{,c}_armv8_le(u32 init_crc, const u8 *buf, u32 len)
         * An empty \c emits the CRC-32 variant, \c = c the CRC-32C one.
         */
        .macro          __crc32, c
        subs            ip, r2, #8
        bmi             .Ltail\c

        tst             r1, #3
        bne             .Lunaligned\c

        teq             ip, #0
.Laligned8\c:
        ldrd            r2, r3, [r1], #8
ARM_BE8(rev             r2, r2          )
ARM_BE8(rev             r3, r3          )
        crc32\c\()w     r0, r0, r2
        crc32\c\()w     r0, r0, r3
        bxeq            lr
        subs            ip, ip, #8
        bpl             .Laligned8\c

.Ltail\c:
        tst             ip, #4
        beq             2f
        ldr             r3, [r1], #4
ARM_BE8(rev             r3, r3          )
        crc32\c\()w     r0, r0, r3

2:      tst             ip, #2
        beq             1f
        ldrh            r3, [r1], #2
ARM_BE8(rev16           r3, r3          )
        crc32\c\()h     r0, r0, r3

1:      tst             ip, #1
        bxeq            lr
        ldrb            r3, [r1]
        crc32\c\()b     r0, r0, r3
        bx              lr

.Lunaligned\c:
        tst             r1, #1
        beq             2f
        ldrb            r3, [r1], #1
        subs            r2, r2, #1
        crc32\c\()b     r0, r0, r3

        tst             r1, #2
        beq             0f
2:      ldrh            r3, [r1], #2
        subs            r2, r2, #2
ARM_BE8(rev16           r3, r3          )
        crc32\c\()h     r0, r0, r3

0:      subs            ip, r2, #8
        bpl             .Laligned8\c
        b               .Ltail\c
        .endm

        .align          5
SYM_FUNC_START(crc32_armv8_le)
        __crc32
SYM_FUNC_END(crc32_armv8_le)

        .align          5
SYM_FUNC_START(crc32c_armv8_le)
        __crc32 c
SYM_FUNC_END(crc32c_armv8_le)
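/*
 * Callers are expected to dispatch between the two implementations above:
 * the PMULL versions need NEON/Crypto Extensions and a length that is a
 * multiple of 16 bytes and greater than 63, while the __crc32 versions
 * handle any length. A minimal C sketch of such glue, assuming a
 * hypothetical feature flag have_pmull (this is not the actual glue-layer
 * code; the CRC-32C pair is wired up the same way):
 *
 *      u32 crc32_pmull_le(const u8 *buf, size_t len, u32 init_crc);
 *      u32 crc32_armv8_le(u32 init_crc, const u8 *buf, u32 len);
 *
 *      u32 crc32_update(u32 crc, const u8 *data, u32 len)
 *      {
 *              if (have_pmull && len >= 64) {
 *                      u32 chunk = len & ~15u;   // 16-byte multiple, >= 64
 *
 *                      crc = crc32_pmull_le(data, chunk, crc);
 *                      data += chunk;
 *                      len -= chunk;
 *              }
 *              return crc32_armv8_le(crc, data, len);  // scalar tail
 *      }
 */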