/* Path: blob/main/contrib/bearssl/src/symcipher/chacha20_sse2.c */
/*
 * Copyright (c) 2017 Thomas Pornin <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

#if BR_SSE2

/*
 * This file contains a ChaCha20 implementation that leverages SSE2
 * opcodes for better performance.
 */

/* see bearssl_block.h */
br_chacha20_run
br_chacha20_sse2_get(void)
{
	/*
	 * If using 64-bit mode, then SSE2 opcodes should be automatically
	 * available, since they are part of the ABI.
	 *
	 * In 32-bit mode, we use CPUID to detect the SSE2 feature.
	 */

#if BR_amd64
	return &br_chacha20_sse2_run;
#else

	/*
	 * SSE2 support is indicated by bit 26 in EDX.
	 */
	if (br_cpuid(0, 0, 0, 0x04000000)) {
		return &br_chacha20_sse2_run;
	} else {
		return 0;
	}
#endif
}

BR_TARGETS_X86_UP

/*
 * ChaCha20 keystream + XOR over 'len' bytes of 'data', using the
 * 256-bit 'key', the 96-bit nonce 'iv' and starting block counter 'cc'.
 * Returns the block counter value after processing (low 32 bits).
 * See bearssl_block.h for the full contract.
 */
/* see bearssl_block.h */
BR_TARGET("sse2")
uint32_t
br_chacha20_sse2_run(const void *key,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	unsigned char *buf;
	uint32_t ivtmp[4];
	__m128i kw0, kw1;
	__m128i iw, cw;
	__m128i one;

	/* ChaCha20 constant words: "expand 32-byte k" as little-endian. */
	static const uint32_t CW[] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	buf = data;
	kw0 = _mm_loadu_si128(key);
	kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
	/* State words 12..15: 32-bit block counter followed by the 96-bit IV. */
	ivtmp[0] = cc;
	memcpy(ivtmp + 1, iv, 12);
	iw = _mm_loadu_si128((const void *)ivtmp);
	cw = _mm_loadu_si128((const void *)CW);
	one = _mm_set_epi32(0, 0, 0, 1);

	while (len > 0) {
		/*
		 * sj contains state words 4*j to 4*j+3.
		 */
		__m128i s0, s1, s2, s3;
		int i;

		s0 = cw;
		s1 = kw0;
		s2 = kw1;
		s3 = iw;
		/* 10 iterations of (even round + odd round) = 20 rounds. */
		for (i = 0; i < 10; i ++) {
			/*
			 * Even round is straightforward application on
			 * the state words. Rotations are emulated with
			 * shift pairs since SSE2 has no 32-bit rotate.
			 */
			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * For the odd round, we must rotate some state
			 * words so that the computations apply on the
			 * right combinations of words.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x39);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x93);

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * After the odd round, we rotate back the values
			 * to undo the rotate at the start of the odd round.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x93);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x39);
		}

		/*
		 * Addition with the initial state.
		 */
		s0 = _mm_add_epi32(s0, cw);
		s1 = _mm_add_epi32(s1, kw0);
		s2 = _mm_add_epi32(s2, kw1);
		s3 = _mm_add_epi32(s3, iw);

		/*
		 * Increment block counter (lowest 32-bit lane of iw only).
		 */
		iw = _mm_add_epi32(iw, one);

		/*
		 * XOR final state with the data.
		 */
		if (len < 64) {
			/*
			 * Partial final block: spill the keystream to a
			 * stack buffer and XOR byte by byte.
			 */
			unsigned char tmp[64];
			size_t u;

			_mm_storeu_si128((void *)(tmp + 0), s0);
			_mm_storeu_si128((void *)(tmp + 16), s1);
			_mm_storeu_si128((void *)(tmp + 32), s2);
			_mm_storeu_si128((void *)(tmp + 48), s3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			break;
		} else {
			/* Full 64-byte block: XOR in place, 16 bytes at a time. */
			__m128i b0, b1, b2, b3;

			b0 = _mm_loadu_si128((const void *)(buf + 0));
			b1 = _mm_loadu_si128((const void *)(buf + 16));
			b2 = _mm_loadu_si128((const void *)(buf + 32));
			b3 = _mm_loadu_si128((const void *)(buf + 48));
			b0 = _mm_xor_si128(b0, s0);
			b1 = _mm_xor_si128(b1, s1);
			b2 = _mm_xor_si128(b2, s2);
			b3 = _mm_xor_si128(b3, s3);
			_mm_storeu_si128((void *)(buf + 0), b0);
			_mm_storeu_si128((void *)(buf + 16), b1);
			_mm_storeu_si128((void *)(buf + 32), b2);
			_mm_storeu_si128((void *)(buf + 48), b3);
			buf += 64;
			len -= 64;
		}
	}

	/*
	 * Return the updated block counter (low lane of iw).
	 * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
	 * raw SSE2, thus we use _mm_extract_epi16() twice.
	 */
	return (uint32_t)_mm_extract_epi16(iw, 0)
		| ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
}

BR_TARGETS_X86_DOWN

#else

/* see bearssl_block.h */
br_chacha20_run
br_chacha20_sse2_get(void)
{
	/* SSE2 not compiled in: no implementation available. */
	return 0;
}

#endif