/* Path: blob/main/contrib/arm-optimized-routines/networking/test/chksum.c */
/* 48255 views */
/*1* Ones' complement checksum test & benchmark2*3* Copyright (c) 2016-2020, Arm Limited.4* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception5*/67#define _GNU_SOURCE8#include <inttypes.h>9#include <stdbool.h>10#include <stdint.h>11#include <stdio.h>12#include <stdlib.h>13#include <string.h>14#include <sys/mman.h>15#include <time.h>16#include <unistd.h>17#include "../include/networking.h"1819#if WANT_ASSERT20#undef NDEBUG21#include <assert.h>22#define Assert(exp) assert(exp)23#else24#define Assert(exp) (void) (exp)25#endif2627#ifdef __GNUC__28#define may_alias __attribute__((__may_alias__))29#else30#define may_alias31#endif3233#define CACHE_LINE 6434#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))3536/* Reference implementation - do not modify! */37static uint16_t38checksum_simple(const void *ptr, uint32_t nbytes)39{40const uint16_t *may_alias hptr = ptr;41uint64_t sum = 0;/* Need 64-bit accumulator when nbytes > 64K */4243/* Sum all halfwords, assume misaligned accesses are handled in HW */44for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--)45{46sum += *hptr++;47}4849/* Add any trailing odd byte */50if ((nbytes & 0x01) != 0)51{52sum += *(uint8_t *) hptr;53}5455/* Fold 64-bit sum to 32 bits */56sum = (sum & 0xffffffff) + (sum >> 32);57sum = (sum & 0xffffffff) + (sum >> 32);58Assert(sum == (uint32_t) sum);5960/* Fold 32-bit sum to 16 bits */61sum = (sum & 0xffff) + (sum >> 16);62sum = (sum & 0xffff) + (sum >> 16);63Assert(sum == (uint16_t) sum);6465return (uint16_t) sum;66}6768static struct69{70uint16_t (*cksum_fp)(const void *, uint32_t);71const char *name;72} implementations[] =73{74{ checksum_simple, "simple"},75{ __chksum, "scalar"},76#if __arm__77{ __chksum_arm_simd, "simd" },78#elif __aarch64__79{ __chksum_aarch64_simd, "simd" },80#endif81{ NULL, NULL}82};8384static int85find_impl(const char *name)86{87for (int i = 0; implementations[i].name != NULL; i++)88{89if (strcmp(implementations[i].name, name) == 0)90{91return i;92}93}94return 
-1;95}9697static uint16_t (*CKSUM_FP)(const void *, uint32_t);98static volatile uint16_t SINK;99100static bool101verify(const void *data, uint32_t offset, uint32_t size)102{103104uint16_t csum_expected = checksum_simple(data, size);105uint16_t csum_actual = CKSUM_FP(data, size);106if (csum_actual != csum_expected)107{108fprintf(stderr, "\nInvalid checksum for offset %u size %u: "109"actual %04x expected %04x (valid)",110offset, size, csum_actual, csum_expected);111if (size < 65536)112{113/* Fatal error */114exit(EXIT_FAILURE);115}116/* Else some implementations only support sizes up to 2^16 */117return false;118}119return true;120}121122static uint64_t123clock_get_ns(void)124{125struct timespec ts;126clock_gettime(CLOCK_MONOTONIC, &ts);127return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;128}129130static void131benchmark(const uint8_t *base,132size_t poolsize,133uint32_t blksize,134uint32_t numops,135uint64_t cpufreq)136{137printf("%11u ", (unsigned int) blksize); fflush(stdout);138139uint64_t start = clock_get_ns();140for (uint32_t i = 0; i < numops; i ++)141{142/* Read a random value from the pool */143uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)];144/* Generate a random starting address */145const void *data = &base[random % (poolsize - blksize)];146SINK = CKSUM_FP(data, blksize);147}148uint64_t end = clock_get_ns();149150#define MEGABYTE 1000000 /* Decimal megabyte (MB) */151uint64_t elapsed_ns = end - start;152uint64_t elapsed_ms = elapsed_ns / 1000000;153uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000);154uint64_t accbytes = (uint64_t) numops * blksize;155printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE);156unsigned int cyc_per_blk = cpufreq / blks_per_s;157printf("%11u ", cyc_per_blk);158if (blksize != 0)159{160unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize;161printf("%7u.%03u ",162cyc_per_byte / 1000, cyc_per_byte % 1000);163}164printf("\n");165}166167int main(int argc, char *argv[])168{169int 
c;170bool DUMP = false;171uint32_t IMPL = 0;/* Simple implementation */172uint64_t CPUFREQ = 0;173uint32_t BLKSIZE = 0;174uint32_t NUMOPS = 1000000;175uint32_t POOLSIZE = 512 * 1024;/* Typical ARM L2 cache size */176177setvbuf(stdout, NULL, _IOLBF, 160);178while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1)179{180switch (c)181{182case 'b' :183{184int blksize = atoi(optarg);185if (blksize < 1 || blksize > POOLSIZE / 2)186{187fprintf(stderr, "Invalid block size %d\n", blksize);188exit(EXIT_FAILURE);189}190BLKSIZE = (unsigned) blksize;191break;192}193case 'd' :194DUMP = true;195break;196case 'f' :197{198int64_t cpufreq = atoll(optarg);199if (cpufreq < 1)200{201fprintf(stderr, "Invalid CPU frequency %"PRId64"\n",202cpufreq);203exit(EXIT_FAILURE);204}205CPUFREQ = cpufreq;206break;207}208case 'i' :209{210int impl = find_impl(optarg);211if (impl < 0)212{213fprintf(stderr, "Invalid implementation %s\n", optarg);214goto usage;215}216IMPL = (unsigned) impl;217break;218}219case 'n' :220{221int numops = atoi(optarg);222if (numops < 1)223{224fprintf(stderr, "Invalid number of operations %d\n", numops);225exit(EXIT_FAILURE);226}227NUMOPS = (unsigned) numops;228break;229}230case 'p' :231{232int poolsize = atoi(optarg);233if (poolsize < 4096)234{235fprintf(stderr, "Invalid pool size %d\n", poolsize);236exit(EXIT_FAILURE);237}238char c = optarg[strlen(optarg) - 1];239if (c == 'M')240{241POOLSIZE = (unsigned) poolsize * 1024 * 1024;242}243else if (c == 'K')244{245POOLSIZE = (unsigned) poolsize * 1024;246}247else248{249POOLSIZE = (unsigned) poolsize;250}251break;252}253default :254usage :255fprintf(stderr, "Usage: checksum <options>\n"256"-b <blksize> Block size\n"257"-d Dump first 96 bytes of data\n"258"-f <cpufreq> CPU frequency (Hz)\n"259"-i <impl> Implementation\n"260"-n <numops> Number of operations\n"261"-p <poolsize> Pool size (K or M suffix)\n"262);263printf("Implementations:");264for (int i = 0; implementations[i].name != NULL; i++)265{266printf(" %s", 
implementations[i].name);267}268printf("\n");269exit(EXIT_FAILURE);270}271}272if (optind > argc)273{274goto usage;275}276277CKSUM_FP = implementations[IMPL].cksum_fp;278POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE);279uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE,280MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);281if (base == MAP_FAILED)282{283perror("aligned_alloc"), exit(EXIT_FAILURE);284}285for (size_t i = 0; i < POOLSIZE / 4; i++)286{287((uint32_t *) base)[i] = rand();288}289290printf("Implementation: %s\n", implementations[IMPL].name);291printf("numops %u, poolsize ", NUMOPS);292if (POOLSIZE % (1024 * 1024) == 0)293{294printf("%uMiB", POOLSIZE / (1024 * 1024));295}296else if (POOLSIZE % 1024 == 0)297{298printf("%uKiB", POOLSIZE / 1024);299}300else301{302printf("%uB", POOLSIZE);303}304printf(", blocksize %u, CPU frequency %juMHz\n",305BLKSIZE, (uintmax_t) (CPUFREQ / 1000000));306#if WANT_ASSERT307printf("Warning: assertions are enabled\n");308#endif309310if (DUMP)311{312/* Print out first 96 bytes of data for human debugging */313for (int i = 0; i < 96; i++)314{315if (i % 8 == 0)316printf("%2u:", i);317printf(" %02x", base[i]);318if (i % 8 == 7)319printf("\n");320}321}322323/* Verify that chosen algorithm handles all combinations of offsets and sizes */324printf("Verifying..."); fflush(stdout);325bool success = true;326/* Check all (relevant) combinations of size and offset */327for (int size = 0; size <= 256; size++)328{329for (int offset = 0; offset < 255; offset++)330{331/* Check at start of mapped memory */332success &= verify(&base[offset], offset, size);333/* Check at end of mapped memory */334uint8_t *p = base + POOLSIZE - (size + offset);335success &= verify(p, (uintptr_t) p % 64, size);336}337}338/* Check increasingly larger sizes */339for (size_t size = 1; size < POOLSIZE; size *= 2)340{341success &= verify(base, 0, size);342}343/* Check the full size, this can detect accumulator overflows */344success &= verify(base, 0, POOLSIZE);345printf("%s\n", success ? 
"OK" : "failure");346347/* Print throughput in decimal megabyte (1000000B) per second */348if (CPUFREQ != 0)349{350printf("%11s %11s %11s %11s\n",351"block size", "MB/s", "cycles/blk", "cycles/byte");352}353else354{355printf("%11s %11s %11s %11s\n",356"block size", "MB/s", "ns/blk", "ns/byte");357CPUFREQ = 1000000000;358}359if (BLKSIZE != 0)360{361benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ);362}363else364{365static const uint16_t sizes[] =366{ 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 };367for (int i = 0; sizes[i] != 0; i++)368{369uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]);370benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ);371}372}373374if (munmap(base, POOLSIZE) != 0)375{376perror("munmap"), exit(EXIT_FAILURE);377}378379return success ? EXIT_SUCCESS : EXIT_FAILURE;380}381382383