Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/networking/test/chksum.c
48255 views
1
/*
2
* Ones' complement checksum test & benchmark
3
*
4
* Copyright (c) 2016-2020, Arm Limited.
5
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6
*/
7
8
#define _GNU_SOURCE
9
#include <inttypes.h>
10
#include <stdbool.h>
11
#include <stdint.h>
12
#include <stdio.h>
13
#include <stdlib.h>
14
#include <string.h>
15
#include <sys/mman.h>
16
#include <time.h>
17
#include <unistd.h>
18
#include "../include/networking.h"
19
20
#if WANT_ASSERT
21
#undef NDEBUG
22
#include <assert.h>
23
#define Assert(exp) assert(exp)
24
#else
25
#define Assert(exp) (void) (exp)
26
#endif
27
28
#ifdef __GNUC__
29
#define may_alias __attribute__((__may_alias__))
30
#else
31
#define may_alias
32
#endif
33
34
#define CACHE_LINE 64
35
#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
36
37
/* Reference implementation - do not modify! */
38
static uint16_t
39
checksum_simple(const void *ptr, uint32_t nbytes)
40
{
41
const uint16_t *may_alias hptr = ptr;
42
uint64_t sum = 0;/* Need 64-bit accumulator when nbytes > 64K */
43
44
/* Sum all halfwords, assume misaligned accesses are handled in HW */
45
for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--)
46
{
47
sum += *hptr++;
48
}
49
50
/* Add any trailing odd byte */
51
if ((nbytes & 0x01) != 0)
52
{
53
sum += *(uint8_t *) hptr;
54
}
55
56
/* Fold 64-bit sum to 32 bits */
57
sum = (sum & 0xffffffff) + (sum >> 32);
58
sum = (sum & 0xffffffff) + (sum >> 32);
59
Assert(sum == (uint32_t) sum);
60
61
/* Fold 32-bit sum to 16 bits */
62
sum = (sum & 0xffff) + (sum >> 16);
63
sum = (sum & 0xffff) + (sum >> 16);
64
Assert(sum == (uint16_t) sum);
65
66
return (uint16_t) sum;
67
}
68
69
static struct
70
{
71
uint16_t (*cksum_fp)(const void *, uint32_t);
72
const char *name;
73
} implementations[] =
74
{
75
{ checksum_simple, "simple"},
76
{ __chksum, "scalar"},
77
#if __arm__
78
{ __chksum_arm_simd, "simd" },
79
#elif __aarch64__
80
{ __chksum_aarch64_simd, "simd" },
81
#endif
82
{ NULL, NULL}
83
};
84
85
static int
86
find_impl(const char *name)
87
{
88
for (int i = 0; implementations[i].name != NULL; i++)
89
{
90
if (strcmp(implementations[i].name, name) == 0)
91
{
92
return i;
93
}
94
}
95
return -1;
96
}
97
98
static uint16_t (*CKSUM_FP)(const void *, uint32_t);
99
static volatile uint16_t SINK;
100
101
static bool
102
verify(const void *data, uint32_t offset, uint32_t size)
103
{
104
105
uint16_t csum_expected = checksum_simple(data, size);
106
uint16_t csum_actual = CKSUM_FP(data, size);
107
if (csum_actual != csum_expected)
108
{
109
fprintf(stderr, "\nInvalid checksum for offset %u size %u: "
110
"actual %04x expected %04x (valid)",
111
offset, size, csum_actual, csum_expected);
112
if (size < 65536)
113
{
114
/* Fatal error */
115
exit(EXIT_FAILURE);
116
}
117
/* Else some implementations only support sizes up to 2^16 */
118
return false;
119
}
120
return true;
121
}
122
123
static uint64_t
124
clock_get_ns(void)
125
{
126
struct timespec ts;
127
clock_gettime(CLOCK_MONOTONIC, &ts);
128
return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
129
}
130
131
static void
132
benchmark(const uint8_t *base,
133
size_t poolsize,
134
uint32_t blksize,
135
uint32_t numops,
136
uint64_t cpufreq)
137
{
138
printf("%11u ", (unsigned int) blksize); fflush(stdout);
139
140
uint64_t start = clock_get_ns();
141
for (uint32_t i = 0; i < numops; i ++)
142
{
143
/* Read a random value from the pool */
144
uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)];
145
/* Generate a random starting address */
146
const void *data = &base[random % (poolsize - blksize)];
147
SINK = CKSUM_FP(data, blksize);
148
}
149
uint64_t end = clock_get_ns();
150
151
#define MEGABYTE 1000000 /* Decimal megabyte (MB) */
152
uint64_t elapsed_ns = end - start;
153
uint64_t elapsed_ms = elapsed_ns / 1000000;
154
uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000);
155
uint64_t accbytes = (uint64_t) numops * blksize;
156
printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE);
157
unsigned int cyc_per_blk = cpufreq / blks_per_s;
158
printf("%11u ", cyc_per_blk);
159
if (blksize != 0)
160
{
161
unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize;
162
printf("%7u.%03u ",
163
cyc_per_byte / 1000, cyc_per_byte % 1000);
164
}
165
printf("\n");
166
}
167
168
int main(int argc, char *argv[])
169
{
170
int c;
171
bool DUMP = false;
172
uint32_t IMPL = 0;/* Simple implementation */
173
uint64_t CPUFREQ = 0;
174
uint32_t BLKSIZE = 0;
175
uint32_t NUMOPS = 1000000;
176
uint32_t POOLSIZE = 512 * 1024;/* Typical ARM L2 cache size */
177
178
setvbuf(stdout, NULL, _IOLBF, 160);
179
while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1)
180
{
181
switch (c)
182
{
183
case 'b' :
184
{
185
int blksize = atoi(optarg);
186
if (blksize < 1 || blksize > POOLSIZE / 2)
187
{
188
fprintf(stderr, "Invalid block size %d\n", blksize);
189
exit(EXIT_FAILURE);
190
}
191
BLKSIZE = (unsigned) blksize;
192
break;
193
}
194
case 'd' :
195
DUMP = true;
196
break;
197
case 'f' :
198
{
199
int64_t cpufreq = atoll(optarg);
200
if (cpufreq < 1)
201
{
202
fprintf(stderr, "Invalid CPU frequency %"PRId64"\n",
203
cpufreq);
204
exit(EXIT_FAILURE);
205
}
206
CPUFREQ = cpufreq;
207
break;
208
}
209
case 'i' :
210
{
211
int impl = find_impl(optarg);
212
if (impl < 0)
213
{
214
fprintf(stderr, "Invalid implementation %s\n", optarg);
215
goto usage;
216
}
217
IMPL = (unsigned) impl;
218
break;
219
}
220
case 'n' :
221
{
222
int numops = atoi(optarg);
223
if (numops < 1)
224
{
225
fprintf(stderr, "Invalid number of operations %d\n", numops);
226
exit(EXIT_FAILURE);
227
}
228
NUMOPS = (unsigned) numops;
229
break;
230
}
231
case 'p' :
232
{
233
int poolsize = atoi(optarg);
234
if (poolsize < 4096)
235
{
236
fprintf(stderr, "Invalid pool size %d\n", poolsize);
237
exit(EXIT_FAILURE);
238
}
239
char c = optarg[strlen(optarg) - 1];
240
if (c == 'M')
241
{
242
POOLSIZE = (unsigned) poolsize * 1024 * 1024;
243
}
244
else if (c == 'K')
245
{
246
POOLSIZE = (unsigned) poolsize * 1024;
247
}
248
else
249
{
250
POOLSIZE = (unsigned) poolsize;
251
}
252
break;
253
}
254
default :
255
usage :
256
fprintf(stderr, "Usage: checksum <options>\n"
257
"-b <blksize> Block size\n"
258
"-d Dump first 96 bytes of data\n"
259
"-f <cpufreq> CPU frequency (Hz)\n"
260
"-i <impl> Implementation\n"
261
"-n <numops> Number of operations\n"
262
"-p <poolsize> Pool size (K or M suffix)\n"
263
);
264
printf("Implementations:");
265
for (int i = 0; implementations[i].name != NULL; i++)
266
{
267
printf(" %s", implementations[i].name);
268
}
269
printf("\n");
270
exit(EXIT_FAILURE);
271
}
272
}
273
if (optind > argc)
274
{
275
goto usage;
276
}
277
278
CKSUM_FP = implementations[IMPL].cksum_fp;
279
POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE);
280
uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE,
281
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
282
if (base == MAP_FAILED)
283
{
284
perror("aligned_alloc"), exit(EXIT_FAILURE);
285
}
286
for (size_t i = 0; i < POOLSIZE / 4; i++)
287
{
288
((uint32_t *) base)[i] = rand();
289
}
290
291
printf("Implementation: %s\n", implementations[IMPL].name);
292
printf("numops %u, poolsize ", NUMOPS);
293
if (POOLSIZE % (1024 * 1024) == 0)
294
{
295
printf("%uMiB", POOLSIZE / (1024 * 1024));
296
}
297
else if (POOLSIZE % 1024 == 0)
298
{
299
printf("%uKiB", POOLSIZE / 1024);
300
}
301
else
302
{
303
printf("%uB", POOLSIZE);
304
}
305
printf(", blocksize %u, CPU frequency %juMHz\n",
306
BLKSIZE, (uintmax_t) (CPUFREQ / 1000000));
307
#if WANT_ASSERT
308
printf("Warning: assertions are enabled\n");
309
#endif
310
311
if (DUMP)
312
{
313
/* Print out first 96 bytes of data for human debugging */
314
for (int i = 0; i < 96; i++)
315
{
316
if (i % 8 == 0)
317
printf("%2u:", i);
318
printf(" %02x", base[i]);
319
if (i % 8 == 7)
320
printf("\n");
321
}
322
}
323
324
/* Verify that chosen algorithm handles all combinations of offsets and sizes */
325
printf("Verifying..."); fflush(stdout);
326
bool success = true;
327
/* Check all (relevant) combinations of size and offset */
328
for (int size = 0; size <= 256; size++)
329
{
330
for (int offset = 0; offset < 255; offset++)
331
{
332
/* Check at start of mapped memory */
333
success &= verify(&base[offset], offset, size);
334
/* Check at end of mapped memory */
335
uint8_t *p = base + POOLSIZE - (size + offset);
336
success &= verify(p, (uintptr_t) p % 64, size);
337
}
338
}
339
/* Check increasingly larger sizes */
340
for (size_t size = 1; size < POOLSIZE; size *= 2)
341
{
342
success &= verify(base, 0, size);
343
}
344
/* Check the full size, this can detect accumulator overflows */
345
success &= verify(base, 0, POOLSIZE);
346
printf("%s\n", success ? "OK" : "failure");
347
348
/* Print throughput in decimal megabyte (1000000B) per second */
349
if (CPUFREQ != 0)
350
{
351
printf("%11s %11s %11s %11s\n",
352
"block size", "MB/s", "cycles/blk", "cycles/byte");
353
}
354
else
355
{
356
printf("%11s %11s %11s %11s\n",
357
"block size", "MB/s", "ns/blk", "ns/byte");
358
CPUFREQ = 1000000000;
359
}
360
if (BLKSIZE != 0)
361
{
362
benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ);
363
}
364
else
365
{
366
static const uint16_t sizes[] =
367
{ 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 };
368
for (int i = 0; sizes[i] != 0; i++)
369
{
370
uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]);
371
benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ);
372
}
373
}
374
375
if (munmap(base, POOLSIZE) != 0)
376
{
377
perror("munmap"), exit(EXIT_FAILURE);
378
}
379
380
return success ? EXIT_SUCCESS : EXIT_FAILURE;
381
}
382
383