Path: blob/main/test/benchmark/benchmark_sse.h
6174 views
#pragma once12#include <stdio.h>3#include <math.h>4#include <time.h>5#include <inttypes.h>6#include <stdlib.h>7#include <assert.h>89#ifdef __EMSCRIPTEN__10#include <emscripten/emscripten.h>11#endif1213#if defined(__unix__) && !defined(__EMSCRIPTEN__) // Native build without Emscripten.14#include <time.h>15#include <errno.h>16#include <string.h>17#endif1819#ifdef __APPLE__20#define aligned_alloc(align, size) malloc((size))21#endif2223#ifdef WIN3224#include <Windows.h>25#define aligned_alloc(align, size) _aligned_malloc((size), (align))26#endif2728// Scalar horizontal max across four lanes.29float hmax(__m128 m) {30float f[4];31_mm_storeu_ps(f, m);32return fmax(fmax(f[0], f[1]), fmax(f[2], f[3]));33}3435#include "tick.h"3637const int N = 8*1024*1024;3839tick_t scalarTotalTicks = 0;40tick_t simdTotalTicks = 0;41tick_t scalarTicks = 0;42const char *chartName = "";43#define SETCHART(x) chartName = (x);4445#define START() \46do { \47tick_t start = tick();4849bool comma=false;50#define END(result, name) \51tick_t end = tick(); \52tick_t ticks = end - start; \53scalarTotalTicks += scalarTicks; \54simdTotalTicks += ticks; \55double nsecs = (double)ticks * 1000.0 * 1000.0 * 1000.0 / ticks_per_sec() / N; \56printf("%s{ \"chart\": \"%s\", \"category\": \"%s\", \"scalar\": %f, \"simd\": %f }\n", comma?",":"", chartName, name, scalarTime, nsecs); \57comma = true; \58printf("%s", (result) != 0 ? "Error!" : ""); \59} while(0)6061#define ENDSCALAR(result, name) \62tick_t end = tick(); \63scalarTicks = end - start; \64scalarTime = (double)scalarTicks * 1000.0 * 1000.0 * 1000.0 / ticks_per_sec() / N; \65printf("%s", (result) != 0 ? "Error!" : ""); \66} while(0)6768void Print(__m128 m)69{70float val[4];71_mm_storeu_ps(val, m);72fprintf(stderr, "[%g, %g, %g, %g]\n", val[3], val[2], val[1], val[0]);73}7475bool always_true() { return time(NULL) != 0; } // This function always returns true, but the compiler should not know this.7677#ifdef _MSC_VER78#define NOINLINE __declspec(noinline)79#define INLINE __forceinline80#else81#define NOINLINE __attribute__((noinline))82#define INLINE __inline__83#endif8485// Slightly awkward way to allocate so that compiler will definitely not see this memory area as compile-time optimizable:86int NOINLINE *alloc_int_buffer() { return always_true() ? (int*)aligned_alloc(16, (N+16)*sizeof(int)) : 0; }87float NOINLINE *alloc_float_buffer() { return always_true() ? (float*)aligned_alloc(16, (N+16)*sizeof(float)) : 0; }88double NOINLINE *alloc_double_buffer() { return always_true() ? (double*)aligned_alloc(16, (N+16)*sizeof(double)) : 0; }8990template<typename T>91T checksum_dst(T *dst) {92if (always_true()) {93return 0.f;94} else {95T s = 0.f; for(int i = 0; i < N; ++i) s += dst[i];96return s;97}98}99100uint32_t fcastu(float f) { return *(uint32_t*)&f; }101uint64_t dcastu(double f) { return *(uint64_t*)&f; }102float ucastf(uint32_t t) { return *(float*)&t; }103double ucastd(uint64_t t) { return *(double*)&t; }104105#define LOAD_STORE_F(msg, load_instr, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride) \106START(); \107for(int i = 0; i < N; i += num_elems_stride) \108store_instr((store_ptr_type)dst_flt+store_offset+i, load_instr(src_flt+load_offset+i)); \109END(checksum_dst(dst_flt), msg);110111#define LOAD_STORE_D(msg, load_instr, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride) \112START(); \113for(int i = 0; i < N; i += num_elems_stride) \114store_instr((store_ptr_type)dst_dbl+store_offset+i, load_instr(src_dbl+load_offset+i)); \115END(checksum_dst(dst_dbl), msg);116117#define LOAD_STORE_I(msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride) \118START(); \119for(int i = 0; i < N; i += num_elems_stride) \120store_instr((__m128i*)(dst_int+store_offset+i), load_instr((__m128i*)(src_int+load_offset+i))); \121END(checksum_dst(dst_int), msg);122123// load M64*, store M128124#define LOAD_STORE_M64(msg, reg, load_instr, load_ptr_type, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride) \125START(); \126for(int i = 0; i < N; i += num_elems_stride) \127store_instr((store_ptr_type)dst_flt+store_offset+i, load_instr(reg, (load_ptr_type)(src_flt+load_offset+i))); \128END(checksum_dst(dst_flt), msg);129130#define LOAD_STORE_64_F(msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride) \131START(); \132for(int i = 0; i < N; i += num_elems_stride) \133store_instr((__m64*)(dst_flt+store_offset+i), load_instr(src_flt+load_offset+i)); \134END(checksum_dst(dst_flt), msg);135136#define LOAD_STORE_64_D(msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride) \137START(); \138for(int i = 0; i < N; i += num_elems_stride) \139store_instr((__m64*)(dst_dbl+store_offset+i), load_instr(src_dbl+load_offset+i)); \140END(checksum_dst(dst_dbl), msg);141142#define SET_STORE_F(msg, set_instr) \143START(); \144for(int i = 0; i < N; i += 4) \145_mm_store_ps(dst_flt+i, set_instr); \146END(checksum_dst(dst_flt), msg);147148#define SET_STORE_D(msg, set_instr) \149START(); \150for(int i = 0; i < N; i += 4) \151_mm_store_pd(dst_dbl+i, set_instr); \152END(checksum_dst(dst_dbl), msg);153154#define UNARYOP_F_F(msg, instr, op0) \155START(); \156__m128 o = op0; \157for(int i = 0; i < N; i += 4) \158o = instr(o); \159_mm_store_ps(dst_flt, o); \160END(checksum_dst(dst_flt), msg);161162#define UNARYOP_I_I(msg, instr, op0) \163START(); \164__m128 o = op0; \165for(int i = 0; i < N; i += 4) \166o = instr(o); \167_mm_store_si128((__m128i*)dst_int, o); \168END(checksum_dst(dst_int), msg);169170#define UNARYOP_i_F(msg, instr) \171START(); \172for(int i = 0; i < N; i += 4) \173dst_int_scalar += instr; \174END(dst_int_scalar, msg);175176#define UNARYOP_D_D(msg, instr, op0) \177START(); \178__m128d o = op0; \179for(int i = 0; i < N; i += 2) \180o = instr(o); \181_mm_store_pd(dst_dbl, o); \182END(checksum_dst(dst_dbl), msg);183184#define BINARYOP_F_FF(msg, instr, op0, op1) \185START(); \186__m128 o0 = op0; \187__m128 o1 = op1; \188for(int i = 0; i < N; i += 4) \189o0 = instr(o0, o1); \190_mm_store_ps(dst_flt, o0); \191END(checksum_dst(dst_flt), msg);192193#define BINARYOP_I_II(msg, instr, op0, op1) \194START(); \195__m128 o0 = op0; \196__m128 o1 = op1; \197for(int i = 0; i < N; i += 4) \198o0 = instr(o0, o1); \199_mm_store_si128((__m128i*)dst_int, o0); \200END(checksum_dst(dst_int), msg);201202#define BINARYOP_D_DD(msg, instr, op0, op1) \203START(); \204__m128d o0 = op0; \205__m128d o1 = op1; \206for(int i = 0; i < N; i += 2) \207o0 = instr(o0, o1); \208_mm_store_pd(dst_dbl, o0); \209END(checksum_dst(dst_dbl), msg);210211#define Max(a,b) ((a) >= (b) ? (a) : (b))212#define Min(a,b) ((a) <= (b) ? (a) : (b))213214static INLINE int Isnan(float __f) {215return (*(unsigned int*)&__f << 1) > 0xFF000000u;216}217218219