CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Common/Math/fast/fast_matrix.c
Views: 1401
#include "ppsspp_config.h"

#include "Common/Math/CrossSIMD.h"

#include "fast_matrix.h"

// Every variant in this file computes the same product of two 4x4 float
// matrices, each stored as 16 contiguous floats:
//
//   dest[4 * c + r] = a[r] * b[4 * c] + a[4 + r] * b[4 * c + 1]
//                   + a[8 + r] * b[4 * c + 2] + a[12 + r] * b[4 * c + 3]
//
// i.e. each group of four output floats is a linear combination of the four
// groups of a, weighted by the matching group of four floats in b.

#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)

#include <emmintrin.h>

#include "fast_matrix.h"

// SSE variant. Uses unaligned loads and stores, so none of the pointers need
// to be 16-byte aligned. All of a is loaded before the loop, and each group of
// four floats of b is read before the matching group of dest is stored.
void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
	int i;
	__m128 a_col_1 = _mm_loadu_ps(a);
	__m128 a_col_2 = _mm_loadu_ps(&a[4]);
	__m128 a_col_3 = _mm_loadu_ps(&a[8]);
	__m128 a_col_4 = _mm_loadu_ps(&a[12]);

	for (i = 0; i < 16; i += 4) {
		// Broadcast each scalar of this group of b and scale the matching group of a.
		__m128 r_col = _mm_mul_ps(a_col_1, _mm_set1_ps(b[i]));
		r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_2, _mm_set1_ps(b[i + 1])));
		r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_3, _mm_set1_ps(b[i + 2])));
		r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_4, _mm_set1_ps(b[i + 3])));
		_mm_storeu_ps(&dest[i], r_col);
	}
}

#elif PPSSPP_ARCH(ARM_NEON)

#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

#if PPSSPP_ARCH(ARM)
// Stand-in for vfmaq_laneq_f32 on 32-bit ARM builds: returns _s + _a * _b[lane],
// built from vmlaq_lane_f32 on the low or high half of _b. A lane outside 0..3
// yields an all-zero vector.
// NOTE(review): vmulq_laneq_f32, used below, has no equivalent stand-in in this
// file; presumably CrossSIMD.h or the toolchain provides it for ARM32 - confirm.
static inline float32x4_t vfmaq_laneq_f32(float32x4_t _s, float32x4_t _a, float32x4_t _b, int lane) {
	if (lane == 0) return vmlaq_lane_f32(_s, _a, vget_low_f32(_b), 0);
	else if (lane == 1) return vmlaq_lane_f32(_s, _a, vget_low_f32(_b), 1);
	else if (lane == 2) return vmlaq_lane_f32(_s, _a, vget_high_f32(_b), 0);
	else if (lane == 3) return vmlaq_lane_f32(_s, _a, vget_high_f32(_b), 1);
	else return vdupq_n_f32(0.f);
}
#endif

// NEON variant. All of A is loaded up front, and each group of four floats of B
// is loaded before the matching group of C is stored.
// From https://developer.arm.com/documentation/102467/0100/Matrix-multiplication-example
void fast_matrix_mul_4x4_neon(float *C, const float *A, const float *B) {
	// these are the columns A
	float32x4_t A0;
	float32x4_t A1;
	float32x4_t A2;
	float32x4_t A3;

	// these are the columns B
	float32x4_t B0;
	float32x4_t B1;
	float32x4_t B2;
	float32x4_t B3;

	// these are the columns C
	float32x4_t C0;
	float32x4_t C1;
	float32x4_t C2;
	float32x4_t C3;

	A0 = vld1q_f32(A);
	A1 = vld1q_f32(A + 4);
	A2 = vld1q_f32(A + 8);
	A3 = vld1q_f32(A + 12);

	// Multiply accumulate in 4x1 blocks, i.e. each column in C
	B0 = vld1q_f32(B);
	C0 = vmulq_laneq_f32(A0, B0, 0);
	C0 = vfmaq_laneq_f32(C0, A1, B0, 1);
	C0 = vfmaq_laneq_f32(C0, A2, B0, 2);
	C0 = vfmaq_laneq_f32(C0, A3, B0, 3);
	vst1q_f32(C, C0);

	B1 = vld1q_f32(B + 4);
	C1 = vmulq_laneq_f32(A0, B1, 0);
	C1 = vfmaq_laneq_f32(C1, A1, B1, 1);
	C1 = vfmaq_laneq_f32(C1, A2, B1, 2);
	C1 = vfmaq_laneq_f32(C1, A3, B1, 3);
	vst1q_f32(C + 4, C1);

	B2 = vld1q_f32(B + 8);
	C2 = vmulq_laneq_f32(A0, B2, 0);
	C2 = vfmaq_laneq_f32(C2, A1, B2, 1);
	C2 = vfmaq_laneq_f32(C2, A2, B2, 2);
	C2 = vfmaq_laneq_f32(C2, A3, B2, 3);
	vst1q_f32(C + 8, C2);

	B3 = vld1q_f32(B + 12);
	C3 = vmulq_laneq_f32(A0, B3, 0);
	C3 = vfmaq_laneq_f32(C3, A1, B3, 1);
	C3 = vfmaq_laneq_f32(C3, A2, B3, 2);
	C3 = vfmaq_laneq_f32(C3, A3, B3, 3);
	vst1q_f32(C + 12, C3);
}

#else

// Portable fallback for targets without the SSE or NEON paths above.
//
// The SIMD variants read every input value before the store that could
// overwrite it, so they give correct results when dest is the same array as a
// or b. Writing straight into dest here would clobber inputs that later
// elements still need, so the product is accumulated in a local array and
// copied out at the end, making the fallback behave the same way.
void fast_matrix_mul_4x4_c(float *dest, const float *a, const float *b) {
	float result[16];
	int col, row, i;

	for (col = 0; col < 4; col++) {
		const float *b_col = &b[col * 4];
		for (row = 0; row < 4; row++) {
			// Terms are summed left to right, k = 0..3.
			result[col * 4 + row] = b_col[0] * a[row] + b_col[1] * a[4 + row] + b_col[2] * a[8 + row] + b_col[3] * a[12 + row];
		}
	}

	for (i = 0; i < 16; i++) {
		dest[i] = result[i];
	}
}

#endif