CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Common/Data/Convert/SmallDataConvert.h
Views: 1401
#pragma once12#include <cstdint>3#include <cstring>4#include <cmath>56#include "Common/Common.h"7#include "ppsspp_config.h"89#ifdef _M_SSE10#include <emmintrin.h>11#endif12#if PPSSPP_ARCH(ARM_NEON)13#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)14#include <arm64_neon.h>15#else16#include <arm_neon.h>17#endif18#endif1920extern const float one_over_255_x4[4];21extern const float exactly_255_x4[4];2223// Utilities useful for filling in std140-layout uniform buffers, and similar.24// NEON intrinsics: https://developer.arm.com/documentation/den0018/a/NEON-Intrinsics?lang=en2526// LSBs in f[0], etc.27inline void Uint8x4ToFloat4(float f[4], uint32_t u) {28#ifdef _M_SSE29__m128i zero = _mm_setzero_si128();30__m128i value = _mm_set1_epi32(u);31__m128i value32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(value, zero), zero);32__m128 fvalues = _mm_mul_ps(_mm_cvtepi32_ps(value32), _mm_load_ps(one_over_255_x4));33_mm_storeu_ps(f, fvalues);34#elif PPSSPP_ARCH(ARM_NEON)35const uint8x8_t value = (uint8x8_t)vdup_n_u32(u);36const uint16x8_t value16 = vmovl_u8(value);37const uint32x4_t value32 = vmovl_u16(vget_low_u16(value16));38const float32x4_t valueFloat = vmulq_f32(vcvtq_f32_u32(value32), vdupq_n_f32(1.0f / 255.0f));39vst1q_f32(f, valueFloat);40#else41f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);42f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);43f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);44f[3] = ((u >> 24) & 0xFF) * (1.0f / 255.0f);45#endif46}4748// Could be SSE optimized.49inline uint32_t Float4ToUint8x4(const float f[4]) {50#ifdef _M_SSE51__m128i zero = _mm_setzero_si128();52__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));53__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);54return _mm_cvtsi128_si32(ivalue);55#elif PPSSPP_ARCH(ARM_NEON)56const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));57uint32x4_t ivalue32 = vcvtq_u32_f32(value);58uint16x4_t ivalue16 = vqmovn_u32(ivalue32);59uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16)); // Is there no way to avoid the combine here?60uint32x2_t outValue32 = vreinterpret_u32_u8(ivalue8);61return vget_lane_u32(outValue32, 0);62#else63int i4[4];64for (int i = 0; i < 4; i++) {65if (f[i] > 1.0f) {66i4[i] = 255;67} else if (f[i] < 0.0f) {68i4[i] = 0;69} else {70i4[i] = (int)(f[i] * 255.0f);71}72}73return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);74#endif75}7677inline uint32_t Float4ToUint8x4_NoClamp(const float f[4]) {78#ifdef _M_SSE79// Does actually clamp, no way to avoid it with the pack ops!80__m128i zero = _mm_setzero_si128();81__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));82__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);83return _mm_cvtsi128_si32(ivalue);84#elif PPSSPP_ARCH(ARM_NEON)85const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));86uint32x4_t ivalue32 = vcvtq_u32_f32(value);87uint16x4_t ivalue16 = vqmovn_u32(ivalue32);88uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16)); // Is there no way to avoid the combine here?89uint32x2_t outValue32 = vreinterpret_u32_u8(ivalue8);90return vget_lane_u32(outValue32, 0);91#else92u32 i4[4];93for (int i = 0; i < 4; i++) {94i4[i] = (int)(f[i] * 255.0f);95}96return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);97#endif98}99100inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) {101#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)102Uint8x4ToFloat4(f, (u & 0xFFFFFF) | (alpha << 24));103#else104f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);105f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);106f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);107f[3] = alpha * (1.0f / 255.0f);108#endif109}110111inline void Uint8x3ToFloat4(float f[4], uint32_t u) {112#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)113Uint8x4ToFloat4(f, u & 0xFFFFFF);114#else115f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);116f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);117f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);118f[3] = ((u >> 24) & 0xFF) * (1.0f / 255.0f);119#endif120}121122inline void Uint8x3ToFloat3(float f[4], uint32_t u) {123#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)124float temp[4];125Uint8x4ToFloat4(temp, u & 0xFFFFFF);126f[0] = temp[0];127f[1] = temp[1];128f[2] = temp[2];129#else130f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);131f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);132f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);133#endif134}135136inline void Uint8x3ToInt4(int i[4], uint32_t u) {137i[0] = ((u >> 0) & 0xFF);138i[1] = ((u >> 8) & 0xFF);139i[2] = ((u >> 16) & 0xFF);140i[3] = 0;141}142143inline void Uint8x3ToInt4_Alpha(int i[4], uint32_t u, uint8_t alpha) {144i[0] = ((u >> 0) & 0xFF);145i[1] = ((u >> 8) & 0xFF);146i[2] = ((u >> 16) & 0xFF);147i[3] = alpha;148}149150inline void Uint8x3ToFloat4_Alpha(float f[4], uint32_t u, float alpha) {151f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);152f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);153f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);154f[3] = alpha;155}156157inline void Uint8x1ToFloat4(float f[4], uint32_t u) {158f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);159f[1] = 0.0f;160f[2] = 0.0f;161f[3] = 0.0f;162}163164// These are just for readability.165166inline void CopyFloat2(float dest[2], const float src[2]) {167dest[0] = src[0];168dest[1] = src[1];169}170171inline void CopyFloat3(float dest[3], const float src[3]) {172dest[0] = src[0];173dest[1] = src[1];174dest[2] = src[2];175}176177inline void CopyFloat4(float dest[4], const float src[4]) {178#ifdef _M_SSE179_mm_storeu_ps(dest, _mm_loadu_ps(src));180#else181dest[0] = src[0];182dest[1] = src[1];183dest[2] = src[2];184dest[3] = src[3];185#endif186}187188inline void CopyFloat1To4(float dest[4], const float src) {189#ifdef _M_SSE190_mm_storeu_ps(dest, _mm_set_ss(src));191#else192dest[0] = src;193dest[1] = 0.0f;194dest[2] = 0.0f;195dest[3] = 0.0f;196#endif197}198199inline void CopyFloat2To4(float dest[4], const float src[2]) {200dest[0] = src[0];201dest[1] = src[1];202dest[2] = 0.0f;203dest[3] = 0.0f;204}205206inline void CopyFloat3To4(float dest[4], const float src[3]) {207dest[0] = src[0];208dest[1] = src[1];209dest[2] = src[2];210dest[3] = 0.0f;211}212213inline void CopyMatrix4x4(float dest[16], const float src[16]) {214memcpy(dest, src, sizeof(float) * 16);215}216217inline void ExpandFloat24x3ToFloat4(float dest[4], const uint32_t src[3]) {218#ifdef _M_SSE219__m128i values = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8);220_mm_storeu_si128((__m128i *)dest, values);221#elif PPSSPP_ARCH(ARM_NEON)222const uint32x4_t values = vshlq_n_u32(vld1q_u32(src), 8);223vst1q_u32((uint32_t *)dest, values);224#else225uint32_t temp[4] = { src[0] << 8, src[1] << 8, src[2] << 8, 0 };226memcpy(dest, temp, sizeof(float) * 4);227#endif228}229230// Note: If length is 0.0, it's gonna be left as 0.0 instead of trying to normalize. This is important.231inline void ExpandFloat24x3ToFloat4AndNormalize(float dest[4], const uint32_t src[3]) {232float temp[4];233ExpandFloat24x3ToFloat4(temp, src);234// TODO: Reuse code from NormalizedOr001 and optimize235float x = temp[0];236float y = temp[1];237float z = temp[2];238float len = sqrtf(x * x + y * y + z * z);239if (len != 0.0f)240len = 1.0f / len;241dest[0] = x * len;242dest[1] = y * len;243dest[2] = z * len;244dest[3] = 0.0f;245}246247inline uint32_t BytesToUint32(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {248return (a) | (b << 8) | (c << 16) | (d << 24);249}250251constexpr int32_t SignExtend8ToS32(uint32_t value) {252// This extends this sign at the 8th bit to the other 24 bits.253return (int8_t)(value & 0xFF);254}255256constexpr uint32_t SignExtend8ToU32(uint32_t value) {257// Just treat the bits as unsigned.258return (uint32_t)SignExtend8ToS32(value);259}260261constexpr int32_t SignExtend16ToS32(uint32_t value) {262// Same as SignExtend8toS32, but from the 16th bit.263return (int16_t)(value & 0xFFFF);264}265266constexpr uint32_t SignExtend16ToU32(uint32_t value) {267return (uint32_t)SignExtend16ToS32(value);268}269270271