CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Software/Lighting.cpp
Views: 1401
// Copyright (c) 2013- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"18#include <cmath>19#include "Common/Common.h"20#include "Common/CPUDetect.h"21#include "GPU/GPUState.h"22#include "GPU/Software/Lighting.h"2324namespace Lighting {2526static inline Vec3f GetLightVec(const u32 lparams[12], int light) {27#if defined(_M_SSE) && !PPSSPP_ARCH(X86)28__m128i values = _mm_loadu_si128((__m128i *)&lparams[3 * light]);29__m128i from24 = _mm_slli_epi32(values, 8);30return _mm_castsi128_ps(from24);31#elif PPSSPP_ARCH(ARM64_NEON)32uint32x4_t values = vld1q_u32((uint32_t *)&lparams[3 * light]);33uint32x4_t from24 = vshlq_n_u32(values, 8);34return vreinterpretq_f32_u32(from24);35#else36return Vec3<float>(getFloat24(lparams[3 * light]), getFloat24(lparams[3 * light + 1]), getFloat24(lparams[3 * light + 2]));37#endif38}3940static inline float pspLightPow(float v, float e) {41if (e <= 0.0f) {42return 1.0f;43}44if (v > 0.0f) {45return pow(v, e);46}47// Negative stays negative, so let's just return the original.48return v;49}5051static inline Vec4<int> LightColorFactor(const Vec4<int> &expanded, const Vec4<int> &ones) {52#if defined(_M_SSE) && !PPSSPP_ARCH(X86)53return _mm_add_epi32(_mm_slli_epi32(expanded.ivec, 1), ones.ivec);54#elif PPSSPP_ARCH(ARM64_NEON)55return vaddq_s32(vshlq_n_s32(expanded.ivec, 1), ones.ivec);56#else57return expanded * 2 + ones;58#endif59}6061static inline Vec4<int> LightColorFactor(uint32_t c, const Vec4<int> &ones) {62return LightColorFactor(Vec4<int>::FromRGBA(c), ones);63}6465static inline bool IsLargerThanHalf(const Vec4<int> &v) {66#if defined(_M_SSE) && !PPSSPP_ARCH(X86)67__m128i add23 = _mm_add_epi32(v.ivec, _mm_shuffle_epi32(v.ivec, _MM_SHUFFLE(3, 2, 3, 2)));68__m128i add1 = _mm_add_epi32(add23, _mm_shuffle_epi32(add23, _MM_SHUFFLE(1, 1, 1, 1)));69return _mm_cvtsi128_si32(add1) > 4;70#elif PPSSPP_ARCH(ARM64_NEON)71int32x2_t add02 = vpmax_s32(vget_low_s32(v.ivec), vget_high_s32(v.ivec));72int32x2_t add1 = vpmax_s32(add02, add02);73return vget_lane_s32(add1, 0) > 4;74#else75bool larger = false;76for (int i = 0; i < 3; ++i)77larger = v[i] > 1;78return larger;79#endif80}8182void ComputeState(State *state, bool hasColor0) {83const Vec4<int> ones = Vec4<int>::AssignToAll(1);8485bool anyAmbient = false;86bool anyDiffuse = false;87bool anySpecular = false;88bool anyNonDirectional = false;89for (int light = 0; light < 4; ++light) {90auto &lstate = state->lights[light];91lstate.enabled = gstate.isLightChanEnabled(light);92if (!lstate.enabled)93continue;9495lstate.poweredDiffuse = gstate.isUsingPoweredDiffuseLight(light);96lstate.specular = gstate.isUsingSpecularLight(light);9798lstate.ambientColorFactor = LightColorFactor(gstate.getLightAmbientColor(light), ones);99lstate.ambient = IsLargerThanHalf(lstate.ambientColorFactor);100anyAmbient = anyAmbient || lstate.ambient;101102lstate.diffuseColorFactor = LightColorFactor(gstate.getDiffuseColor(light), ones);103lstate.diffuse = IsLargerThanHalf(lstate.diffuseColorFactor);104anyDiffuse = anyDiffuse || lstate.diffuse;105106if (lstate.specular) {107lstate.specularColorFactor = LightColorFactor(gstate.getSpecularColor(light), ones);108lstate.specular = IsLargerThanHalf(lstate.specularColorFactor);109anySpecular = anySpecular || lstate.specular;110}111112// Doesn't actually need to be on if nothing will affect it.113if (!lstate.specular && !lstate.ambient && !lstate.diffuse) {114lstate.enabled = false;115continue;116}117118lstate.pos = GetLightVec(gstate.lpos, light);119lstate.directional = gstate.isDirectionalLight(light);120if (lstate.directional) {121lstate.pos.NormalizeOr001();122} else {123lstate.att = GetLightVec(gstate.latt, light);124anyNonDirectional = true;125}126127lstate.spot = gstate.isSpotLight(light);128if (lstate.spot) {129lstate.spotDir = GetLightVec(gstate.ldir, light);130lstate.spotDir.Normalize();131lstate.spotCutoff = getFloat24(gstate.lcutoff[light]);132if (std::isnan(lstate.spotCutoff) && std::signbit(lstate.spotCutoff))133lstate.spotCutoff = 0.0f;134135lstate.spotExp = getFloat24(gstate.lconv[light]);136if (lstate.spotExp <= 0.0f)137lstate.spotExp = 0.0f;138else if (std::isnan(lstate.spotExp))139lstate.spotExp = std::signbit(lstate.spotExp) ? 0.0f : INFINITY;140}141}142143const int materialupdate = gstate.materialupdate & (hasColor0 ? 7 : 0);144state->colorForAmbient = (materialupdate & 1) != 0;145state->colorForDiffuse = (materialupdate & 2) != 0;146state->colorForSpecular = (materialupdate & 4) != 0;147148if (!state->colorForAmbient) {149state->material.ambientColorFactor = LightColorFactor(gstate.getMaterialAmbientRGBA(), ones);150if (!IsLargerThanHalf(state->material.ambientColorFactor) && anyAmbient) {151for (int i = 0; i < 4; ++i)152state->lights[i].ambient = false;153}154}155156if (anyDiffuse && !state->colorForDiffuse) {157state->material.diffuseColorFactor = LightColorFactor(gstate.getMaterialDiffuse(), ones);158if (!IsLargerThanHalf(state->material.diffuseColorFactor)) {159anyDiffuse = false;160for (int i = 0; i < 4; ++i)161state->lights[i].diffuse = false;162}163}164165if (anySpecular && !state->colorForSpecular) {166state->material.specularColorFactor = LightColorFactor(gstate.getMaterialSpecular(), ones);167if (!IsLargerThanHalf(state->material.specularColorFactor)) {168anySpecular = false;169for (int i = 0; i < 4; ++i)170state->lights[i].specular = false;171}172}173174if (anyDiffuse || anySpecular) {175state->specularExp = gstate.getMaterialSpecularCoef();176if (state->specularExp <= 0.0f)177state->specularExp = 0.0f;178else if (std::isnan(state->specularExp))179state->specularExp = std::signbit(state->specularExp) ? 0.0f : INFINITY;180}181182state->baseAmbientColorFactor = LightColorFactor(gstate.getAmbientRGBA(), ones);183state->setColor1 = gstate.isUsingSecondaryColor() && anySpecular;184state->addColor1 = !gstate.isUsingSecondaryColor() && anySpecular;185state->usesWorldPos = anyNonDirectional;186state->usesWorldNormal = gstate.getUVGenMode() == GE_TEXMAP_ENVIRONMENT_MAP || anyDiffuse || anySpecular;187}188189static inline float GenerateLightCoord(VertexData &vertex, const WorldCoords &worldnormal, int light) {190// TODO: Should specular lighting should affect this, too? Doesn't in GLES.191Vec3<float> L = GetLightVec(gstate.lpos, light);192// In other words, L.Length2() == 0.0f means Dot({0, 0, 1}, worldnormal).193float diffuse_factor = Dot(L.NormalizedOr001(cpu_info.bSSE4_1), worldnormal);194195return (diffuse_factor + 1.0f) / 2.0f;196}197198void GenerateLightST(VertexData &vertex, const WorldCoords &worldnormal) {199// Always calculate texture coords from lighting results if environment mapping is active200// This should be done even if lighting is disabled altogether.201vertex.texturecoords.s() = GenerateLightCoord(vertex, worldnormal, gstate.getUVLS0());202vertex.texturecoords.t() = GenerateLightCoord(vertex, worldnormal, gstate.getUVLS1());203}204205#if defined(_M_SSE)206#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)207[[gnu::target("sse4.1")]]208#endif209static inline int LightCeilSSE4(float f) {210__m128 v = _mm_set_ss(f);211// This isn't terribly fast, but seems to be better than calling ceilf().212return _mm_cvt_ss2si(_mm_ceil_ss(v, v));213}214215#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)216[[gnu::target("sse4.1")]]217#endif218static inline __m128i LightColorScaleBy512SSE4(__m128i factor, __m128i color, __m128i scale) {219// We can use 16-bit multiply here (faster than 32-bit multiply) since our top bits are zero.220__m128i result18 = _mm_madd_epi16(factor, color);221// But now with 18 bits, we need a full multiply.222__m128i multiplied = _mm_mullo_epi32(result18, scale);223return _mm_srai_epi32(multiplied, 10 + 9);224}225#endif226227template <bool useSSE4>228static inline int LightCeil(float f) {229#if defined(_M_SSE)230if (useSSE4)231return LightCeilSSE4(f);232#elif PPSSPP_ARCH(ARM64_NEON)233return vcvtps_s32_f32(f);234#endif235return (int)ceilf(f);236}237238template <bool useSSE4>239static Vec4<int> LightColorScaleBy512(const Vec4<int> &factor, const Vec4<int> &color, int scale) {240// We multiply s9 * s9 * s9, resulting in s27, then shift off 19 to get 8-bit.241// The reason all factors are s9 is to account for rounding.242// Also note that all values are positive, so can be treated as unsigned.243#if defined(_M_SSE) && !PPSSPP_ARCH(X86)244if (useSSE4)245return LightColorScaleBy512SSE4(factor.ivec, color.ivec, _mm_set1_epi32(scale));246#elif PPSSPP_ARCH(ARM64_NEON)247int32x4_t multiplied = vmulq_n_s32(vmulq_s32(factor.ivec, color.ivec), scale);248return vshrq_n_s32(multiplied, 10 + 9);249#endif250return (factor * color * scale) >> (10 + 9);251}252253static inline void LightColorSum(Vec4<int> &sum, const Vec4<int> &src) {254#if defined(_M_SSE) && !PPSSPP_ARCH(X86)255sum.ivec = _mm_add_epi32(sum.ivec, src.ivec);256#elif PPSSPP_ARCH(ARM64_NEON)257sum.ivec = vaddq_s32(sum.ivec, src.ivec);258#else259sum += src;260#endif261}262263static inline float Dot33(const Vec3f &a, const Vec3f &b) {264#if defined(_M_SSE)265__m128 v = _mm_mul_ps(SAFE_M128(a.vec), SAFE_M128(b.vec)); // [X, Y, Z, W]266__m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1)); // [Y, X, Z, W]267__m128 sums = _mm_add_ps(v, shuf); // [X + Y, X + Y, Z + Z, W + W]268shuf = _mm_movehl_ps(shuf, shuf); // [Z, W, Z, W]269return _mm_cvtss_f32(_mm_add_ss(sums, shuf)); // X + Y + Z270#elif PPSSPP_ARCH(ARM64_NEON)271float32x4_t multipled = vsetq_lane_f32(0.0f, vmulq_f32(a.vec, b.vec), 3);272float32x2_t add1 = vget_low_f32(vpaddq_f32(multipled, multipled));273float32x2_t add2 = vpadd_f32(add1, add1);274return vget_lane_f32(add2, 0);275#else276return Dot(a, b);277#endif278}279280template <bool useSSE4>281static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords &worldnormal, const State &state) {282// Lighting blending rounds using the half offset method (like alpha blend.)283Vec4<int> colorFactor;284if (state.colorForAmbient || state.colorForDiffuse || state.colorForSpecular) {285const Vec4<int> ones = Vec4<int>::AssignToAll(1);286colorFactor = LightColorFactor(vertex.color0, ones);287}288289Vec4<int> mec = Vec4<int>::FromRGBA(gstate.getMaterialEmissive());290291Vec4<int> mac = state.colorForAmbient ? colorFactor : state.material.ambientColorFactor;292Vec4<int> ambient = (mac * state.baseAmbientColorFactor) >> 10;293294Vec4<int> final_color = mec + ambient;295Vec4<int> specular_color = Vec4<int>::AssignToAll(0);296297for (unsigned int light = 0; light < 4; ++light) {298const auto &lstate = state.lights[light];299if (!lstate.enabled)300continue;301302// L = vector from vertex to light source303// TODO: Should transfer the light positions to world/view space for these calculations?304Vec3<float> L = lstate.pos;305float attspot = 1.0f;306if (!lstate.directional) {307L -= worldpos;308// TODO: Should this normalize (0, 0, 0) to (0, 0, 1)?309float d = L.NormalizeOr001();310311float att = 1.0f / Dot33(lstate.att, Vec3f(1.0f, d, d * d));312if (!(att > 0.0f))313att = 0.0f;314else if (att > 1.0f)315att = 1.0f;316attspot = att;317}318319if (lstate.spot) {320float rawSpot = Dot33(lstate.spotDir, L);321if (std::isnan(rawSpot))322rawSpot = std::signbit(rawSpot) ? 0.0f : 1.0f;323324float spot = 1.0f;325if (rawSpot >= lstate.spotCutoff) {326spot = pspLightPow(rawSpot, lstate.spotExp);327if (std::isnan(spot))328spot = 0.0f;329} else {330spot = 0.0f;331}332333attspot *= spot;334}335336// ambient lighting337if (lstate.ambient) {338int attspot512 = (int)LightCeil<useSSE4>(256 * 2 * attspot + 1);339if (attspot512 > 512)340attspot512 = 512;341Vec4<int> lambient = LightColorScaleBy512<useSSE4>(lstate.ambientColorFactor, mac, attspot512);342LightColorSum(final_color, lambient);343}344345// diffuse lighting346float diffuse_factor;347if (lstate.diffuse || lstate.specular) {348diffuse_factor = Dot33(L, worldnormal);349if (lstate.poweredDiffuse) {350diffuse_factor = pspLightPow(diffuse_factor, state.specularExp);351}352}353354if (lstate.diffuse && diffuse_factor > 0.0f) {355int diffuse_attspot = (int)LightCeil<useSSE4>(256 * 2 * attspot * diffuse_factor + 1);356if (diffuse_attspot > 512)357diffuse_attspot = 512;358Vec4<int> mdc = state.colorForDiffuse ? colorFactor : state.material.diffuseColorFactor;359Vec4<int> ldiffuse = LightColorScaleBy512<useSSE4>(lstate.diffuseColorFactor, mdc, diffuse_attspot);360LightColorSum(final_color, ldiffuse);361}362363if (lstate.specular && diffuse_factor >= 0.0f) {364Vec3<float> H = L + Vec3<float>(0.f, 0.f, 1.f);365366float specular_factor = Dot33(H.NormalizedOr001(useSSE4), worldnormal);367specular_factor = pspLightPow(specular_factor, state.specularExp);368369if (specular_factor > 0.0f) {370int specular_attspot = (int)LightCeil<useSSE4>(256 * 2 * attspot * specular_factor + 1);371if (specular_attspot > 512)372specular_attspot = 512;373374Vec4<int> msc = state.colorForSpecular ? colorFactor : state.material.specularColorFactor;375Vec4<int> lspecular = LightColorScaleBy512<useSSE4>(lstate.specularColorFactor, msc, specular_attspot);376LightColorSum(specular_color, lspecular);377}378}379}380381// Note: these are all naturally clamped by ToRGBA/toRGB.382if (state.setColor1) {383vertex.color0 = final_color.ToRGBA();384vertex.color1 = specular_color.rgb().ToRGB();385} else if (state.addColor1) {386vertex.color0 = (final_color + specular_color).ToRGBA();387} else {388vertex.color0 = final_color.ToRGBA();389}390}391392void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords &worldnormal, const State &state) {393#ifdef _M_SSE394if (cpu_info.bSSE4_1) {395ProcessSIMD<true>(vertex, worldpos, worldnormal, state);396return;397}398#endif399ProcessSIMD<false>(vertex, worldpos, worldnormal, state);400}401402} // namespace403404405