CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Common/IndexGenerator.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include <cstring>1819#include "ppsspp_config.h"2021#include "Common/CPUDetect.h"22#include "Common/Common.h"23#include "Common/Log.h"2425#ifdef _M_SSE26#include <emmintrin.h>27#endif28#if PPSSPP_ARCH(ARM_NEON)2930#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)31#include <arm64_neon.h>32#else33#include <arm_neon.h>34#endif35#endif3637#include "GPU/Common/IndexGenerator.h"3839// Points don't need indexing...40const u8 IndexGenerator::indexedPrimitiveType[7] = {41GE_PRIM_POINTS,42GE_PRIM_LINES,43GE_PRIM_LINES,44GE_PRIM_TRIANGLES,45GE_PRIM_TRIANGLES,46GE_PRIM_TRIANGLES,47GE_PRIM_RECTANGLES,48};4950void IndexGenerator::Setup(u16 *inds) {51this->indsBase_ = inds;52Reset();53}5455void IndexGenerator::AddPrim(int prim, int vertexCount, int indexOffset, bool clockwise) {56switch (prim) {57case GE_PRIM_POINTS: AddPoints(vertexCount, indexOffset); break;58case GE_PRIM_LINES: AddLineList(vertexCount, indexOffset); break;59case GE_PRIM_LINE_STRIP: AddLineStrip(vertexCount, indexOffset); break;60case GE_PRIM_TRIANGLES: AddList(vertexCount, indexOffset, clockwise); break;61case GE_PRIM_TRIANGLE_STRIP: AddStrip(vertexCount, indexOffset, clockwise); break;62case GE_PRIM_TRIANGLE_FAN: AddFan(vertexCount, indexOffset, clockwise); break;63case GE_PRIM_RECTANGLES: AddRectangles(vertexCount, indexOffset); break; // Same64}65}6667void IndexGenerator::AddPoints(int numVerts, int indexOffset) {68u16 *outInds = inds_;69for (int i = 0; i < numVerts; i++)70*outInds++ = indexOffset + i;71inds_ = outInds;72}7374void IndexGenerator::AddList(int numVerts, int indexOffset, bool clockwise) {75u16 *outInds = inds_;76const int v1 = clockwise ? 1 : 2;77const int v2 = clockwise ? 2 : 1;78for (int i = 0; i < numVerts; i += 3) {79*outInds++ = indexOffset + i;80*outInds++ = indexOffset + i + v1;81*outInds++ = indexOffset + i + v2;82}83inds_ = outInds;84}8586alignas(16) static const u16 offsets_clockwise[24] = {870, (u16)(0 + 1), (u16)(0 + 2),88(u16)(1 + 1), 1, (u16)(1 + 2),892, (u16)(2 + 1), (u16)(2 + 2),90(u16)(3 + 1), 3, (u16)(3 + 2),914, (u16)(4 + 1), (u16)(4 + 2),92(u16)(5 + 1), 5, (u16)(5 + 2),936, (u16)(6 + 1), (u16)(6 + 2),94(u16)(7 + 1), 7, (u16)(7 + 2),95};9697alignas(16) static const uint16_t offsets_counter_clockwise[24] = {980, (u16)(0 + 2), (u16)(0 + 1),991, (u16)(1 + 1), (u16)(1 + 2),1002, (u16)(2 + 2), (u16)(2 + 1),1013, (u16)(3 + 1), (u16)(3 + 2),1024, (u16)(4 + 2), (u16)(4 + 1),1035, (u16)(5 + 1), (u16)(5 + 2),1046, (u16)(6 + 2), (u16)(6 + 1),1057, (u16)(7 + 1), (u16)(7 + 2),106};107108void IndexGenerator::AddStrip(int numVerts, int indexOffset, bool clockwise) {109int numTris = numVerts - 2;110if (numTris <= 0) {111return;112}113#ifdef _M_SSE114// In an SSE2 register we can fit 8 16-bit integers.115// However, we need to output a multiple of 3 indices.116// The first such multiple is 24, which means we'll generate 24 indices per cycle,117// which corresponds to 8 triangles. That's pretty cool.118119// We allow ourselves to write some extra indices to avoid the fallback loop.120// That's alright as we're appending to a buffer - they will get overwritten anyway.121__m128i ibase8 = _mm_set1_epi16(indexOffset);122const __m128i *offsets = (const __m128i *)(clockwise ? offsets_clockwise : offsets_counter_clockwise);123__m128i *dst = (__m128i *)inds_;124__m128i offsets0 = _mm_add_epi16(ibase8, _mm_load_si128(offsets));125// A single store is always enough for two triangles, which is a very common case.126_mm_storeu_si128(dst, offsets0);127if (numTris > 2) {128__m128i offsets1 = _mm_add_epi16(ibase8, _mm_load_si128(offsets + 1));129_mm_storeu_si128(dst + 1, offsets1);130if (numTris > 5) {131__m128i offsets2 = _mm_add_epi16(ibase8, _mm_load_si128(offsets + 2));132_mm_storeu_si128(dst + 2, offsets2);133__m128i increment = _mm_set1_epi16(8);134int numChunks = (numTris + 7) >> 3;135for (int i = 1; i < numChunks; i++) {136dst += 3;137offsets0 = _mm_add_epi16(offsets0, increment);138offsets1 = _mm_add_epi16(offsets1, increment);139offsets2 = _mm_add_epi16(offsets2, increment);140_mm_storeu_si128(dst, offsets0);141_mm_storeu_si128(dst + 1, offsets1);142_mm_storeu_si128(dst + 2, offsets2);143}144}145}146inds_ += numTris * 3;147// wind doesn't need to be updated, an even number of triangles have been drawn.148#elif PPSSPP_ARCH(ARM_NEON)149uint16x8_t ibase8 = vdupq_n_u16(indexOffset);150const u16 *offsets = clockwise ? offsets_clockwise : offsets_counter_clockwise;151u16 *dst = inds_;152uint16x8_t offsets0 = vaddq_u16(ibase8, vld1q_u16(offsets));153vst1q_u16(dst, offsets0);154if (numTris > 2) {155uint16x8_t offsets1 = vaddq_u16(ibase8, vld1q_u16(offsets + 8));156vst1q_u16(dst + 8, offsets1);157if (numTris > 5) {158uint16x8_t offsets2 = vaddq_u16(ibase8, vld1q_u16(offsets + 16));159vst1q_u16(dst + 16, offsets2);160uint16x8_t increment = vdupq_n_u16(8);161int numChunks = (numTris + 7) >> 3;162for (int i = 1; i < numChunks; i++) {163dst += 3 * 8;164offsets0 = vaddq_u16(offsets0, increment);165offsets1 = vaddq_u16(offsets1, increment);166offsets2 = vaddq_u16(offsets2, increment);167vst1q_u16(dst, offsets0);168vst1q_u16(dst + 8, offsets1);169vst1q_u16(dst + 16, offsets2);170}171}172}173inds_ += numTris * 3;174#else175// Slow fallback loop.176int wind = clockwise ? 1 : 2;177int ibase = indexOffset;178size_t numPairs = numTris / 2;179u16 *outInds = inds_;180while (numPairs > 0) {181*outInds++ = ibase;182*outInds++ = ibase + wind;183*outInds++ = ibase + (wind ^ 3);184*outInds++ = ibase + 1;185*outInds++ = ibase + 1 + (wind ^ 3);186*outInds++ = ibase + 1 + wind;187ibase += 2;188numPairs--;189}190if (numTris & 1) {191*outInds++ = ibase;192*outInds++ = ibase + wind;193wind ^= 3; // toggle between 1 and 2194*outInds++ = ibase + wind;195}196inds_ = outInds;197#endif198}199200void IndexGenerator::AddFan(int numVerts, int indexOffset, bool clockwise) {201const int numTris = numVerts - 2;202u16 *outInds = inds_;203const int v1 = clockwise ? 1 : 2;204const int v2 = clockwise ? 2 : 1;205for (int i = 0; i < numTris; i++) {206*outInds++ = indexOffset;207*outInds++ = indexOffset + i + v1;208*outInds++ = indexOffset + i + v2;209}210inds_ = outInds;211}212213//Lines214void IndexGenerator::AddLineList(int numVerts, int indexOffset) {215u16 *outInds = inds_;216numVerts &= ~1;217for (int i = 0; i < numVerts; i += 2) {218*outInds++ = indexOffset + i;219*outInds++ = indexOffset + i + 1;220}221inds_ = outInds;222}223224void IndexGenerator::AddLineStrip(int numVerts, int indexOffset) {225const int numLines = numVerts - 1;226u16 *outInds = inds_;227for (int i = 0; i < numLines; i++) {228*outInds++ = indexOffset + i;229*outInds++ = indexOffset + i + 1;230}231inds_ = outInds;232}233234void IndexGenerator::AddRectangles(int numVerts, int indexOffset) {235u16 *outInds = inds_;236//rectangles always need 2 vertices, disregard the last one if there's an odd number237numVerts = numVerts & ~1;238for (int i = 0; i < numVerts; i += 2) {239*outInds++ = indexOffset + i;240*outInds++ = indexOffset + i + 1;241}242inds_ = outInds;243}244245template <class ITypeLE>246void IndexGenerator::TranslatePoints(int numInds, const ITypeLE *inds, int indexOffset) {247u16 *outInds = inds_;248for (int i = 0; i < numInds; i++)249*outInds++ = indexOffset + inds[i];250inds_ = outInds;251}252253template <class ITypeLE>254void IndexGenerator::TranslateLineList(int numInds, const ITypeLE *inds, int indexOffset) {255u16 *outInds = inds_;256numInds = numInds & ~1;257for (int i = 0; i < numInds; i += 2) {258*outInds++ = indexOffset + inds[i];259*outInds++ = indexOffset + inds[i + 1];260}261inds_ = outInds;262}263264template <class ITypeLE>265void IndexGenerator::TranslateLineStrip(int numInds, const ITypeLE *inds, int indexOffset) {266int numLines = numInds - 1;267u16 *outInds = inds_;268for (int i = 0; i < numLines; i++) {269*outInds++ = indexOffset + inds[i];270*outInds++ = indexOffset + inds[i + 1];271}272inds_ = outInds;273}274275template <class ITypeLE>276void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {277// We only bother doing this minor optimization in triangle list, since it's by far the most278// common operation that can benefit.279if (sizeof(ITypeLE) == sizeof(inds_[0]) && indexOffset == 0 && clockwise) {280memcpy(inds_, inds, numInds * sizeof(ITypeLE));281inds_ += numInds;282} else {283u16 *outInds = inds_;284int numTris = numInds / 3; // Round to whole triangles285numInds = numTris * 3;286const int v1 = clockwise ? 1 : 2;287const int v2 = clockwise ? 2 : 1;288// TODO: This can actually be SIMD-d, although will need complex shuffles if clockwise.289for (int i = 0; i < numInds; i += 3) {290*outInds++ = indexOffset + inds[i];291*outInds++ = indexOffset + inds[i + v1];292*outInds++ = indexOffset + inds[i + v2];293}294inds_ = outInds;295}296}297298template <class ITypeLE>299void IndexGenerator::TranslateStrip(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {300int wind = clockwise ? 1 : 2;301int numTris = numInds - 2;302u16 *outInds = inds_;303for (int i = 0; i < numTris; i++) {304*outInds++ = indexOffset + inds[i];305*outInds++ = indexOffset + inds[i + wind];306wind ^= 3; // Toggle between 1 and 2307*outInds++ = indexOffset + inds[i + wind];308}309inds_ = outInds;310}311312template <class ITypeLE>313void IndexGenerator::TranslateFan(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {314if (numInds <= 0) return;315int numTris = numInds - 2;316u16 *outInds = inds_;317const int v1 = clockwise ? 1 : 2;318const int v2 = clockwise ? 2 : 1;319for (int i = 0; i < numTris; i++) {320*outInds++ = indexOffset + inds[0];321*outInds++ = indexOffset + inds[i + v1];322*outInds++ = indexOffset + inds[i + v2];323}324inds_ = outInds;325}326327template <class ITypeLE>328inline void IndexGenerator::TranslateRectangles(int numInds, const ITypeLE *inds, int indexOffset) {329u16 *outInds = inds_;330//rectangles always need 2 vertices, disregard the last one if there's an odd number331numInds = numInds & ~1;332for (int i = 0; i < numInds; i += 2) {333*outInds++ = indexOffset + inds[i];334*outInds++ = indexOffset + inds[i+1];335}336inds_ = outInds;337}338339// Could template this too, but would have to define in header.340void IndexGenerator::TranslatePrim(int prim, int numInds, const u8 *inds, int indexOffset, bool clockwise) {341switch (prim) {342case GE_PRIM_POINTS: TranslatePoints<u8>(numInds, inds, indexOffset); break;343case GE_PRIM_LINES: TranslateLineList<u8>(numInds, inds, indexOffset); break;344case GE_PRIM_LINE_STRIP: TranslateLineStrip<u8>(numInds, inds, indexOffset); break;345case GE_PRIM_TRIANGLES: TranslateList<u8>(numInds, inds, indexOffset, clockwise); break;346case GE_PRIM_TRIANGLE_STRIP: TranslateStrip<u8>(numInds, inds, indexOffset, clockwise); break;347case GE_PRIM_TRIANGLE_FAN: TranslateFan<u8>(numInds, inds, indexOffset, clockwise); break;348case GE_PRIM_RECTANGLES: TranslateRectangles<u8>(numInds, inds, indexOffset); break; // Same349}350}351352void IndexGenerator::TranslatePrim(int prim, int numInds, const u16_le *inds, int indexOffset, bool clockwise) {353switch (prim) {354case GE_PRIM_POINTS: TranslatePoints<u16_le>(numInds, inds, indexOffset); break;355case GE_PRIM_LINES: TranslateLineList<u16_le>(numInds, inds, indexOffset); break;356case GE_PRIM_LINE_STRIP: TranslateLineStrip<u16_le>(numInds, inds, indexOffset); break;357case GE_PRIM_TRIANGLES: TranslateList<u16_le>(numInds, inds, indexOffset, clockwise); break;358case GE_PRIM_TRIANGLE_STRIP: TranslateStrip<u16_le>(numInds, inds, indexOffset, clockwise); break;359case GE_PRIM_TRIANGLE_FAN: TranslateFan<u16_le>(numInds, inds, indexOffset, clockwise); break;360case GE_PRIM_RECTANGLES: TranslateRectangles<u16_le>(numInds, inds, indexOffset); break; // Same361}362}363364void IndexGenerator::TranslatePrim(int prim, int numInds, const u32_le *inds, int indexOffset, bool clockwise) {365switch (prim) {366case GE_PRIM_POINTS: TranslatePoints<u32_le>(numInds, inds, indexOffset); break;367case GE_PRIM_LINES: TranslateLineList<u32_le>(numInds, inds, indexOffset); break;368case GE_PRIM_LINE_STRIP: TranslateLineStrip<u32_le>(numInds, inds, indexOffset); break;369case GE_PRIM_TRIANGLES: TranslateList<u32_le>(numInds, inds, indexOffset, clockwise); break;370case GE_PRIM_TRIANGLE_STRIP: TranslateStrip<u32_le>(numInds, inds, indexOffset, clockwise); break;371case GE_PRIM_TRIANGLE_FAN: TranslateFan<u32_le>(numInds, inds, indexOffset, clockwise); break;372case GE_PRIM_RECTANGLES: TranslateRectangles<u32_le>(numInds, inds, indexOffset); break; // Same373}374}375376377