CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Common/TextureScalerCommon.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include <cstddef>
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <cmath>

#include "GPU/Common/TextureScalerCommon.h"

#include "Core/Config.h"
#include "Common/Common.h"
#include "Common/Log.h"
#include "Common/CommonFuncs.h"
#include "Common/Thread/ParallelLoop.h"
#include "Core/ThreadPools.h"
#include "Common/CPUDetect.h"
#include "ext/xbrz/xbrz.h"

#if defined(_M_SSE)
#include <emmintrin.h>
#include <smmintrin.h>
#endif

// Report the time and throughput for each larger scaling operation in the log
//#define SCALING_MEASURE_TIME

//#define DEBUG_SCALER_OUTPUT

#ifdef SCALING_MEASURE_TIME
#include "Common/TimeUtil.h"
#endif

/////////////////////////////////////// Helper Functions (mostly math for parallelization)

namespace {
//////////////////////////////////////////////////////////////////// Various image processing

// Channel extraction from a packed 32-bit pixel (R in the lowest byte, A in the highest).
#define R(_col) ((_col>> 0)&0xFF)
#define G(_col) ((_col>> 8)&0xFF)
#define B(_col) ((_col>>16)&0xFF)
#define A(_col) ((_col>>24)&0xFF)

// L1 distance between two pixels: sum of absolute per-channel differences (range 0..1020).
#define DISTANCE(_p1,_p2) ( abs(static_cast<int>(static_cast<int>(R(_p1))-R(_p2))) + abs(static_cast<int>(static_cast<int>(G(_p1))-G(_p2))) \
+ abs(static_cast<int>(static_cast<int>(B(_p1))-B(_p2))) + abs(static_cast<int>(static_cast<int>(A(_p1))-A(_p2))) )

// this is sadly much faster than an inline function with a loop, at least in VC10
// Per-channel blend of two pixels using the two byte weights in _factors
// (the callers below always pass weights that sum to 255).
#define MIX_PIXELS(_p0, _p1, _factors) \
( (R(_p0)*(_factors)[0] + R(_p1)*(_factors)[1])/255 << 0 ) | \
( (G(_p0)*(_factors)[0] + G(_p1)*(_factors)[1])/255 << 8 ) | \
( (B(_p0)*(_factors)[0] + B(_p1)*(_factors)[1])/255 << 16 ) | \
( (A(_p0)*(_factors)[0] + A(_p1)*(_factors)[1])/255 << 24 )

// Tile size used for cache-friendly blocked iteration in the helpers below.
#define BLOCK_SIZE 32

// 3x3 convolution with Neumann boundary conditions, parallelizable
// quite slow, could be sped up a lot
// especially handling of separable kernels
//
// Processes rows [l, u) of a width*height image; edge samples are clamped
// (replicated) via the min/max on xx/yy. NOTE(review): 'data' holds scalar
// mask values here (see ScaleHybrid), not packed RGBA — the u32*int product
// is intentional.
void convolve3x3(const u32 *data, u32 *out, const int kernel[3][3], int width, int height, int l, int u) {
	for (int yb = 0; yb < (u - l) / BLOCK_SIZE + 1; ++yb) {
		for (int xb = 0; xb < width / BLOCK_SIZE + 1; ++xb) {
			for (int y = l + yb*BLOCK_SIZE; y < l + (yb + 1)*BLOCK_SIZE && y < u; ++y) {
				for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < width; ++x) {
					int val = 0;
					for (int yoff = -1; yoff <= 1; ++yoff) {
						// Clamp sample coordinates to the image (Neumann boundary).
						int yy = std::max(std::min(y + yoff, height - 1), 0);
						for (int xoff = -1; xoff <= 1; ++xoff) {
							int xx = std::max(std::min(x + xoff, width - 1), 0);
							val += data[yy*width + xx] * kernel[yoff + 1][xoff + 1];
						}
					}
					out[y*width + x] = abs(val);
				}
			}
		}
	}
}

// deposterization: smoothes posterized gradients from low-color-depth (e.g.
// 444, 565, compressed) sources
//
// Horizontal deposterize pass over rows [l, u): for each channel, if a pixel
// equals one neighbor and differs from the other by at most T, blend the two
// neighbors; otherwise keep the channel unchanged. Border columns are copied.
void deposterizeH(const u32 *data, u32 *out, int w, int l, int u) {
	static const int T = 8; // per-channel blend threshold
	for (int y = l; y < u; ++y) {
		for (int x = 0; x < w; ++x) {
			int inpos = y*w + x;
			u32 center = data[inpos];
			if (x == 0 || x == w - 1) {
				// No left/right neighbor pair at the borders — pass through.
				out[y*w + x] = center;
				continue;
			}
			u32 left = data[inpos - 1];
			u32 right = data[inpos + 1];
			out[y*w + x] = 0;
			for (int c = 0; c < 4; ++c) {
				u8 lc = ((left >> c * 8) & 0xFF);
				u8 cc = ((center >> c * 8) & 0xFF);
				u8 rc = ((right >> c * 8) & 0xFF);
				if ((lc != rc) && ((lc == cc && abs((int)((int)rc) - cc) <= T) || (rc == cc && abs((int)((int)lc) - cc) <= T))) {
					// blend this component
					out[y*w + x] |= ((rc + lc) / 2) << (c * 8);
				} else {
					// no change for this component
					out[y*w + x] |= cc << (c * 8);
				}
			}
		}
	}
}

// Vertical deposterize pass, same channel rule as deposterizeH but comparing
// the pixels above/below. Iterates in BLOCK_SIZE column tiles for locality.
void deposterizeV(const u32 *data, u32 *out, int w, int h, int l, int u) {
	static const int T = 8; // per-channel blend threshold
	for (int xb = 0; xb < w / BLOCK_SIZE + 1; ++xb) {
		for (int y = l; y < u; ++y) {
			for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < w; ++x) {
				u32 center = data[y * w + x];
				if (y == 0 || y == h - 1) {
					// No upper/lower neighbor pair at the borders — pass through.
					out[y*w + x] = center;
					continue;
				}
				u32 upper = data[(y - 1) * w + x];
				u32 lower = data[(y + 1) * w + x];
				out[y*w + x] = 0;
				for (int c = 0; c < 4; ++c) {
					u8 uc = ((upper >> c * 8) & 0xFF);
					u8 cc = ((center >> c * 8) & 0xFF);
					u8 lc = ((lower >> c * 8) & 0xFF);
					if ((uc != lc) && ((uc == cc && abs((int)((int)lc) - cc) <= T) || (lc == cc && abs((int)((int)uc) - cc) <= T))) {
						// blend this component
						out[y*w + x] |= ((lc + uc) / 2) << (c * 8);
					} else {
						// no change for this component
						out[y*w + x] |= cc << (c * 8);
					}
				}
			}
		}
	}
}

// generates a distance mask value for each pixel in data
// higher values -> larger distance to the surrounding pixels
//
// Sums the L1 color distance from each pixel to its 8 neighbors over rows
// [l, u); off-image neighbors contribute fixed penalties (1200 per missing
// row, 400 per missing pixel).
void generateDistanceMask(const u32 *data, u32 *out, int width, int height, int l, int u) {
	for (int yb = 0; yb < (u - l) / BLOCK_SIZE + 1; ++yb) {
		for (int xb = 0; xb < width / BLOCK_SIZE + 1; ++xb) {
			for (int y = l + yb*BLOCK_SIZE; y < l + (yb + 1)*BLOCK_SIZE && y < u; ++y) {
				for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < width; ++x) {
					const u32 center = data[y*width + x];
					u32 dist = 0;
					for (int yoff = -1; yoff <= 1; ++yoff) {
						int yy = y + yoff;
						if (yy == height || yy == -1) {
							dist += 1200; // assume distance at borders, usually makes for better result
							continue;
						}
						for (int xoff = -1; xoff <= 1; ++xoff) {
							if (yoff == 0 && xoff == 0) continue;
							int xx = x + xoff;
							if (xx == width || xx == -1) {
								dist += 400; // assume distance at borders, usually makes for better result
								continue;
							}
							dist += DISTANCE(data[yy*width + xx], center);
						}
					}
					out[y*width + x] = dist;
				}
			}
		}
	}
}

// mix two images based on a mask
//
// data = data*(1-m) + source*m, where m = min(mask, maskmax)/maskmax per
// pixel, over rows [l, u).
void mix(u32 *data, const u32 *source, const u32 *mask, u32 maskmax, int width, int l, int u) {
	for (int y = l; y < u; ++y) {
		for (int x = 0; x < width; ++x) {
			int pos = y*width + x;
			u8 mixFactors[2] = { 0, static_cast<u8>((std::min(mask[pos], maskmax) * 255) / maskmax) };
			mixFactors[0] = 255 - mixFactors[1];
			data[pos] = MIX_PIXELS(data[pos], source[pos], mixFactors);
			if (A(source[pos]) == 0) data[pos] = data[pos] & 0x00FFFFFF; // xBRZ always does a better job with hard alpha
		}
	}
}

//////////////////////////////////////////////////////////////////// Bicubic scaling

// Code for the cubic upscaler is pasted below as-is.
// WARNING: different codestyle.

// NOTE: in several places memcpy is used instead of type punning,
// to avoid strict aliasing problems. This may produce suboptimal
// code, especially on MSVC.

// Loads a sample (4 bytes) from image into 'output'.
// wrap_mode: 0 = wrap, 1 = clamp, 2 = zero for out-of-bounds coordinates.
static void load_sample(ptrdiff_t w, ptrdiff_t h, ptrdiff_t s, const u8 *pixels, int wrap_mode, ptrdiff_t x, ptrdiff_t y, u8 *output) {
	// Check if the sample is inside. NOTE: for b>=0
	// the expression (UNSIGNED)a<(UNSIGNED)b is
	// equivalent to a>=0&&a<b.
	static_assert(sizeof(ptrdiff_t) == sizeof(size_t), "Assumes ptrdiff_t same width as size_t");

	if((size_t)x >= (size_t)w || (size_t)y >= (size_t)h) {
		switch(wrap_mode) {
		case 0: // Wrap
			if(!((w & (w-1)) | (h & (h-1)))) {
				// Both w and h are powers of 2.
				x &= w-1;
				y &= h-1;
			} else {
				// For e.g. 1x1 images we might need to wrap several
				// times, hence 'while', instead of 'if'. Probably
				// still faster, than modulo.
				while(x < 0) x += w;
				while(y < 0) y += h;
				while(x >= w) x -= w;
				while(y >= h) y -= h;
			}
			break;
		case 1: // Clamp
			if(x < 0) x = 0;
			if(y < 0) y = 0;
			if(x >= w) x = w-1;
			if(y >= h) y = h-1;
			break;
		case 2: // Zero
			memset(output, 0, 4);
			return;
		}
	}
	memcpy(output, pixels + s*y + 4*x, 4);
}

// Output tile edge length (in destination pixels) processed per call.
#define BLOCK 8

// Precomputes per-row/column filter weights (cx/cy), source offsets (lx/ly,
// lx0/ly0), sampled extents (sx/sy), and copies the needed source pixels into
// the local 'src' buffer for one BLOCKxBLOCK destination tile at (x0, y0).
static void init_block(
	ptrdiff_t w, ptrdiff_t h,
	ptrdiff_t src_stride, const u8 *src_pixels,
	int wrap_mode, ptrdiff_t factor, float B, float C,
	ptrdiff_t x0, ptrdiff_t y0,
	float (*cx)[4], float (*cy)[4],
	ptrdiff_t *lx, ptrdiff_t *ly, ptrdiff_t *lx0, ptrdiff_t *ly0, ptrdiff_t *sx, ptrdiff_t *sy,
	u8 (*src)[(BLOCK+4)*4]) {
	// Precomputed coefficients for pixel weights
	// in the Mitchell-Netravali filter:
	// output = SUM(wij*pixel[i]*t^j)
	// where t is distance from pixel[1] to the
	// sampling position.
	float w00 = B/6.0f     , w01 = -C-0.5f*B, w02 = 2.0f*C+0.5f*B     , w03 = -C-B/6.0f     ;
	float w10 = 1.0f-B/3.0f,/*w11 = 0.0f ,*/ w12 = C+2.0f*B-3.0f      , w13 = -C-1.5f*B+2.0f;
	float w20 = B/6.0f     , w21 = C+0.5f*B, w22 = -2.0f*C-2.5f*B+3.0f, w23 = C+1.5f*B-2.0f ;
	float /*w30 = 0.0f     , w31 = 0.0f ,*/  w32 = -C                 , w33 = C+B/6.0f      ;
	// Express the sampling position as a rational
	// number num/den-1 (off by one, so that num is
	// always positive, since the C language does
	// not do Euclidean division). Sampling points
	// for both src and dst are assumed at pixel centers.
	ptrdiff_t den = 2*factor;
	float inv_den = 1.0f/(float)den;
	// dir == 0 handles the x direction, dir == 1 the y direction.
	for(int dir = 0; dir < 2; ++dir) {
		ptrdiff_t num = (dir ? 2*y0+1+factor : 2*x0+1+factor);
		ptrdiff_t *l = (dir ? ly : lx), *l0 = (dir ? ly0 : lx0), *s = (dir ? sy : sx);
		float (*c)[4] = (dir ? cy : cx);
		(*l0) = num/den-2;
		num = num%den;
		for(ptrdiff_t i = 0, j = 0; i < BLOCK; ++i) {
			l[i] = j; // i-th dst pixel accesses src pixels (l0+l[i])..(l0+l[i]+3) in {x|y} direction.
			float t = (float)num*inv_den; // Fractional part of the sampling position.
			// Write out pixel weights.
			c[i][0] = ((w03*t+w02)*t +w01 )*t +w00 ;
			c[i][1] = ((w13*t+w12)*t/*+w11*/)*t +w10 ;
			c[i][2] = ((w23*t+w22)*t +w21 )*t +w20 ;
			c[i][3] = ((w33*t+w32)*t/*+w31*/)*t/*+w30*/;
			// Increment the sampling position.
			if((num += 2) >= den) {num -= den; j += 1;}
		}
		(*s) = l[BLOCK-1]+4; // Total sampled src pixels in {x|y} direction.
	}
	// Get a local copy of the source pixels.
	if((*lx0) >=0 && (*ly0) >= 0 && *lx0 + (*sx) <= w && *ly0 + (*sy) <= h) {
		// Fast path: the whole sampled window is inside the image.
		for(ptrdiff_t iy = 0; iy < (*sy); ++iy)
			memcpy(src[iy], src_pixels+src_stride*((*ly0) + iy) + 4*(*lx0), (size_t)(4*(*sx)));
	}
	else {
		// Slow path: per-sample loads with boundary handling.
		for(ptrdiff_t iy = 0; iy < (*sy); ++iy) for(ptrdiff_t ix = 0; ix < (*sx); ++ix)
			load_sample(w, h, src_stride, src_pixels, wrap_mode, (*lx0) + ix, (*ly0) + iy, src[iy] + 4*ix);
	}
}

// Scalar reference implementation: upscales one BLOCKxBLOCK destination tile
// at (x0, y0) using a separable 4-tap cubic filter (horizontal then vertical
// pass over float buffers), writing packed pixels to dst_pixels.
static void upscale_block_c(
	ptrdiff_t w, ptrdiff_t h,
	ptrdiff_t src_stride, const u8 *src_pixels,
	int wrap_mode, ptrdiff_t factor, float B, float C,
	ptrdiff_t x0, ptrdiff_t y0,
	u8 *dst_pixels) {
	float cx[BLOCK][4], cy[BLOCK][4];
	ptrdiff_t lx[BLOCK], ly[BLOCK], lx0, ly0, sx, sy;
	u8 src[BLOCK+4][(BLOCK+4)*4];
	float buf[2][BLOCK+4][BLOCK+4][4];
	init_block(
		w, h, src_stride, src_pixels, wrap_mode, factor, B, C, x0, y0,
		cx, cy, lx, ly, &lx0, &ly0, &sx, &sy, src);
	// Unpack source pixels.
	for(ptrdiff_t iy = 0; iy < sy; ++iy)
		for(ptrdiff_t ix = 0; ix < sx; ++ix)
			for(ptrdiff_t k = 0; k < 4; ++k)
				buf[0][iy][ix][k] = (float)(int)src[iy][4*ix + k];
	// Horizontal pass.
	for(ptrdiff_t ix = 0; ix < BLOCK; ++ix) {
#define S(i) (buf[0][iy][lx[ix] + i][k])
		float C0 = cx[ix][0], C1 = cx[ix][1], C2 = cx[ix][2], C3 = cx[ix][3];
		for(ptrdiff_t iy = 0; iy < sy; ++iy)
			for(ptrdiff_t k = 0; k < 4; ++k)
				buf[1][iy][ix][k] = S(0)*C0 + S(1)*C1 + S(2)*C2 + S(3)*C3;
#undef S
	}
	// Vertical pass.
	for(ptrdiff_t iy = 0; iy < BLOCK; ++iy) {
#define S(i) (buf[1][ly[iy]+i][ix][k])
		float C0 = cy[iy][0], C1 = cy[iy][1], C2 = cy[iy][2], C3 = cy[iy][3];
		for(ptrdiff_t ix = 0; ix < BLOCK; ++ix)
			for(ptrdiff_t k = 0; k < 4; ++k)
				buf[0][iy][ix][k] = S(0)*C0 + S(1)*C1 + S(2)*C2 + S(3)*C3;
#undef S
	}
	// Pack destination pixels.
	for(ptrdiff_t iy = 0; iy < BLOCK; ++iy)
		for(ptrdiff_t ix = 0; ix < BLOCK; ++ix) {
			u8 pixel[4];
			for(ptrdiff_t k = 0; k < 4; ++k) {
				float C = buf[0][iy][ix][k];
				// Clamp to [0, 255]; the !(C>0.0f) form also catches NaN.
				if(!(C>0.0f)) C = 0.0f;
				if(C>255.0f) C = 255.0f;
				pixel[k] = (u8)(int)(C + 0.5f);
			}
			memcpy(dst_pixels + 4*(BLOCK*iy + ix), pixel, 4);
		}
}

#if defined(_M_SSE)

#if defined(__GNUC__)
#define ALIGNED(n) __attribute__((aligned(n)))
#elif defined(_MSC_VER)
#define ALIGNED(n) __declspec(align(n))
#else
// For our use case, ALIGNED is a hint, not a requirement,
// so it's fine to ignore it.
#define ALIGNED(n)
#endif

// SSE2 version of upscale_block_c — same algorithm, processing all four
// channels of a pixel per __m128 lane instead of looping over k.
static void upscale_block_sse2(
	ptrdiff_t w, ptrdiff_t h,
	ptrdiff_t src_stride, const u8 *src_pixels,
	int wrap_mode, ptrdiff_t factor, float B, float C,
	ptrdiff_t x0, ptrdiff_t y0,
	u8 *dst_pixels) {
	float cx[BLOCK][4], cy[BLOCK][4];
	ptrdiff_t lx[BLOCK], ly[BLOCK], lx0, ly0, sx, sy;
	ALIGNED(16) u8 src[BLOCK+4][(BLOCK+4)*4];
	ALIGNED(16) float buf[2][BLOCK+4][BLOCK+4][4];
	init_block(
		w, h, src_stride, src_pixels, wrap_mode, factor, B, C, x0, y0,
		cx, cy, lx, ly, &lx0, &ly0, &sx, &sy, src);
	// Unpack source pixels.
	for(ptrdiff_t iy = 0; iy < sy; ++iy)
		for(ptrdiff_t ix = 0; ix < sx; ++ix) {
			int pixel;
			memcpy(&pixel, src[iy] + 4*ix, 4);
			__m128i C = _mm_cvtsi32_si128(pixel);
			// Interleaving with zeros twice zero-extends each byte to a
			// 32-bit lane.
			C = _mm_unpacklo_epi8(C, _mm_set1_epi32(0));
			C = _mm_unpacklo_epi8(C, _mm_set1_epi32(0));
			_mm_storeu_ps(buf[0][iy][ix], _mm_cvtepi32_ps(C));
		}
	// Horizontal pass.
	for(ptrdiff_t ix = 0; ix < BLOCK; ++ix) {
#define S(i) (buf[0][iy][lx[ix] + i])
		__m128 C0 = _mm_set1_ps(cx[ix][0]),
		       C1 = _mm_set1_ps(cx[ix][1]),
		       C2 = _mm_set1_ps(cx[ix][2]),
		       C3 = _mm_set1_ps(cx[ix][3]);
		for(ptrdiff_t iy = 0; iy < sy; ++iy)
			_mm_storeu_ps(buf[1][iy][ix],
				_mm_add_ps(_mm_mul_ps(_mm_loadu_ps(S(0)), C0),
				_mm_add_ps(_mm_mul_ps(_mm_loadu_ps(S(1)), C1),
				_mm_add_ps(_mm_mul_ps(_mm_loadu_ps(S(2)), C2),
				           _mm_mul_ps(_mm_loadu_ps(S(3)), C3)))));
#undef S
	}
	// Vertical pass.
	for(ptrdiff_t iy = 0; iy < BLOCK; ++iy) {
#define S(i) (buf[1][ly[iy] + i][ix])
		__m128 C0 = _mm_set1_ps(cy[iy][0]),
		       C1 = _mm_set1_ps(cy[iy][1]),
		       C2 = _mm_set1_ps(cy[iy][2]),
		       C3 = _mm_set1_ps(cy[iy][3]);
		for(ptrdiff_t ix = 0; ix < BLOCK; ++ix)
			_mm_storeu_ps(buf[0][iy][ix],
				_mm_add_ps(_mm_mul_ps(_mm_loadu_ps(S(0)), C0),
				_mm_add_ps(_mm_mul_ps(_mm_loadu_ps(S(1)), C1),
				_mm_add_ps(_mm_mul_ps(_mm_loadu_ps(S(2)), C2),
				           _mm_mul_ps(_mm_loadu_ps(S(3)), C3)))));
#undef S
	}
	// Pack destination pixels.
	for(ptrdiff_t iy = 0; iy < BLOCK; ++iy)
		for(ptrdiff_t ix = 0; ix < BLOCK; ++ix) {
			__m128 C = _mm_loadu_ps(buf[0][iy][ix]);
			// Clamp to [0, 255], round to nearest, and pack back to bytes.
			C = _mm_min_ps(_mm_max_ps(C, _mm_set1_ps(0.0f)), _mm_set1_ps(255.0f));
			C = _mm_add_ps(C, _mm_set1_ps(0.5f));
			__m128i R = _mm_cvttps_epi32(C);
			R = _mm_packus_epi16(R, R);
			R = _mm_packus_epi16(R, R);
			int pixel = _mm_cvtsi128_si32(R);
			memcpy(dst_pixels + 4*(BLOCK*iy+ix), &pixel, 4);
		}
}
#endif // defined(_M_SSE)

// Upscales the destination region [x0, x1) x [y0, y1) (dst coordinates) by
// 'scale' using a (B, C)-parameterized cubic filter, tile by tile.
static void upscale_cubic(
	ptrdiff_t width, ptrdiff_t height, ptrdiff_t src_stride_in_bytes, const void *src_pixels,
	ptrdiff_t dst_stride_in_bytes, void *dst_pixels,
	ptrdiff_t scale, float B, float C, int wrap_mode,
	ptrdiff_t x0, ptrdiff_t y0, ptrdiff_t x1, ptrdiff_t y1) {
	u8 pixels[BLOCK*BLOCK*4]; // one temporary output tile
	for(ptrdiff_t y = y0; y < y1; y+= BLOCK)
		for(ptrdiff_t x = x0; x < x1; x+= BLOCK) {
#if defined(_M_SSE)
			upscale_block_sse2(width, height, src_stride_in_bytes, (const u8*)src_pixels, wrap_mode, scale, B, C, x, y, pixels);
#else
			upscale_block_c   (width, height, src_stride_in_bytes, (const u8*)src_pixels, wrap_mode, scale, B, C, x, y, pixels);
#endif
			// Copy the (possibly partial, at the right/bottom edges) tile out.
			for(ptrdiff_t iy = 0, ny = (y1-y < BLOCK ? y1-y : BLOCK), nx = (x1-x < BLOCK ? x1-x : BLOCK); iy < ny; ++iy)
				memcpy((u8*)dst_pixels + dst_stride_in_bytes*(y+iy) + 4*x, pixels + BLOCK*4*iy, (size_t)(4*nx));
		}
}

// End of pasted cubic upscaler.

// B-spline bicubic (B=1, C=0): soft, never rings. Scales source rows [l, u).
void scaleBicubicBSpline(int factor, const u32 *data, u32 *out, int w, int h, int l, int u) {
	const float B = 1.0f, C = 0.0f;
	const int wrap_mode = 1; // Clamp
	upscale_cubic(
		w, h, w*4, data,
		factor*w*4, out,
		factor, B, C, wrap_mode,
		0, factor*l, factor*w, factor*u);
}

// Catmull-Rom bicubic (B=0, C=0.5): sharper than B-spline. Scales rows [l, u).
void scaleBicubicMitchell(int factor, const u32 *data, u32 *out, int w, int h, int l, int u) {
	const float B = 0.0f, C = 0.5f; // Actually, Catmull-Rom
	const int wrap_mode = 1; // Clamp
	upscale_cubic(
		w, h, w*4, data,
		factor*w*4, out,
		factor, B, C, wrap_mode,
		0, factor*l, factor*w, factor*u);
}

//////////////////////////////////////////////////////////////////// Bilinear scaling

// Byte blend weights per scale factor (row = factor-2) for the output pixels
// between a source pixel and its neighbor; each pair sums to 255.
const static u8 BILINEAR_FACTORS[4][3][2] = {
	{ {  44, 211 }, {  0,   0 }, { 0,   0 } }, // x2
	{ {  64, 191 }, {  0, 255 }, { 0,   0 } }, // x3
	{ {  77, 178 }, { 26, 229 }, { 0,   0 } }, // x4
	{ { 102, 153 }, { 51, 204 }, { 0, 255 } }, // x5
};
// integral bilinear upscaling by factor f, horizontal part
template<int f>
void bilinearHt(const u32 *data, u32 *out, int w, int l, int u) {
	static_assert(f > 1 && f <= 5, "Bilinear scaling only implemented for factors 2 to 5");
	int outw = w*f;
	for (int y = l; y < u; ++y) {
		for (int x = 0; x < w; ++x) {
			int inpos = y*w + x;
			// Border pixels reuse the center as their missing neighbor.
			u32 left = data[inpos - (x == 0 ? 0 : 1)];
			u32 center = data[inpos];
			u32 right = data[inpos + (x == w - 1 ? 0 : 1)];
			int i = 0;
			for (; i < f / 2 + f % 2; ++i) { // first half of the new pixels + center, hope the compiler unrolls this
				out[y*outw + x*f + i] = MIX_PIXELS(left, center, BILINEAR_FACTORS[f - 2][i]);
			}
			for (; i < f; ++i) { // second half of the new pixels, hope the compiler unrolls this
				out[y*outw + x*f + i] = MIX_PIXELS(right, center, BILINEAR_FACTORS[f - 2][f - 1 - i]);
			}
		}
	}
}
// Runtime dispatch to the templated horizontal pass.
void bilinearH(int factor, const u32 *data, u32 *out, int w, int l, int u) {
	switch (factor) {
	case 2: bilinearHt<2>(data, out, w, l, u); break;
	case 3: bilinearHt<3>(data, out, w, l, u); break;
	case 4: bilinearHt<4>(data, out, w, l, u); break;
	case 5: bilinearHt<5>(data, out, w, l, u); break;
	default: ERROR_LOG(Log::G3D, "Bilinear upsampling only implemented for factors 2 to 5");
	}
}
// integral bilinear upscaling by factor f, vertical part
// gl/gu == global lower and upper bound
template<int f>
void bilinearVt(const u32 *data, u32 *out, int w, int gl, int gu, int l, int u) {
	static_assert(f>1 && f <= 5, "Bilinear scaling only implemented for 2x, 3x, 4x, and 5x");
	int outw = w*f; // input here is already horizontally scaled
	for (int xb = 0; xb < outw / BLOCK_SIZE + 1; ++xb) {
		for (int y = l; y < u; ++y) {
			// Clamp neighbor rows at the global image bounds, not the slice bounds.
			u32 uy = y - (y == gl ? 0 : 1);
			u32 ly = y + (y == gu - 1 ? 0 : 1);
			for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < outw; ++x) {
				u32 upper = data[uy * outw + x];
				u32 center = data[y * outw + x];
				u32 lower = data[ly * outw + x];
				int i = 0;
				for (; i < f / 2 + f % 2; ++i) { // first half of the new pixels + center, hope the compiler unrolls this
					out[(y*f + i)*outw + x] = MIX_PIXELS(upper, center, BILINEAR_FACTORS[f - 2][i]);
				}
				for (; i < f; ++i) { // second half of the new pixels, hope the compiler unrolls this
					out[(y*f + i)*outw + x] = MIX_PIXELS(lower, center, BILINEAR_FACTORS[f - 2][f - 1 - i]);
				}
			}
		}
	}
}
// Runtime dispatch to the templated vertical pass.
void bilinearV(int factor, const u32 *data, u32 *out, int w, int gl, int gu, int l, int u) {
	switch (factor) {
	case 2: bilinearVt<2>(data, out, w, gl, gu, l, u); break;
	case 3: bilinearVt<3>(data, out, w, gl, gu, l, u); break;
	case 4: bilinearVt<4>(data, out, w, gl, gu, l, u); break;
	case 5: bilinearVt<5>(data, out, w, gl, gu, l, u); break;
	default: ERROR_LOG(Log::G3D, "Bilinear upsampling only implemented for factors 2 to 5");
	}
}

#undef BLOCK_SIZE
#undef MIX_PIXELS
#undef DISTANCE
#undef R
#undef G
#undef B
#undef A

#ifdef DEBUG_SCALER_OUTPUT

// used for debugging texture scaling (writing textures to files)
static int g_imgCount = 0;
void dbgPPM(int w, int h, u8* pixels, const char* prefix = "dbg") { // 3 component RGB
	char fn[32];
	snprintf(fn, 32, "%s%04d.ppm", prefix, g_imgCount++);
	FILE *fp = fopen(fn, "wb");
	fprintf(fp, "P6\n%d %d\n255\n", w, h);
	for (int j = 0; j < h; ++j) {
		for (int i = 0; i < w; ++i) {
			static unsigned char color[3];
			color[0] = pixels[(j*w + i) * 4 + 0]; /* red */
			color[1] = pixels[(j*w + i) * 4 + 1]; /* green */
			color[2] = pixels[(j*w + i) * 4 + 2]; /* blue */
			fwrite(color, 1, 3, fp);
		}
	}
	fclose(fp);
}
void dbgPGM(int w, int h, u32* pixels, const char* prefix = "dbg") { // 1 component
	char fn[32];
	snprintf(fn, 32, "%s%04d.pgm", prefix, g_imgCount++);
	FILE *fp = fopen(fn, "wb");
	// NOTE(review): the PGM spec requires maxval <= 65535, so "65536" here is
	// out of spec; also 16-bit PGM samples are defined as big-endian while
	// this writes the low two bytes in host order — fine for ad-hoc debugging
	// with tolerant viewers, but worth confirming if the files misrender.
	fprintf(fp, "P5\n%d %d\n65536\n", w, h);
	for (int j = 0; j < h; ++j) {
		for (int i = 0; i < w; ++i) {
			fwrite((pixels + (j*w + i)), 1, 2, fp);
		}
	}
	fclose(fp);
}

#endif

}

/////////////////////////////////////// Texture Scaler

TextureScalerCommon::TextureScalerCommon() {
	// initBicubicWeights() used to be here.
}

TextureScalerCommon::~TextureScalerCommon() {
}

// Returns true if every pixel in the buffer equals the first one.
bool TextureScalerCommon::IsEmptyOrFlat(const u32 *data, int pixels) {
	u32 ref = data[0];
	// TODO: SIMD-ify this (although, for most textures we'll get out very early)
	for (int i = 1; i < pixels; ++i) {
		if (data[i] != ref)
			return false;
	}
	return true;
}

// Like ScaleInto, but also produces output for flat textures (solid fill)
// instead of skipping them — Vulkan needs the full-size image up front.
void TextureScalerCommon::ScaleAlways(u32 *out, u32 *src, int width, int height, int *scaledWidth, int *scaledHeight, int factor) {
	if (IsEmptyOrFlat(src, width * height)) {
		// This means it was a flat texture.  Vulkan wants the size up front, so we need to make it happen.
		u32 pixel = *src;

		*scaledWidth = width * factor;
		*scaledHeight = height * factor;

		size_t pixelCount = *scaledWidth * *scaledHeight;

		// ABCD.  If A = D, and AB = CD, then they must all be equal (B = C, etc.)
		if ((pixel & 0x000000FF) == (pixel >> 24) && (pixel & 0x0000FFFF) == (pixel >> 16)) {
			// All four bytes identical — a plain memset suffices.
			memset(out, pixel & 0xFF, pixelCount * sizeof(u32));
		} else {
			// Let's hope this is vectorized.
			// NOTE(review): 'i' is int while pixelCount is size_t — fine for
			// realistic texture sizes, but a signed/unsigned mismatch.
			for (int i = 0; i < pixelCount; ++i) {
				out[i] = pixel;
			}
		}
	} else {
		ScaleInto(out, src, width, height, scaledWidth, scaledHeight, factor);
	}
}

// Scales src (width x height) by 'factor' into outputBuf using the algorithm
// selected in g_Config.iTexScalingType, optionally deposterizing first.
// Writes the resulting dimensions to scaledWidth/scaledHeight.
bool TextureScalerCommon::ScaleInto(u32 *outputBuf, u32 *src, int width, int height, int *scaledWidth, int *scaledHeight, int factor) {
#ifdef SCALING_MEASURE_TIME
	double t_start = time_now_d();
#endif

	u32 *inputBuf = src;

	// deposterize
	if (g_Config.bTexDeposterize) {
		bufDeposter.resize(width * height);
		DePosterize(inputBuf, bufDeposter.data(), width, height);
		inputBuf = bufDeposter.data();
	}

	// scale
	switch (g_Config.iTexScalingType) {
	case XBRZ:
		ScaleXBRZ(factor, inputBuf, outputBuf, width, height);
		break;
	case HYBRID:
		ScaleHybrid(factor, inputBuf, outputBuf, width, height);
		break;
	case BICUBIC:
		ScaleBicubicMitchell(factor, inputBuf, outputBuf, width, height);
		break;
	case HYBRID_BICUBIC:
		ScaleHybrid(factor, inputBuf, outputBuf, width, height, true);
		break;
	default:
		ERROR_LOG(Log::G3D, "Unknown scaling type: %d", g_Config.iTexScalingType);
	}

	// update values accordingly
	*scaledWidth = width * factor;
	*scaledHeight = height * factor;

#ifdef SCALING_MEASURE_TIME
	if (*scaledWidth* *scaledHeight > 64 * 64 * factor*factor) {
		double t = time_now_d() - t_start;
		NOTICE_LOG(Log::G3D, "TextureScaler: processed %9d pixels in %6.5lf seconds. (%9.2lf Mpixels/second)",
			*scaledWidth * *scaledHeight, t, (*scaledWidth * *scaledHeight) / (t * 1000 * 1000));
	}
#endif

	return true;
}

// Scales the texture pointed to by 'data'; on success repoints 'data' at the
// internally-owned upscaled buffer (bufOutput) and returns true. Returns
// false (leaving 'data' untouched) for flat/empty textures.
bool TextureScalerCommon::Scale(u32* &data, int width, int height, int *scaledWidth, int *scaledHeight, int factor) {
	// prevent processing empty or flat textures (this happens a lot in some games)
	// doesn't hurt the standard case, will be very quick for textures with actual texture
	if (IsEmptyOrFlat(data, width*height)) {
		DEBUG_LOG(Log::G3D, "TextureScaler: early exit -- empty/flat texture");
		return false;
	}

	bufOutput.resize(width * height * (factor * factor)); // used to store the upscaled image
	u32 *outputBuf = bufOutput.data();

	if (ScaleInto(outputBuf, data, width, height, scaledWidth, scaledHeight, factor)) {
		data = outputBuf;
		return true;
	}
	return false;
}

// Minimum number of image rows given to each worker in the parallel loops.
const int MIN_LINES_PER_THREAD = 4;

void TextureScalerCommon::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) {
	xbrz::ScalerCfg cfg;
	ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
}

// Separable bilinear upscale: horizontal pass into bufTmp1, vertical into dest.
void TextureScalerCommon::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) {
	bufTmp1.resize(width * height * factor); // holds the horizontally-scaled intermediate
	u32 *tmpBuf = bufTmp1.data();
	ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
	ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
}

void TextureScalerCommon::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) {
	ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
}

void TextureScalerCommon::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) {
	ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
}

void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) {
	// Basic algorithm:
	// 1) determine a feature mask C based on a sobel-ish filter + splatting, and upscale that mask bilinearly
	// 2) generate 2 scaled images: A - using Bilinear filtering, B - using xBRZ
	// 3) output = A*C + B*(1-C)

	const static int KERNEL_SPLAT[3][3] = {
		{ 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }
	};

	bufTmp1.resize(width*height);
	bufTmp2.resize(width*height*factor*factor);
	bufTmp3.resize(width*height*factor*factor);

	ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
	ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
	ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height);
	// mask C is now in bufTmp3

	ScaleXBRZ(factor, source, bufTmp2.data(), width, height);
	// xBRZ upscaled source is in bufTmp2

	if (bicubic) ScaleBicubicBSpline(factor, source, dest, width, height);
	else ScaleBilinear(factor, source, dest, width, height);
	// Upscaled source is in dest

	// Now we can mix it all together
	// The factor 8192 was found through practical testing on a variety of textures
	ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, MIN_LINES_PER_THREAD);
}

// Two full H+V deposterize rounds; bufTmp3 holds the intermediate each round.
void TextureScalerCommon::DePosterize(u32* source, u32* dest, int width, int height) {
	bufTmp3.resize(width*height);
	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
}