CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Common/TextureScalerCommon.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include <cstddef>
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <cmath>

#include "GPU/Common/TextureScalerCommon.h"

#include "Core/Config.h"
#include "Common/Common.h"
#include "Common/Log.h"
#include "Common/CommonFuncs.h"
#include "Common/Thread/ParallelLoop.h"
#include "Core/ThreadPools.h"
#include "Common/CPUDetect.h"
#include "ext/xbrz/xbrz.h"

#if defined(_M_SSE)
#include <emmintrin.h>
#include <smmintrin.h>
#endif

// Report the time and throughput for each larger scaling operation in the log
//#define SCALING_MEASURE_TIME

//#define DEBUG_SCALER_OUTPUT

#ifdef SCALING_MEASURE_TIME
#include "Common/TimeUtil.h"
#endif

/////////////////////////////////////// Helper Functions (mostly math for parallelization)

namespace {
//////////////////////////////////////////////////////////////////// Various image processing

// Channel extraction from a packed 32-bit pixel (R in the lowest byte, A in the highest).
#define R(_col) ((_col>> 0)&0xFF)
#define G(_col) ((_col>> 8)&0xFF)
#define B(_col) ((_col>>16)&0xFF)
#define A(_col) ((_col>>24)&0xFF)

// L1 distance between two pixels: sum of absolute per-channel differences (range 0..1020).
#define DISTANCE(_p1,_p2) ( abs(static_cast<int>(static_cast<int>(R(_p1))-R(_p2))) + abs(static_cast<int>(static_cast<int>(G(_p1))-G(_p2))) \
+ abs(static_cast<int>(static_cast<int>(B(_p1))-B(_p2))) + abs(static_cast<int>(static_cast<int>(A(_p1))-A(_p2))) )

// this is sadly much faster than an inline function with a loop, at least in VC10
// Per-channel blend of two pixels using the two byte weights in _factors
// (the callers below always pass weights that sum to 255).
#define MIX_PIXELS(_p0, _p1, _factors) \
( (R(_p0)*(_factors)[0] + R(_p1)*(_factors)[1])/255 << 0 ) | \
( (G(_p0)*(_factors)[0] + G(_p1)*(_factors)[1])/255 << 8 ) | \
( (B(_p0)*(_factors)[0] + B(_p1)*(_factors)[1])/255 << 16 ) | \
( (A(_p0)*(_factors)[0] + A(_p1)*(_factors)[1])/255 << 24 )

// Tile size used for cache-friendly blocked iteration in the helpers below.
#define BLOCK_SIZE 32

// 3x3 convolution with Neumann boundary conditions, parallelizable
// quite slow, could be sped up a lot
// especially handling of separable kernels
//
// Processes rows [l, u) of a width*height image; edge samples are clamped
// (replicated) via the min/max on xx/yy. NOTE(review): 'data' holds scalar
// mask values here (see ScaleHybrid), not packed RGBA — the u32*int product
// is intentional.
void convolve3x3(const u32 *data, u32 *out, const int kernel[3][3], int width, int height, int l, int u) {
	for (int yb = 0; yb < (u - l) / BLOCK_SIZE + 1; ++yb) {
		for (int xb = 0; xb < width / BLOCK_SIZE + 1; ++xb) {
			for (int y = l + yb*BLOCK_SIZE; y < l + (yb + 1)*BLOCK_SIZE && y < u; ++y) {
				for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < width; ++x) {
					int val = 0;
					for (int yoff = -1; yoff <= 1; ++yoff) {
						// Clamp sample coordinates to the image (Neumann boundary).
						int yy = std::max(std::min(y + yoff, height - 1), 0);
						for (int xoff = -1; xoff <= 1; ++xoff) {
							int xx = std::max(std::min(x + xoff, width - 1), 0);
							val += data[yy*width + xx] * kernel[yoff + 1][xoff + 1];
						}
					}
					out[y*width + x] = abs(val);
				}
			}
		}
	}
}

// deposterization: smoothes posterized gradients from low-color-depth (e.g.
// 444, 565, compressed) sources
//
// Horizontal deposterize pass over rows [l, u): for each channel, if a pixel
// equals one neighbor and differs from the other by at most T, blend the two
// neighbors; otherwise keep the channel unchanged. Border columns are copied.
void deposterizeH(const u32 *data, u32 *out, int w, int l, int u) {
	static const int T = 8; // per-channel blend threshold
	for (int y = l; y < u; ++y) {
		for (int x = 0; x < w; ++x) {
			int inpos = y*w + x;
			u32 center = data[inpos];
			if (x == 0 || x == w - 1) {
				// No left/right neighbor pair at the borders — pass through.
				out[y*w + x] = center;
				continue;
			}
			u32 left = data[inpos - 1];
			u32 right = data[inpos + 1];
			out[y*w + x] = 0;
			for (int c = 0; c < 4; ++c) {
				u8 lc = ((left >> c * 8) & 0xFF);
				u8 cc = ((center >> c * 8) & 0xFF);
				u8 rc = ((right >> c * 8) & 0xFF);
				if ((lc != rc) && ((lc == cc && abs((int)((int)rc) - cc) <= T) || (rc == cc && abs((int)((int)lc) - cc) <= T))) {
					// blend this component
					out[y*w + x] |= ((rc + lc) / 2) << (c * 8);
				} else {
					// no change for this component
					out[y*w + x] |= cc << (c * 8);
				}
			}
		}
	}
}

// Vertical deposterize pass, same channel rule as deposterizeH but comparing
// the pixels above/below. Iterates in BLOCK_SIZE column tiles for locality.
void deposterizeV(const u32 *data, u32 *out, int w, int h, int l, int u) {
	static const int T = 8; // per-channel blend threshold
	for (int xb = 0; xb < w / BLOCK_SIZE + 1; ++xb) {
		for (int y = l; y < u; ++y) {
			for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < w; ++x) {
				u32 center = data[y * w + x];
				if (y == 0 || y == h - 1) {
					// No upper/lower neighbor pair at the borders — pass through.
					out[y*w + x] = center;
					continue;
				}
				u32 upper = data[(y - 1) * w + x];
				u32 lower = data[(y + 1) * w + x];
				out[y*w + x] = 0;
				for (int c = 0; c < 4; ++c) {
					u8 uc = ((upper >> c * 8) & 0xFF);
					u8 cc = ((center >> c * 8) & 0xFF);
					u8 lc = ((lower >> c * 8) & 0xFF);
					if ((uc != lc) && ((uc == cc && abs((int)((int)lc) - cc) <= T) || (lc == cc && abs((int)((int)uc) - cc) <= T))) {
						// blend this component
						out[y*w + x] |= ((lc + uc) / 2) << (c * 8);
					} else {
						// no change for this component
						out[y*w + x] |= cc << (c * 8);
					}
				}
			}
		}
	}
}

// generates a distance mask value for each pixel in data
// higher values -> larger distance to the surrounding pixels
//
// Sums the L1 color distance from each pixel to its 8 neighbors over rows
// [l, u); off-image neighbors contribute fixed penalties (1200 per missing
// row, 400 per missing pixel).
void generateDistanceMask(const u32 *data, u32 *out, int width, int height, int l, int u) {
	for (int yb = 0; yb < (u - l) / BLOCK_SIZE + 1; ++yb) {
		for (int xb = 0; xb < width / BLOCK_SIZE + 1; ++xb) {
			for (int y = l + yb*BLOCK_SIZE; y < l + (yb + 1)*BLOCK_SIZE && y < u; ++y) {
				for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < width; ++x) {
					const u32 center = data[y*width + x];
					u32 dist = 0;
					for (int yoff = -1; yoff <= 1; ++yoff) {
						int yy = y + yoff;
						if (yy == height || yy == -1) {
							dist += 1200; // assume distance at borders, usually makes for better result
							continue;
						}
						for (int xoff = -1; xoff <= 1; ++xoff) {
							if (yoff == 0 && xoff == 0) continue;
							int xx = x + xoff;
							if (xx == width || xx == -1) {
								dist += 400; // assume distance at borders, usually makes for better result
								continue;
							}
							dist += DISTANCE(data[yy*width + xx], center);
						}
					}
					out[y*width + x] = dist;
				}
			}
		}
	}
}

// mix two images based on a mask
//
// data = data*(1-m) + source*m, where m = min(mask, maskmax)/maskmax per
// pixel, over rows [l, u).
void mix(u32 *data, const u32 *source, const u32 *mask, u32 maskmax, int width, int l, int u) {
	for (int y = l; y < u; ++y) {
		for (int x = 0; x < width; ++x) {
			int pos = y*width + x;
			u8 mixFactors[2] = { 0, static_cast<u8>((std::min(mask[pos], maskmax) * 255) / maskmax) };
			mixFactors[0] = 255 - mixFactors[1];
			data[pos] = MIX_PIXELS(data[pos], source[pos], mixFactors);
			if (A(source[pos]) == 0) data[pos] = data[pos] & 0x00FFFFFF; // xBRZ always does a better job with hard alpha
		}
	}
}

//////////////////////////////////////////////////////////////////// Bicubic scaling

// Code for the cubic upscaler is pasted below as-is.
// WARNING: different codestyle.

// NOTE: in several places memcpy is used instead of type punning,
// to avoid strict aliasing problems. This may produce suboptimal
// code, especially on MSVC.

// Loads a sample (4 bytes) from image into 'output'.
// wrap_mode: 0 = wrap, 1 = clamp, 2 = zero for out-of-bounds coordinates.
static void load_sample(ptrdiff_t w, ptrdiff_t h, ptrdiff_t s, const u8 *pixels, int wrap_mode, ptrdiff_t x, ptrdiff_t y, u8 *output) {
	// Check if the sample is inside. NOTE: for b>=0
	// the expression (UNSIGNED)a<(UNSIGNED)b is
	// equivalent to a>=0&&a<b.
	static_assert(sizeof(ptrdiff_t) == sizeof(size_t), "Assumes ptrdiff_t same width as size_t");

	if((size_t)x >= (size_t)w || (size_t)y >= (size_t)h) {
		switch(wrap_mode) {
		case 0: // Wrap
			if(!((w & (w-1)) | (h & (h-1)))) {
				// Both w and h are powers of 2.
				x &= w-1;
				y &= h-1;
			} else {
				// For e.g. 1x1 images we might need to wrap several
				// times, hence 'while', instead of 'if'. Probably
				// still faster, than modulo.
				while(x < 0) x += w;
				while(y < 0) y += h;
				while(x >= w) x -= w;
				while(y >= h) y -= h;
			}
			break;
		case 1: // Clamp
			if(x < 0) x = 0;
			if(y < 0) y = 0;
			if(x >= w) x = w-1;
			if(y >= h) y = h-1;
			break;
		case 2: // Zero
			memset(output, 0, 4);
			return;
		}
	}
	memcpy(output, pixels + s*y + 4*x, 4);
}

// Output tile edge length (in destination pixels) processed per call.
#define BLOCK 8

// Precomputes per-row/column filter weights (cx/cy), source offsets (lx/ly,
// lx0/ly0), sampled extents (sx/sy), and copies the needed source pixels into
// the local 'src' buffer for one BLOCKxBLOCK destination tile at (x0, y0).
static void init_block(
	ptrdiff_t w, ptrdiff_t h,
	ptrdiff_t src_stride, const u8 *src_pixels,
	int wrap_mode, ptrdiff_t factor, float B, float C,
	ptrdiff_t x0, ptrdiff_t y0,
	float (*cx)[4], float (*cy)[4],
	ptrdiff_t *lx, ptrdiff_t *ly, ptrdiff_t *lx0, ptrdiff_t *ly0, ptrdiff_t *sx, ptrdiff_t *sy,
	u8 (*src)[(BLOCK+4)*4]) {
	// Precomputed coefficients for pixel weights
	// in the Mitchell-Netravali filter:
	// output = SUM(wij*pixel[i]*t^j)
	// where t is distance from pixel[1] to the
	// sampling position.
	float w00 = B/6.0f     , w01 = -C-0.5f*B, w02 = 2.0f*C+0.5f*B     , w03 = -C-B/6.0f     ;
	float w10 = 1.0f-B/3.0f,/*w11 = 0.0f ,*/ w12 = C+2.0f*B-3.0f      , w13 = -C-1.5f*B+2.0f;
	float w20 = B/6.0f     , w21 = C+0.5f*B, w22 = -2.0f*C-2.5f*B+3.0f, w23 = C+1.5f*B-2.0f ;
	float /*w30 = 0.0f     , w31 = 0.0f ,*/  w32 = -C                 , w33 = C+B/6.0f      ;
	// Express the sampling position as a rational
	// number num/den-1 (off by one, so that num is
	// always positive, since the C language does
	// not do Euclidean division). Sampling points
	// for both src and dst are assumed at pixel centers.
	ptrdiff_t den = 2*factor;
	float inv_den = 1.0f/(float)den;
	// dir == 0 handles the x direction, dir == 1 the y direction.
	for(int dir = 0; dir < 2; ++dir) {
		ptrdiff_t num = (dir ? 2*y0+1+factor : 2*x0+1+factor);
		ptrdiff_t *l = (dir ? ly : lx), *l0 = (dir ? ly0 : lx0), *s = (dir ? sy : sx);
		float (*c)[4] = (dir ? cy : cx);
		(*l0) = num/den-2;
		num = num%den;
		for(ptrdiff_t i = 0, j = 0; i < BLOCK; ++i) {
			l[i] = j; // i-th dst pixel accesses src pixels (l0+l[i])..(l0+l[i]+3) in {x|y} direction.
			float t = (float)num*inv_den; // Fractional part of the sampling position.
			// Write out pixel weights.
			c[i][0] = ((w03*t+w02)*t +w01 )*t +w00 ;
			c[i][1] = ((w13*t+w12)*t/*+w11*/)*t +w10 ;
			c[i][2] = ((w23*t+w22)*t +w21 )*t +w20 ;
			c[i][3] = ((w33*t+w32)*t/*+w31*/)*t/*+w30*/;
			// Increment the sampling position.
			if((num += 2) >= den) {num -= den; j += 1;}
		}
		(*s) = l[BLOCK-1]+4; // Total sampled src pixels in {x|y} direction.
	}
	// Get a local copy of the source pixels.
	if((*lx0) >=0 && (*ly0) >= 0 && *lx0 + (*sx) <= w && *ly0 + (*sy) <= h) {
		// Fast path: the whole sampled window is inside the image.
		for(ptrdiff_t iy = 0; iy < (*sy); ++iy)
			memcpy(src[iy], src_pixels+src_stride*((*ly0) + iy) + 4*(*lx0), (size_t)(4*(*sx)));
	}
	else {
		// Slow path: per-sample loads with boundary handling.
		for(ptrdiff_t iy = 0; iy < (*sy); ++iy) for(ptrdiff_t ix = 0; ix < (*sx); ++ix)
			load_sample(w, h, src_stride, src_pixels, wrap_mode, (*lx0) + ix, (*ly0) + iy, src[iy] + 4*ix);
	}
}

// Scalar reference implementation: upscales one BLOCKxBLOCK destination tile
// at (x0, y0) using a separable 4-tap cubic filter (horizontal then vertical
// pass over float buffers), writing packed pixels to dst_pixels.
static void upscale_block_c(
	ptrdiff_t w, ptrdiff_t h,
	ptrdiff_t src_stride, const u8 *src_pixels,
	int wrap_mode, ptrdiff_t factor, float B, float C,
	ptrdiff_t x0, ptrdiff_t y0,
	u8 *dst_pixels) {
	float cx[BLOCK][4], cy[BLOCK][4];
	ptrdiff_t lx[BLOCK], ly[BLOCK], lx0, ly0, sx, sy;
	u8 src[BLOCK+4][(BLOCK+4)*4];
	float buf[2][BLOCK+4][BLOCK+4][4];
	init_block(
		w, h, src_stride, src_pixels, wrap_mode, factor, B, C, x0, y0,
		cx, cy, lx, ly, &lx0, &ly0, &sx, &sy, src);
	// Unpack source pixels.
	for(ptrdiff_t iy = 0; iy < sy; ++iy)
		for(ptrdiff_t ix = 0; ix < sx; ++ix)
			for(ptrdiff_t k = 0; k < 4; ++k)
				buf[0][iy][ix][k] = (float)(int)src[iy][4*ix + k];
	// Horizontal pass.
	for(ptrdiff_t ix = 0; ix < BLOCK; ++ix) {
#define S(i) (buf[0][iy][lx[ix] + i][k])
		float C0 = cx[ix][0], C1 = cx[ix][1], C2 = cx[ix][2], C3 = cx[ix][3];
		for(ptrdiff_t iy = 0; iy < sy; ++iy)
			for(ptrdiff_t k = 0; k < 4; ++k)
				buf[1][iy][ix][k] = S(0)*C0 + S(1)*C1 + S(2)*C2 + S(3)*C3;
#undef S
	}
	// Vertical pass.
	for(ptrdiff_t iy = 0; iy < BLOCK; ++iy) {
#define S(i) (buf[1][ly[iy]+i][ix][k])
		float C0 = cy[iy][0], C1 = cy[iy][1], C2 = cy[iy][2], C3 = cy[iy][3];
		for(ptrdiff_t ix = 0; ix < BLOCK; ++ix)
			for(ptrdiff_t k = 0; k < 4; ++k)
				buf[0][iy][ix][k] = S(0)*C0 + S(1)*C1 + S(2)*C2 + S(3)*C3;
#undef S
	}
	// Pack destination pixels.
	for(ptrdiff_t iy = 0; iy < BLOCK; ++iy)
		for(ptrdiff_t ix = 0; ix < BLOCK; ++ix) {
			u8 pixel[4];
			for(ptrdiff_t k = 0; k < 4; ++k) {
				float C = buf[0][iy][ix][k];
				// Clamp to [0, 255]; the !(C>0.0f) form also catches NaN.
				if(!(C>0.0f)) C = 0.0f;
				if(C>255.0f) C = 255.0f;
				pixel[k] = (u8)(int)(C + 0.5f);
			}
			memcpy(dst_pixels + 4*(BLOCK*iy + ix), pixel, 4);
		}
}

#if defined(_M_SSE)

#if defined(__GNUC__)
#define ALIGNED(n) __attribute__((aligned(n)))
#elif defined(_MSC_VER)
#define ALIGNED(n) __declspec(align(n))
#else
// For our use case, ALIGNED is a hint, not a requirement,
// so it's fine to ignore it.
#define ALIGNED(n)
#endif

// SSE2 version of upscale_block_c — same algorithm, processing all four
// channels of a pixel per __m128 lane instead of looping over k.
static void upscale_block_sse2(
	ptrdiff_t w, ptrdiff_t h,
	ptrdiff_t src_stride, const u8 *src_pixels,
	int wrap_mode, ptrdiff_t factor, float B, float C,
	ptrdiff_t x0, ptrdiff_t y0,
	u8 *dst_pixels) {
	float cx[BLOCK][4], cy[BLOCK][4];
	ptrdiff_t lx[BLOCK], ly[BLOCK], lx0, ly0, sx, sy;
	ALIGNED(16) u8 src[BLOCK+4][(BLOCK+4)*4];
	ALIGNED(16) float buf[2][BLOCK+4][BLOCK+4][4];
	init_block(
		w, h, src_stride, src_pixels, wrap_mode, factor, B, C, x0, y0,
		cx, cy, lx, ly, &lx0, &ly0, &sx, &sy, src);
	// Unpack source pixels.
	for(ptrdiff_t iy = 0; iy < sy; ++iy)
		for(ptrdiff_t ix = 0; ix < sx; ++ix) {
			int pixel;
			memcpy(&pixel, src[iy] + 4*ix, 4);
			__m128i C = _mm_cvtsi32_si128(pixel);
			// Interleaving with zeros twice zero-extends each byte to a
			// 32-bit lane.
			C = _mm_unpacklo_epi8(C, _mm_set1_epi32(0));
			C = _mm_unpacklo_epi8(C, _mm_set1_epi32(0));
			_mm_storeu_ps(buf[0][iy][ix], _mm_cvtepi32_ps(C));
		}
	// Horizontal pass.
	for(ptrdiff_t ix = 0; ix < BLOCK; ++ix) {
#define S(i) (buf[0][iy][lx[ix] + i])
		__m128 C0 = _mm_set1_ps(cx[ix][0]),
		       C1 = _mm_set1_ps(cx[ix][1]),
		       C2 = _mm_set1_ps(cx[ix][2]),
		       C3 = _mm_set1_ps(cx[ix][3]);
		for(ptrdiff_t iy = 0; iy < sy; ++iy)
			_mm_storeu_ps(buf[1][iy][ix],
				_mm_add_ps(_mm_mul_ps(_mm_loadu_ps(S(0)), C0),
				_mm_add_ps(_mm_mul_ps(_mm_loadu_ps(S(1)), C1),
				_mm_add_ps(_mm_mul_ps(_mm_loadu_ps(S(2)), C2),
				           _mm_mul_ps(_mm_loadu_ps(S(3)), C3)))));
#undef S
	}
	// Vertical pass.
	for(ptrdiff_t iy = 0; iy < BLOCK; ++iy) {
#define S(i) (buf[1][ly[iy] + i][ix])
		__m128 C0 = _mm_set1_ps(cy[iy][0]),
		       C1 = _mm_set1_ps(cy[iy][1]),
		       C2 = _mm_set1_ps(cy[iy][2]),
		       C3 = _mm_set1_ps(cy[iy][3]);
		for(ptrdiff_t ix = 0; ix < BLOCK; ++ix)
			_mm_storeu_ps(buf[0][iy][ix],
				_mm_add_ps(_mm_mul_ps(_mm_loadu_ps(S(0)), C0),
				_mm_add_ps(_mm_mul_ps(_mm_loadu_ps(S(1)), C1),
				_mm_add_ps(_mm_mul_ps(_mm_loadu_ps(S(2)), C2),
				           _mm_mul_ps(_mm_loadu_ps(S(3)), C3)))));
#undef S
	}
	// Pack destination pixels.
	for(ptrdiff_t iy = 0; iy < BLOCK; ++iy)
		for(ptrdiff_t ix = 0; ix < BLOCK; ++ix) {
			__m128 C = _mm_loadu_ps(buf[0][iy][ix]);
			// Clamp to [0, 255], round to nearest, and pack back to bytes.
			C = _mm_min_ps(_mm_max_ps(C, _mm_set1_ps(0.0f)), _mm_set1_ps(255.0f));
			C = _mm_add_ps(C, _mm_set1_ps(0.5f));
			__m128i R = _mm_cvttps_epi32(C);
			R = _mm_packus_epi16(R, R);
			R = _mm_packus_epi16(R, R);
			int pixel = _mm_cvtsi128_si32(R);
			memcpy(dst_pixels + 4*(BLOCK*iy+ix), &pixel, 4);
		}
}
#endif // defined(_M_SSE)

// Upscales the destination region [x0, x1) x [y0, y1) (dst coordinates) by
// 'scale' using a (B, C)-parameterized cubic filter, tile by tile.
static void upscale_cubic(
	ptrdiff_t width, ptrdiff_t height, ptrdiff_t src_stride_in_bytes, const void *src_pixels,
	ptrdiff_t dst_stride_in_bytes, void *dst_pixels,
	ptrdiff_t scale, float B, float C, int wrap_mode,
	ptrdiff_t x0, ptrdiff_t y0, ptrdiff_t x1, ptrdiff_t y1) {
	u8 pixels[BLOCK*BLOCK*4]; // one temporary output tile
	for(ptrdiff_t y = y0; y < y1; y+= BLOCK)
		for(ptrdiff_t x = x0; x < x1; x+= BLOCK) {
#if defined(_M_SSE)
			upscale_block_sse2(width, height, src_stride_in_bytes, (const u8*)src_pixels, wrap_mode, scale, B, C, x, y, pixels);
#else
			upscale_block_c   (width, height, src_stride_in_bytes, (const u8*)src_pixels, wrap_mode, scale, B, C, x, y, pixels);
#endif
			// Copy the (possibly partial, at the right/bottom edges) tile out.
			for(ptrdiff_t iy = 0, ny = (y1-y < BLOCK ? y1-y : BLOCK), nx = (x1-x < BLOCK ? x1-x : BLOCK); iy < ny; ++iy)
				memcpy((u8*)dst_pixels + dst_stride_in_bytes*(y+iy) + 4*x, pixels + BLOCK*4*iy, (size_t)(4*nx));
		}
}

// End of pasted cubic upscaler.

// B-spline bicubic (B=1, C=0): soft, never rings. Scales source rows [l, u).
void scaleBicubicBSpline(int factor, const u32 *data, u32 *out, int w, int h, int l, int u) {
	const float B = 1.0f, C = 0.0f;
	const int wrap_mode = 1; // Clamp
	upscale_cubic(
		w, h, w*4, data,
		factor*w*4, out,
		factor, B, C, wrap_mode,
		0, factor*l, factor*w, factor*u);
}

// Catmull-Rom bicubic (B=0, C=0.5): sharper than B-spline. Scales rows [l, u).
void scaleBicubicMitchell(int factor, const u32 *data, u32 *out, int w, int h, int l, int u) {
	const float B = 0.0f, C = 0.5f; // Actually, Catmull-Rom
	const int wrap_mode = 1; // Clamp
	upscale_cubic(
		w, h, w*4, data,
		factor*w*4, out,
		factor, B, C, wrap_mode,
		0, factor*l, factor*w, factor*u);
}

//////////////////////////////////////////////////////////////////// Bilinear scaling

// Byte blend weights per scale factor (row = factor-2) for the output pixels
// between a source pixel and its neighbor; each pair sums to 255.
const static u8 BILINEAR_FACTORS[4][3][2] = {
	{ {  44, 211 }, {  0,   0 }, { 0,   0 } }, // x2
	{ {  64, 191 }, {  0, 255 }, { 0,   0 } }, // x3
	{ {  77, 178 }, { 26, 229 }, { 0,   0 } }, // x4
	{ { 102, 153 }, { 51, 204 }, { 0, 255 } }, // x5
};
// integral bilinear upscaling by factor f, horizontal part
template<int f>
void bilinearHt(const u32 *data, u32 *out, int w, int l, int u) {
	static_assert(f > 1 && f <= 5, "Bilinear scaling only implemented for factors 2 to 5");
	int outw = w*f;
	for (int y = l; y < u; ++y) {
		for (int x = 0; x < w; ++x) {
			int inpos = y*w + x;
			// Border pixels reuse the center as their missing neighbor.
			u32 left = data[inpos - (x == 0 ? 0 : 1)];
			u32 center = data[inpos];
			u32 right = data[inpos + (x == w - 1 ? 0 : 1)];
			int i = 0;
			for (; i < f / 2 + f % 2; ++i) { // first half of the new pixels + center, hope the compiler unrolls this
				out[y*outw + x*f + i] = MIX_PIXELS(left, center, BILINEAR_FACTORS[f - 2][i]);
			}
			for (; i < f; ++i) { // second half of the new pixels, hope the compiler unrolls this
				out[y*outw + x*f + i] = MIX_PIXELS(right, center, BILINEAR_FACTORS[f - 2][f - 1 - i]);
			}
		}
	}
}
// Runtime dispatch to the templated horizontal pass.
void bilinearH(int factor, const u32 *data, u32 *out, int w, int l, int u) {
	switch (factor) {
	case 2: bilinearHt<2>(data, out, w, l, u); break;
	case 3: bilinearHt<3>(data, out, w, l, u); break;
	case 4: bilinearHt<4>(data, out, w, l, u); break;
	case 5: bilinearHt<5>(data, out, w, l, u); break;
	default: ERROR_LOG(Log::G3D, "Bilinear upsampling only implemented for factors 2 to 5");
	}
}
// integral bilinear upscaling by factor f, vertical part
// gl/gu == global lower and upper bound
template<int f>
void bilinearVt(const u32 *data, u32 *out, int w, int gl, int gu, int l, int u) {
	static_assert(f>1 && f <= 5, "Bilinear scaling only implemented for 2x, 3x, 4x, and 5x");
	int outw = w*f; // input here is already horizontally scaled
	for (int xb = 0; xb < outw / BLOCK_SIZE + 1; ++xb) {
		for (int y = l; y < u; ++y) {
			// Clamp neighbor rows at the global image bounds, not the slice bounds.
			u32 uy = y - (y == gl ? 0 : 1);
			u32 ly = y + (y == gu - 1 ? 0 : 1);
			for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < outw; ++x) {
				u32 upper = data[uy * outw + x];
				u32 center = data[y * outw + x];
				u32 lower = data[ly * outw + x];
				int i = 0;
				for (; i < f / 2 + f % 2; ++i) { // first half of the new pixels + center, hope the compiler unrolls this
					out[(y*f + i)*outw + x] = MIX_PIXELS(upper, center, BILINEAR_FACTORS[f - 2][i]);
				}
				for (; i < f; ++i) { // second half of the new pixels, hope the compiler unrolls this
					out[(y*f + i)*outw + x] = MIX_PIXELS(lower, center, BILINEAR_FACTORS[f - 2][f - 1 - i]);
				}
			}
		}
	}
}
// Runtime dispatch to the templated vertical pass.
void bilinearV(int factor, const u32 *data, u32 *out, int w, int gl, int gu, int l, int u) {
	switch (factor) {
	case 2: bilinearVt<2>(data, out, w, gl, gu, l, u); break;
	case 3: bilinearVt<3>(data, out, w, gl, gu, l, u); break;
	case 4: bilinearVt<4>(data, out, w, gl, gu, l, u); break;
	case 5: bilinearVt<5>(data, out, w, gl, gu, l, u); break;
	default: ERROR_LOG(Log::G3D, "Bilinear upsampling only implemented for factors 2 to 5");
	}
}

#undef BLOCK_SIZE
#undef MIX_PIXELS
#undef DISTANCE
#undef R
#undef G
#undef B
#undef A

#ifdef DEBUG_SCALER_OUTPUT

// used for debugging texture scaling (writing textures to files)
static int g_imgCount = 0;
void dbgPPM(int w, int h, u8* pixels, const char* prefix = "dbg") { // 3 component RGB
	char fn[32];
	snprintf(fn, 32, "%s%04d.ppm", prefix, g_imgCount++);
	FILE *fp = fopen(fn, "wb");
	fprintf(fp, "P6\n%d %d\n255\n", w, h);
	for (int j = 0; j < h; ++j) {
		for (int i = 0; i < w; ++i) {
			static unsigned char color[3];
			color[0] = pixels[(j*w + i) * 4 + 0]; /* red */
			color[1] = pixels[(j*w + i) * 4 + 1]; /* green */
			color[2] = pixels[(j*w + i) * 4 + 2]; /* blue */
			fwrite(color, 1, 3, fp);
		}
	}
	fclose(fp);
}
void dbgPGM(int w, int h, u32* pixels, const char* prefix = "dbg") { // 1 component
	char fn[32];
	snprintf(fn, 32, "%s%04d.pgm", prefix, g_imgCount++);
	FILE *fp = fopen(fn, "wb");
	// NOTE(review): the PGM spec requires maxval <= 65535, so "65536" here is
	// out of spec; also 16-bit PGM samples are defined as big-endian while
	// this writes the low two bytes in host order — fine for ad-hoc debugging
	// with tolerant viewers, but worth confirming if the files misrender.
	fprintf(fp, "P5\n%d %d\n65536\n", w, h);
	for (int j = 0; j < h; ++j) {
		for (int i = 0; i < w; ++i) {
			fwrite((pixels + (j*w + i)), 1, 2, fp);
		}
	}
	fclose(fp);
}

#endif

}

/////////////////////////////////////// Texture Scaler

TextureScalerCommon::TextureScalerCommon() {
	// initBicubicWeights() used to be here.
}

TextureScalerCommon::~TextureScalerCommon() {
}

// Returns true if every pixel in the buffer equals the first one.
bool TextureScalerCommon::IsEmptyOrFlat(const u32 *data, int pixels) {
	u32 ref = data[0];
	// TODO: SIMD-ify this (although, for most textures we'll get out very early)
	for (int i = 1; i < pixels; ++i) {
		if (data[i] != ref)
			return false;
	}
	return true;
}

// Like ScaleInto, but also produces output for flat textures (solid fill)
// instead of skipping them — Vulkan needs the full-size image up front.
void TextureScalerCommon::ScaleAlways(u32 *out, u32 *src, int width, int height, int *scaledWidth, int *scaledHeight, int factor) {
	if (IsEmptyOrFlat(src, width * height)) {
		// This means it was a flat texture.  Vulkan wants the size up front, so we need to make it happen.
		u32 pixel = *src;

		*scaledWidth = width * factor;
		*scaledHeight = height * factor;

		size_t pixelCount = *scaledWidth * *scaledHeight;

		// ABCD.  If A = D, and AB = CD, then they must all be equal (B = C, etc.)
		if ((pixel & 0x000000FF) == (pixel >> 24) && (pixel & 0x0000FFFF) == (pixel >> 16)) {
			// All four bytes identical — a plain memset suffices.
			memset(out, pixel & 0xFF, pixelCount * sizeof(u32));
		} else {
			// Let's hope this is vectorized.
			// NOTE(review): 'i' is int while pixelCount is size_t — fine for
			// realistic texture sizes, but a signed/unsigned mismatch.
			for (int i = 0; i < pixelCount; ++i) {
				out[i] = pixel;
			}
		}
	} else {
		ScaleInto(out, src, width, height, scaledWidth, scaledHeight, factor);
	}
}

// Scales src (width x height) by 'factor' into outputBuf using the algorithm
// selected in g_Config.iTexScalingType, optionally deposterizing first.
// Writes the resulting dimensions to scaledWidth/scaledHeight.
bool TextureScalerCommon::ScaleInto(u32 *outputBuf, u32 *src, int width, int height, int *scaledWidth, int *scaledHeight, int factor) {
#ifdef SCALING_MEASURE_TIME
	double t_start = time_now_d();
#endif

	u32 *inputBuf = src;

	// deposterize
	if (g_Config.bTexDeposterize) {
		bufDeposter.resize(width * height);
		DePosterize(inputBuf, bufDeposter.data(), width, height);
		inputBuf = bufDeposter.data();
	}

	// scale
	switch (g_Config.iTexScalingType) {
	case XBRZ:
		ScaleXBRZ(factor, inputBuf, outputBuf, width, height);
		break;
	case HYBRID:
		ScaleHybrid(factor, inputBuf, outputBuf, width, height);
		break;
	case BICUBIC:
		ScaleBicubicMitchell(factor, inputBuf, outputBuf, width, height);
		break;
	case HYBRID_BICUBIC:
		ScaleHybrid(factor, inputBuf, outputBuf, width, height, true);
		break;
	default:
		ERROR_LOG(Log::G3D, "Unknown scaling type: %d", g_Config.iTexScalingType);
	}

	// update values accordingly
	*scaledWidth = width * factor;
	*scaledHeight = height * factor;

#ifdef SCALING_MEASURE_TIME
	if (*scaledWidth* *scaledHeight > 64 * 64 * factor*factor) {
		double t = time_now_d() - t_start;
		NOTICE_LOG(Log::G3D, "TextureScaler: processed %9d pixels in %6.5lf seconds. (%9.2lf Mpixels/second)",
			*scaledWidth * *scaledHeight, t, (*scaledWidth * *scaledHeight) / (t * 1000 * 1000));
	}
#endif

	return true;
}

// Scales the texture pointed to by 'data'; on success repoints 'data' at the
// internally-owned upscaled buffer (bufOutput) and returns true. Returns
// false (leaving 'data' untouched) for flat/empty textures.
bool TextureScalerCommon::Scale(u32* &data, int width, int height, int *scaledWidth, int *scaledHeight, int factor) {
	// prevent processing empty or flat textures (this happens a lot in some games)
	// doesn't hurt the standard case, will be very quick for textures with actual texture
	if (IsEmptyOrFlat(data, width*height)) {
		DEBUG_LOG(Log::G3D, "TextureScaler: early exit -- empty/flat texture");
		return false;
	}

	bufOutput.resize(width * height * (factor * factor)); // used to store the upscaled image
	u32 *outputBuf = bufOutput.data();

	if (ScaleInto(outputBuf, data, width, height, scaledWidth, scaledHeight, factor)) {
		data = outputBuf;
		return true;
	}
	return false;
}

// Minimum number of image rows given to each worker in the parallel loops.
const int MIN_LINES_PER_THREAD = 4;

void TextureScalerCommon::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) {
	xbrz::ScalerCfg cfg;
	ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
}

// Separable bilinear upscale: horizontal pass into bufTmp1, vertical into dest.
void TextureScalerCommon::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) {
	bufTmp1.resize(width * height * factor); // holds the horizontally-scaled intermediate
	u32 *tmpBuf = bufTmp1.data();
	ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
	ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
}

void TextureScalerCommon::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) {
	ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
}

void TextureScalerCommon::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) {
	ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
}

void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) {
	// Basic algorithm:
	// 1) determine a feature mask C based on a sobel-ish filter + splatting, and upscale that mask bilinearly
	// 2) generate 2 scaled images: A - using Bilinear filtering, B - using xBRZ
	// 3) output = A*C + B*(1-C)

	const static int KERNEL_SPLAT[3][3] = {
		{ 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }
	};

	bufTmp1.resize(width*height);
	bufTmp2.resize(width*height*factor*factor);
	bufTmp3.resize(width*height*factor*factor);

	ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
	ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
	ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height);
	// mask C is now in bufTmp3

	ScaleXBRZ(factor, source, bufTmp2.data(), width, height);
	// xBRZ upscaled source is in bufTmp2

	if (bicubic) ScaleBicubicBSpline(factor, source, dest, width, height);
	else ScaleBilinear(factor, source, dest, width, height);
	// Upscaled source is in dest

	// Now we can mix it all together
	// The factor 8192 was found through practical testing on a variety of textures
	ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, MIN_LINES_PER_THREAD);
}

// Two full H+V deposterize rounds; bufTmp3 holds the intermediate each round.
void TextureScalerCommon::DePosterize(u32* source, u32* dest, int width, int height) {
	bufTmp3.resize(width*height);
	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
}