GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/libwebp/src/enc/quant_enc.c
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal ([email protected])

#include <assert.h>
#include <math.h>
#include <stdlib.h> // for abs()
#include <string.h>

#include "src/dec/common_dec.h"
#include "src/dsp/dsp.h"
#include "src/dsp/quant.h"
#include "src/enc/cost_enc.h"
#include "src/enc/vp8i_enc.h"
#include "src/webp/types.h"

#define DO_TRELLIS_I4 1
#define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV 0 // disable trellis for UV. Risky. Not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64  // neutral value for susceptibility
#define MIN_ALPHA 30  // lowest usable value for susceptibility
#define MAX_ALPHA 100 // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP
                      // power-law modulation. Must be strictly less than 1.

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 0 // I16 mode (special case)
#define FLATNESS_LIMIT_I4 3  // I4 mode
#define FLATNESS_LIMIT_UV 2  // UV mode
#define FLATNESS_PENALTY 140 // roughly ~1bit per block

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)

#define RD_DISTO_MULT 256 // distortion multiplier (equivalent of lambda)
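
// Note (editor's sketch): MULT_8B(a, b) is a rounded fixed-point multiply,
// roughly a * b / 256, e.g. MULT_8B(64, 1000) == (64000 + 128) >> 8 == 250.
// With RD_DISTO_MULT == 256, the RD scores below take the Lagrangian form
// score = rate * lambda + 256 * distortion.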

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb->type == 1);
  const uint8_t* const y_in = it->yuv_in + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n"); // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
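
// Note (editor's sketch): kZigzag[n] is the raster index, within the 4x4
// block, of the n-th coefficient in zigzag scan order; e.g. kZigzag[2] == 4,
// i.e. the third coefficient visited sits at row 1, column 0.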

static const uint8_t kDcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 10,
  11, 12, 13, 14, 15, 16, 17, 17,
  18, 19, 20, 20, 21, 21, 22, 22,
  23, 23, 24, 25, 25, 26, 27, 28,
  29, 30, 31, 32, 33, 34, 35, 36,
  37, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 46, 47, 48, 49, 50,
  51, 52, 53, 54, 55, 56, 57, 58,
  59, 60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 72, 73, 74,
  75, 76, 76, 77, 78, 79, 80, 81,
  82, 83, 84, 85, 86, 87, 88, 89,
  91, 93, 95, 96, 98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 11,
  12, 13, 14, 15, 16, 17, 18, 19,
  20, 21, 22, 23, 24, 25, 26, 27,
  28, 29, 30, 31, 32, 33, 34, 35,
  36, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 47, 48, 49, 50, 51,
  52, 53, 54, 55, 56, 57, 58, 60,
  62, 64, 66, 68, 70, 72, 74, 76,
  78, 80, 82, 84, 86, 88, 90, 92,
  94, 96, 98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8, 8, 9, 10, 12, 13, 15, 17,
  18, 20, 21, 23, 24, 26, 27, 29,
  31, 32, 34, 35, 37, 38, 40, 41,
  43, 44, 46, 48, 49, 51, 52, 54,
  55, 57, 58, 60, 62, 63, 65, 66,
  68, 69, 71, 72, 74, 75, 77, 79,
  80, 82, 83, 85, 86, 88, 89, 93,
  96, 99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = { // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11 // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0, 30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
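
// Worked example (editor's sketch): with SHARPEN_BITS == 11, an AC quantizer
// q[i] == 60 and kFreqSharpening[i] == 90 yield a sharpening bias of
// (90 * 60) >> 11 == 2, which is added to |coeff| before quantization
// (see 'coeff0' in TrellisQuantizeBlock() below).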

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq[i] = (1 << QFIX) / m->q[i];
    m->bias[i] = BIAS(bias);
    // zthresh is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh[i] = ((1 << QFIX) - 1 - m->bias[i]) / m->iq[i];
  }
  for (i = 2; i < 16; ++i) {
    m->q[i] = m->q[1];
    m->iq[i] = m->iq[1];
    m->bias[i] = m->bias[1];
    m->zthresh[i] = m->zthresh[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) { // we only use sharpening for AC luma coeffs
      m->sharpen[i] = (kFreqSharpening[i] * m->q[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen[i] = 0;
    }
    sum += m->q[i];
  }
  return (sum + 8) >> 4;
}
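
// Worked example (editor's sketch; assumes QFIX == 17 and
// BIAS(b) == (b) << (QFIX - 8), as defined in vp8i_enc.h). For q[0] == 20 and
// a bias byte of 96:
//   iq      = (1 << 17) / 20          = 6553
//   bias    = 96 << 9                 = 49152
//   zthresh = (131071 - 49152) / 6553 = 12
// and indeed QUANTDIV(12, 6553, 49152) == (12 * 6553 + 49152) >> 17 == 0,
// while QUANTDIV(13, 6553, 49152) == 1, matching the zthresh comment above.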

static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
      (enc->method >= 4) ? enc->config->sns_strength : 0;
  const int num_segments = enc->segment_hdr.num_segments;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm[i];
    const int q = m->quant;
    int q_i4, q_i16, q_uv;
    m->y1.q[0] = kDcTable[clip(q + enc->dq_y1_dc, 0, 127)];
    m->y1.q[1] = kAcTable[clip(q, 0, 127)];

    m->y2.q[0] = kDcTable[ clip(q + enc->dq_y2_dc, 0, 127)] * 2;
    m->y2.q[1] = kAcTable2[clip(q + enc->dq_y2_ac, 0, 127)];

    m->uv.q[0] = kDcTable[clip(q + enc->dq_uv_dc, 0, 117)];
    m->uv.q[1] = kAcTable[clip(q + enc->dq_uv_ac, 0, 127)];

    q_i4  = ExpandMatrix(&m->y1, 0);
    q_i16 = ExpandMatrix(&m->y2, 1);
    q_uv  = ExpandMatrix(&m->uv, 2);

    m->lambda_i4          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16         = (3 * q_i16 * q_i16);
    m->lambda_uv          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16 = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv  = (q_uv * q_uv) << 1;
    m->tlambda            = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4);
    CheckLambdaValue(&m->lambda_i16);
    CheckLambdaValue(&m->lambda_uv);
    CheckLambdaValue(&m->lambda_mode);
    CheckLambdaValue(&m->lambda_trellis_i4);
    CheckLambdaValue(&m->lambda_trellis_i16);
    CheckLambdaValue(&m->lambda_trellis_uv);
    CheckLambdaValue(&m->tlambda);

    m->min_disto = 20 * m->y1.q[0]; // quantization-aware min disto
    m->max_edge = 0;

    m->i4_penalty = 1000 * q_i4 * q_i4;
  }
}
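
// Note (editor's sketch): the lambdas above scale roughly as q^2, the usual
// high-rate heuristic where squared-error distortion grows with the square of
// the quantizer step; 'tlambda' is linear in q_i4 because it multiplies
// VP8TDisto*() values, which are themselves distortion-like quantities.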

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr.sharpness, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta);
    m->fstrength = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr.level = enc->dqm[0].fstrength;
  enc->filter_hdr.simple = (enc->config->filter_type == 0);
  enc->filter_hdr.sharpness = enc->config->filter_sharpness;
}

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=0.50. So we
// map accordingly using a linear piece-wise function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}
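
// Worked example (editor's sketch): for quality 75, c == 0.75 maps to
// linear_c == 0.50 and v == pow(0.50, 1 / 3.) ~= 0.794. VP8SetSegmentParams()
// below then derives a quantizer of about 127 * (1 - 0.794) ~= 26, before
// per-segment modulation.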

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant == S2->quant) && (S1->fstrength == S2->fstrength);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr.num_segments < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr.num_segments
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) { // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm[num_final_segments] = enc->dqm[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) { // Remap
    int i = enc->mb_w * enc->mb_h;
    while (i-- > 0) enc->mb_info[i].segment = map[enc->mb_info[i].segment];
    enc->segment_hdr.num_segments = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm[i] = enc->dqm[num_final_segments - 1];
    }
  }
}

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr.num_segments;
  const double amp = SNS_TO_DQ * enc->config->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm[i].alpha;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm[i].quant = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant = enc->dqm[0].quant;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm[i].quant = enc->base_quant;
  }

  // uv_alpha is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
           / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are much more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15); // 4bit-signed max allowed

  enc->dq_y1_dc = 0; // TODO(skal): dq-lum
  enc->dq_y2_dc = 0;
  enc->dq_y2_ac = 0;
  enc->dq_uv_dc = dq_uv_dc;
  enc->dq_uv_ac = dq_uv_ac;

  SetupFilterStrength(enc); // initialize segments' filtering, if needed

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc); // finalize quantization matrices
}
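
// Worked example (editor's sketch): with uv_alpha == 80 and the default
// sns_strength == 50, dq_uv_ac == (80 - 64) * 10 / 70 == 2, rescaled to
// 2 * 50 / 100 == 1, well inside [MIN_DQ_UV, MAX_DQ_UV]. dq_uv_dc becomes
// -4 * 50 / 100 == -2, i.e. chroma DC gets a slightly finer quantizer.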

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
static const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x ? it->y_left : NULL;
  const uint8_t* const top = it->y ? it->y_top : NULL;
  VP8EncPredLuma16(it->yuv_p, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x ? it->u_left : NULL;
  const uint8_t* const top = it->y ? it->uv_top : NULL;
  VP8EncPredChroma8(it->yuv_p, left, top);
}

// Form all ten Intra4x4 predictions in the 'yuv_p' cache
// for the 4x4 block it->i4
static void MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p, it->i4_top);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

const uint16_t VP8Scan[16] = { // Luma
  0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
  0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
  0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
  0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
};

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12, 8,
  11, 10, 8, 6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D = 0;
  rd->SD = 0;
  rd->R = 0;
  rd->H = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
                      const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D = src->D;
  dst->SD = src->SD;
  dst->R = src->R;
  dst->H = src->H;
  dst->nz = src->nz; // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
                     const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D += src->D;
  dst->SD += src->SD;
  dst->R += src->R;
  dst->H += src->H;
  dst->nz |= src->nz; // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Prevents the Visual Studio debugger from using this Node struct in place of
// the Godot Node class.
#define Node Node_libwebp_quant

// Trellis node
typedef struct {
  int8_t prev;   // best previous node
  int8_t sign;   // sign of coeff_i
  int16_t level; // level
} Node;

// Score state
typedef struct {
  score_t score;         // partial RD score
  const uint16_t* costs; // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values though.
#define MIN_DELTA 0 // how much lower level to try
#define MAX_DELTA 1 // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
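
// Note (editor's sketch): with MIN_DELTA == 0 and MAX_DELTA == 1 this gives
// NUM_NODES == 2, so each trellis column only considers the neutrally-biased
// level0 and level0 + 1; NODE(n, m) and SCORE_STATE(n, m) just shift the
// candidate offset m into the [0, NUM_NODES) storage range.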

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}
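
// Note (editor's sketch): both helpers compute the Lagrangian cost
// J = rate * lambda + RD_DISTO_MULT * distortion, e.g.
// RDScoreTrellis(100, 30, 50) == 30 * 100 + 256 * 50 == 15800.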

// Coefficient type.
enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };

static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* WEBP_RESTRICT const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba.coeffs[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba.remapped_costs[coeff_type];
  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q[1] * mtx->q[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to inspect coefficients all the way up to n = 16;
    // going only up to last + 1 (inclusive) loses very little.
    if (last < 15) ++last;

    // compute the 'skip' score (the cost of coding nothing). This is the
    // score to beat.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q = mtx->q[j];
    const uint32_t iQ = mtx->iq[j];
    const uint32_t B = BIAS(0x00); // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    { // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      const int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score;
      int best_prev;
      score_t cost, score;

      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        ss_cur[m].score = MAX_COST;
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract from max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      // Since base_score is common to all candidates, it is added only once,
      // to the final value after the loop.
      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
      best_cur_score =
          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
      best_prev = -MIN_DELTA;
      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      best_cur_score += base_score;
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0 && best_cur_score < best_score) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;         // best eob position
          best_path[1] = m;         // best node index
          best_path[2] = best_prev; // best predecessor
        }
      }
    }
  }

  // Fresh start.
  // Beware! We must preserve the in[0]/out[0] values for the TYPE_I16_AC case.
  if (coeff_type == TYPE_I16_AC) {
    memset(in + 1, 0, 15 * sizeof(*in));
    memset(out + 1, 0, 15 * sizeof(*out));
  } else {
    memset(in, 0, 16 * sizeof(*in));
    memset(out, 0, 16 * sizeof(*out));
  }
  if (best_path[0] == -1) {
    return 0; // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on the terminal node is not necessarily equal to the
    // best_prev for non-terminal nodes. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2]; // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}
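
// Summary (editor's sketch of the dynamic program above): column n of the
// trellis holds NUM_NODES candidate levels around the neutrally-quantized
// level0. Each node keeps the cheapest link to the previous column,
//   ss_cur[m].score = min_p { ss_prev[p].score + lambda * rate(level) }
//                     + base_score,
// where base_score carries the weighted distortion change of coding 'level'.
// The best end-of-block position is tracked in best_path[] and the chosen
// levels are recovered by unwinding the 'prev' pointers.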

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                              VP8ModeScore* WEBP_RESTRICT const rd,
                              uint8_t* WEBP_RESTRICT const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc;
  const uint8_t* const ref = it->yuv_p + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz[x] + it->left_nz[y];
        const int non_zero = TrellisQuantizeBlock(
            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1,
            dqm->lambda_trellis_i16);
        it->top_nz[x] = it->left_nz[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                             int16_t levels[16],
                             const uint8_t* WEBP_RESTRICT const src,
                             uint8_t* WEBP_RESTRICT const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc;
  const uint8_t* const ref = it->yuv_p + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis) {
    const int x = it->i4 & 3, y = it->i4 >> 2;
    const int ctx = it->top_nz[x] + it->left_nz[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1,
                              dqm->lambda_trellis_i4);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

//------------------------------------------------------------------------------
// DC-error diffusion

// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid a 'rainbow' chessboard pattern of blocks at q~=0.
#define C1 7 // fraction of error sent to the 4x4 block below
#define C2 8 // fraction of error sent to the 4x4 block on the right
#define DSHIFT 4
#define DSCALE 1 // storage descaling, needed to make the error fit int8_t

// Quantize as usual, but also compute and return the quantization error.
// Error is already divided by DSHIFT.
static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
                          const VP8Matrix* WEBP_RESTRICT const mtx) {
  int V = *v;
  const int sign = (V < 0);
  if (sign) V = -V;
  if (V > (int)mtx->zthresh[0]) {
    const int qV = QUANTDIV(V, mtx->iq[0], mtx->bias[0]) * mtx->q[0];
    const int err = (V - qV);
    *v = sign ? -qV : qV;
    return (sign ? -err : err) >> DSCALE;
  }
  *v = 0;
  return (sign ? -V : V) >> DSCALE;
}
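
// Worked example (editor's sketch): with q[0] == 20, iq[0] == 6553 and
// bias[0] == 49152 (see the ExpandMatrix() example above), an input DC of 85
// quantizes to QUANTDIV(85, 6553, 49152) == 4, so qV == 80 and the function
// returns (85 - 80) >> DSCALE == 2. CorrectDCValues() below then feeds 7/16
// and 8/16 of such errors into the neighboring sub-blocks.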

static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
                            const VP8Matrix* WEBP_RESTRICT const mtx,
                            int16_t tmp[][16],
                            VP8ModeScore* WEBP_RESTRICT const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->  err0 err1
  // left[1] | tmp[2]   tmp[3]       err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    const int8_t* const top = it->top_derr[it->x][ch];
    const int8_t* const left = it->left_derr[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];
    int err0, err1, err2, err3;
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}

static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    int8_t* const top = it->top_derr[it->x][ch];
    int8_t* const left = it->left_derr[ch];
    left[0] = rd->derr[ch][0];          // restore err1
    left[1] = 3 * rd->derr[ch][2] >> 2; // ... 3/4th of err3
    top[0] = rd->derr[ch][1];           // ... err2
    top[1] = rd->derr[ch][2] - left[1]; // ... 1/4th of err3.
  }
}
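
// Note (editor's sketch): the err3 split above is exact in the stored
// precision: top[1] is the remainder after the rounded-down 3/4 share, so
// left[1] + top[1] == rd->derr[ch][2].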

#undef C1
#undef C2
#undef DSHIFT
#undef DSCALE

//------------------------------------------------------------------------------

static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
                         VP8ModeScore* WEBP_RESTRICT const rd,
                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc;
  const uint8_t* const ref = it->yuv_p + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (it->top_derr != NULL) CorrectDCValues(it, &dqm->uv, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz[4 + ch + x] + it->left_nz[4 + ch + y];
          const int non_zero = TrellisQuantizeBlock(
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv,
              dqm->lambda_trellis_uv);
          it->top_nz[4 + ch + x] = it->left_nz[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lower RD-cost = Rate + lambda * Distortion.

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to estimate the average
  // delta between the sub-4x4 blocks.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[2]);
  const int v2 = abs(DCs[4]);
  int max_v = (v1 > v0) ? v1 : v0;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge) dqm->max_edge = max_v;
}

static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out, &it->yuv_out2);
}

static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                            VP8ModeScore* WEBP_RESTRICT rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc->dqm[it->mb->segment];
  const int lambda = dqm->lambda_i16;
  const int tlambda = dqm->tlambda;
  const uint8_t* const src = it->yuv_in + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2 + Y_OFF_ENC; // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode, rd); // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // If we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record the max delta so we can later adjust the minimal
  // filtering strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}
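
// Note (editor's sketch): in the nz bitmask, bits 0..15 flag the 16 luma AC
// sub-blocks, bit 24 the DC/WHT block (<< 24 in ReconstructIntra16()), and
// bits 16..23 are later filled by ReconstructUV()'s (nz << 16). The test
// above therefore reads "the DC block is non-zero while every luma AC block
// is zero".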

//------------------------------------------------------------------------------

// return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc->preds_w;
  const int x = (it->i4 & 3), y = it->i4 >> 2;
  const int left = (x == 0) ? it->preds[y * preds_w - 1] : modes[it->i4 - 1];
  const int top = (y == 0) ? it->preds[-preds_w + x] : modes[it->i4 - 4];
  return VP8FixedCostsI4[top][left];
}

static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                          VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc;
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  const int lambda = dqm->lambda_i4;
  const int tlambda = dqm->tlambda;
  const uint8_t* const src0 = it->yuv_in + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2 + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211; // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4];
    uint8_t* tmp_dst = it->yuv_p + I4TMP; // scratch buffer.

    InitScore(&rd_i4);
    MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add a flatness penalty, to avoid a flat area being mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4]));
      }
    }
    SetRDScore(dqm->lambda_mode, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H; // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4]);
    }
    rd->modes_i4[it->i4] = best_mode;
    it->top_nz[it->i4 & 3] = it->left_nz[it->i4 >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1; // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
                       VP8ModeScore* WEBP_RESTRICT const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc->dqm[it->mb->segment];
  const int lambda = dqm->lambda_uv;
  const uint8_t* const src = it->yuv_in + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2 + U_OFF_ENC; // scratch buffer
  uint8_t* dst0 = it->yuv_out + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0; // not calling TDisto here: it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) { // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr != NULL) { // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
                           VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc;
  const int is_i16 = (it->mb->type == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out + Y_OFF_ENC, it->preds[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds[(it->i4 & 3) + (it->i4 >> 2) * enc->preds_w];
      const uint8_t* const src = it->yuv_in + Y_OFF_ENC + VP8Scan[it->i4];
      uint8_t* const dst = it->yuv_out + Y_OFF_ENC + VP8Scan[it->i4];
      MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4],
                              src, dst, mode) << it->i4;
    } while (VP8IteratorRotateI4(it, it->yuv_out + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out + U_OFF_ENC, it->mb->uv_mode);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* WEBP_RESTRICT const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb->type == 1);

  const VP8SegmentInfo* const dqm = &it->enc->dqm[it->mb->segment];
  // Some empirical constants; only their order of magnitude matters.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc->mb_header_limit
                                           : MAX_COST; // no early-out allowed

  if (is_i16) { // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x == 0 || it->y == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x == 0) ? 0 : 2;
        try_both_modes = 0; // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in + Y_OFF_ENC + VP8Scan[it->i4];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else { // reconstruct partial block inside yuv_out2 buffer
        uint8_t* const tmp_dst = it->yuv_out2 + Y_OFF_ENC + VP8Scan[it->i4];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4],
                                src, tmp_dst, best_i4_mode) << it->i4;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2 + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out + Y_OFF_ENC, it->preds[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out + U_OFF_ENC, it->mb->uv_mode);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
                VP8ModeScore* WEBP_RESTRICT const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc->method;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as we go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) { // finish off with trellis-optim now
      it->do_trellis = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we don't re-examine the decision but just go
    // ahead with quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}