Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/libwebp/src/enc/quant_enc.c
9913 views
1
// Copyright 2011 Google Inc. All Rights Reserved.
2
//
3
// Use of this source code is governed by a BSD-style license
4
// that can be found in the COPYING file in the root of the source
5
// tree. An additional intellectual property rights grant can be found
6
// in the file PATENTS. All contributing project authors may
7
// be found in the AUTHORS file in the root of the source tree.
8
// -----------------------------------------------------------------------------
9
//
10
// Quantization
11
//
12
// Author: Skal ([email protected])
13
14
#include <assert.h>
15
#include <math.h>
16
#include <stdlib.h> // for abs()
17
18
#include "src/dsp/quant.h"
19
#include "src/enc/vp8i_enc.h"
20
#include "src/enc/cost_enc.h"
21
22
// Compile-time tuning knobs for the quantization stage.
#define DO_TRELLIS_I4 1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV 0    // disable trellis for UV. Risky. Not worth.
#define USE_TDISTO 1

#define MID_ALPHA 64    // neutral value for susceptibility
#define MIN_ALPHA 30    // lowest usable value for susceptibility
#define MAX_ALPHA 100   // higher meaningful value for susceptibility

#define SNS_TO_DQ 0.9   // Scaling constant between the sns value and the QP
                        // power-law modulation. Must be strictly less than 1.

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 0      // I16 mode (special case)
#define FLATNESS_LIMIT_I4 3       // I4 mode
#define FLATNESS_LIMIT_UV 2       // UV mode
#define FLATNESS_PENALTY 140      // roughly ~1bit per block

// Fixed-point multiply of two 8-bit quantities with rounding.
#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)

#define RD_DISTO_MULT 256  // distortion multiplier (equivalent of lambda)

// #define DEBUG_BLOCK
47
//------------------------------------------------------------------------------
48
49
#if defined(DEBUG_BLOCK)
50
51
#include <stdio.h>
52
#include <stdlib.h>
53
54
// Debug helper: dumps the source and reconstructed samples of the current
// macroblock (luma then chroma), their absolute per-pixel deltas, and the
// quantized levels / rd fields stored in 'rd'.
// Only compiled when DEBUG_BLOCK is defined.
static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  // 16x16 luma samples: input, reconstructed, then |difference|.
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  // 8x8 U and V planes side by side (U at columns 0..7, V at 8..15 — see
  // the layout diagram near VP8Scan).
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  // Rate/distortion summary of the mode decision.
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  // For i16 macroblocks the DC coefficient lives in y_dc_levels, so the
  // AC dump starts at index 1.
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}
121
122
#endif // DEBUG_BLOCK
123
124
//------------------------------------------------------------------------------
125
126
// Clamp 'v' into the inclusive range [m, M].
static WEBP_INLINE int clip(int v, int m, int M) {
  if (v < m) return m;
  if (v > M) return M;
  return v;
}
129
130
// Zigzag scan order: position n in scan order maps to position kZigzag[n]
// in natural (raster) 4x4 order.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

// DC quantizer step, indexed by the clipped [0..127] quantizer index
// (used for y1_.q_[0], y2_.q_[0] and uv_.q_[0] in SetupMatrices).
static const uint8_t kDcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 10,
  11, 12, 13, 14, 15, 16, 17, 17,
  18, 19, 20, 20, 21, 21, 22, 22,
  23, 23, 24, 25, 25, 26, 27, 28,
  29, 30, 31, 32, 33, 34, 35, 36,
  37, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 46, 47, 48, 49, 50,
  51, 52, 53, 54, 55, 56, 57, 58,
  59, 60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 72, 73, 74,
  75, 76, 76, 77, 78, 79, 80, 81,
  82, 83, 84, 85, 86, 87, 88, 89,
  91, 93, 95, 96, 98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

// AC quantizer step for the Y1 and UV matrices, indexed by the clipped
// [0..127] quantizer index.
static const uint16_t kAcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 11,
  12, 13, 14, 15, 16, 17, 18, 19,
  20, 21, 22, 23, 24, 25, 26, 27,
  28, 29, 30, 31, 32, 33, 34, 35,
  36, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 47, 48, 49, 50, 51,
  52, 53, 54, 55, 56, 57, 58, 60,
  62, 64, 66, 68, 70, 72, 74, 76,
  78, 80, 82, 84, 86, 88, 90, 92,
  94, 96, 98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

// AC quantizer step for the Y2 matrix (the second-stage WHT plane,
// see y2_.q_[1] in SetupMatrices).
static const uint16_t kAcTable2[128] = {
  8, 8, 9, 10, 12, 13, 15, 17,
  18, 20, 21, 23, 24, 26, 27, 29,
  31, 32, 34, 35, 37, 38, 40, 41,
  43, 44, 46, 48, 49, 51, 52, 54,
  55, 57, 58, 60, 62, 63, 65, 66,
  68, 69, 71, 72, 74, 75, 77, 79,
  80, 82, 83, 85, 86, 88, 89, 93,
  96, 99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

// Rounding biases fed to ExpandMatrix(), per matrix type and coeff class.
static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
// Per-coefficient sharpening weight, in natural 4x4 order (stronger for
// higher frequencies).
static const uint8_t kFreqSharpening[16] = {
  0, 30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
204
205
//------------------------------------------------------------------------------
206
// Initialize quantization parameters in VP8Matrix
207
208
// Returns the average quantizer
209
// Finalize the quantization matrix 'm' from its two base steps q_[0] (DC)
// and q_[1] (AC): derive the fixed-point reciprocals, rounding biases,
// zero-thresholds and sharpening terms for all 16 coefficients.
// 'type' selects the bias pair in kBiasMatrices (0: y1, 1: y2, 2: uv).
// Returns the (rounded) average quantizer step over the 16 coefficients.
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i;
  int sum = 0;
  // Base parameters for the DC ([0]) and AC ([1]) coefficients.
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
  }
  // Every remaining coefficient shares the AC parameters.
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  // Sharpening is only applied to AC luma coeffs (type == 0). Accumulate the
  // sum of steps for the average at the same time.
  for (i = 0; i < 16; ++i) {
    m->sharpen_[i] =
        (type == 0) ? (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS : 0;
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}
237
238
// Clamp a lambda constant to its minimal legal value (1).
static void CheckLambdaValue(int* const v) {
  if (*v < 1) {
    *v = 1;
  }
}
239
240
// Compute, for each active segment, the three quantization matrices
// (y1: luma AC/DC, y2: luma WHT plane, uv: chroma) and all the lambda
// constants used by the rate-distortion mode decision and trellis search.
static void SetupMatrices(VP8Encoder* enc) {
  int i;
  // tlambda (texture-distortion lambda) is only used for methods >= 4.
  const int tlambda_scale =
      (enc->method_ >= 4) ? enc->config_->sns_strength
                          : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    // NOTE(review): the '117' cap (instead of 127) matches upstream libwebp;
    // it further limits the UV DC quantizer step.
    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    // Average quantizer steps per matrix, used to scale the lambdas below.
    q_i4  = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv  = ExpandMatrix(&m->uv_, 2);

    // Empirical lambda constants: roughly quadratic in the quantizer step,
    // with per-use-case scaling.
    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_         = (3 * q_i16 * q_i16);
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}
288
289
//------------------------------------------------------------------------------
290
// Initialize filtering parameters
291
292
// Very small filter-strength values have close to no visual effect. So we can
293
// save a little decoding-CPU by turning filtering off for these.
294
#define FSTRENGTH_CUTOFF 2
295
296
// Derive the per-segment loop-filter strength from the configured
// filter_strength, the segment's AC quantizer step and its complexity
// ('beta'), then record the global filter header parameters.
static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    // Clamp to [0..63], turning off filtering entirely below the cutoff
    // (tiny strengths have close to no visual effect).
    if (f < FSTRENGTH_CUTOFF) {
      m->fstrength_ = 0;
    } else if (f > 63) {
      m->fstrength_ = 63;
    } else {
      m->fstrength_ = f;
    }
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}
315
316
//------------------------------------------------------------------------------
317
318
// Note: if you change the values below, remember that the max range
319
// allowed by the syntax for DQ_UV is [-16,16].
320
#define MAX_DQ_UV (6)
321
#define MIN_DQ_UV (-4)
322
323
// We want to emulate jpeg-like behaviour where the expected "good" quality
324
// is around q=75. Internally, our "good" middle is around c=50. So we
325
// map accordingly using linear piece-wise function
326
static double QualityToCompression(double c) {
327
const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
328
// The file size roughly scales as pow(quantizer, 3.). Actually, the
329
// exponent is somewhere between 2.8 and 3.2, but we're mostly interested
330
// in the mid-quant range. So we scale the compressibility inversely to
331
// this power-law: quant ~= compression ^ 1/3. This law holds well for
332
// low quant. Finer modeling for high-quant would make use of kAcTable[]
333
// more explicitly.
334
const double v = pow(linear_c, 1 / 3.);
335
return v;
336
}
337
338
static double QualityToJPEGCompression(double c, double alpha) {
339
// We map the complexity 'alpha' and quality setting 'c' to a compression
340
// exponent empirically matched to the compression curve of libjpeg6b.
341
// On average, the WebP output size will be roughly similar to that of a
342
// JPEG file compressed with same quality factor.
343
const double amin = 0.30;
344
const double amax = 0.85;
345
const double exp_min = 0.4;
346
const double exp_max = 0.9;
347
const double slope = (exp_min - exp_max) / (amax - amin);
348
// Linearly interpolate 'expn' from exp_min to exp_max
349
// in the [amin, amax] range.
350
const double expn = (alpha > amax) ? exp_min
351
: (alpha < amin) ? exp_max
352
: exp_max + slope * (alpha - amin);
353
const double v = pow(c, expn);
354
return v;
355
}
356
357
static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
358
const VP8SegmentInfo* const S2) {
359
return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
360
}
361
362
// Merge segments that ended up with identical parameters, compacting dqm_[]
// and remapping the per-macroblock segment ids accordingly.
static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {   // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    // If found, 's2' is the index of the matching compacted segment.
    // Otherwise the loop exited with s2 == num_final_segments, which is
    // exactly the slot 's1' is compacted into below.
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {   // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}
401
402
// Translate the user-facing 'quality' setting (0..100) into per-segment
// quantizer indices and the various dq deltas, then finalize the segment
// filtering parameters and quantization matrices.
void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  // Amplitude of the per-segment modulation, driven by sns_strength.
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;   // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, eventually

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);   // finalize quantization matrices
}
456
457
//------------------------------------------------------------------------------
458
// Form the predictions in cache
459
460
// Offsets (within the it->yuv_p_ prediction cache) where each prediction
// mode's samples are stored.
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
static const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};
468
469
void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
470
const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
471
const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
472
VP8EncPredLuma16(it->yuv_p_, left, top);
473
}
474
475
void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
476
const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
477
const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
478
VP8EncPredChroma8(it->yuv_p_, left, top);
479
}
480
481
// Form all the ten Intra4x4 predictions in the yuv_p_ cache
// for the 4x4 block it->i4_, using the reference samples in it->i4_top_.
static void MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}
486
487
//------------------------------------------------------------------------------
488
// Quantize
489
490
// Layout of the 16x16 luma and 8x8 U/V samples inside the work buffer
// (stride is BPS bytes per row):
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

// Byte offset of each 4x4 luma sub-block within the macroblock area,
// in raster order.
const uint16_t VP8Scan[16] = {  // Luma
  0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
  0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
  0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
  0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

// Byte offset of each 4x4 chroma sub-block: four U blocks then four V blocks.
static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS,    // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS   // V
};
509
510
//------------------------------------------------------------------------------
511
// Distortion measurement
512
513
// Per-coefficient frequency weights, in natural 4x4 order (low frequencies
// weighted more). kWeightY is presumably used by the texture-distortion
// metric elsewhere in the encoder — confirm at call sites outside this chunk.
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

// Distortion weights used by TrellisQuantizeBlock(). With USE_TDISTO == 0,
// all coefficients are weighted equally.
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12, 8,
  11, 10, 8, 6
#endif
};
527
528
// Init/Copy the common fields in score.
529
// Reset all rate/distortion accumulators of 'rd' and set the score to
// MAX_COST so that any real candidate improves on it.
static void InitScore(VP8ModeScore* const rd) {
  rd->D = 0;    // distortion term (see SetRDScore)
  rd->SD = 0;   // spectral-distortion term
  rd->R = 0;    // rate term
  rd->H = 0;    // header-rate term
  rd->nz = 0;   // non-zero coefficient flags
  rd->score = MAX_COST;
}
537
538
// Copy the common rate/distortion fields from 'src' into 'dst'
// (levels/modes arrays are not touched).
static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
                      const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D = src->D;
  dst->SD = src->SD;
  dst->R = src->R;
  dst->H = src->H;
  dst->nz = src->nz;   // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}
547
548
// Accumulate the rate/distortion fields of 'src' into 'dst'
// (used to sum per-sub-block scores into a macroblock total).
static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
                     const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D += src->D;
  dst->SD += src->SD;
  dst->R += src->R;
  dst->H += src->H;
  dst->nz |= src->nz;   // here, new nz bits are accumulated.
  dst->score += src->score;
}
557
558
//------------------------------------------------------------------------------
559
// Performs trellis-optimized quantization.
560
561
// Prevents Visual Studio debugger from using this Node struct in place of the Godot Node class.
#define Node Node_libwebp_quant

// Trellis node
typedef struct {
  int8_t prev;     // best previous node
  int8_t sign;     // sign of coeff_i
  int16_t level;   // level
} Node;

// Score state
typedef struct {
  score_t score;           // partial RD score
  const uint16_t* costs;   // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
// Accessors are shifted by MIN_DELTA so the level delta 'l' can be used
// directly as an index in [-MIN_DELTA, MAX_DELTA].
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
585
586
// Combine the rate terms (R, H) and distortion terms (D, SD) of 'rd' into
// its final score: rate is weighted by 'lambda', distortion by the fixed
// RD_DISTO_MULT multiplier.
static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}
589
590
// Same rate/distortion combination as SetRDScore, in scalar form, for the
// trellis search: score = rate * lambda + RD_DISTO_MULT * distortion.
static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}
594
595
// Coefficient type.
enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };

// Rate-distortion-optimized quantization of one 4x4 block via a trellis
// (Viterbi) search: for each coefficient, the levels
// [level0 - MIN_DELTA .. level0 + MAX_DELTA] around the neutral-bias
// quantization 'level0' are evaluated, keeping for each the best-scoring
// predecessor chain. On return, 'out' holds the chosen levels (zigzag
// order), 'in' is overwritten with the dequantized coefficients (natural
// order), and the function returns 1 if any level is non-zero.
static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* WEBP_RESTRICT const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  // For i16 macroblocks the DC coefficient is coded separately (WHT plane),
  // so the scan starts at index 1.
  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);   // neutral bias
    // note: it's important to take sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    // 'thresh_level' is the quantization with maximal (0x80) bias: levels
    // above it can never be profitable and make the node dead.
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      const int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score;
      int best_prev;
      score_t cost, score;

      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        ss_cur[m].score = MAX_COST;
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract to max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      // The base_score is added to all scores so it is only added for the final
      // value after the loop.
      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
      best_cur_score =
          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
      best_prev = -MIN_DELTA;
      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      best_cur_score += base_score;
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0 && best_cur_score < best_score) {
        // Terminal nodes pay the extra cost of signalling end-of-block,
        // except at the last position (n == 15) where it is implicit.
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;             // best eob position
          best_path[1] = m;             // best node index
          best_path[2] = best_prev;     // best predecessor
        }
      }
    }
  }

  // Fresh start
  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
  if (coeff_type == TYPE_I16_AC) {
    memset(in + 1, 0, 15 * sizeof(*in));
    memset(out + 1, 0, 15 * sizeof(*out));
  } else {
    memset(in, 0, 16 * sizeof(*in));
    memset(out, 0, 16 * sizeof(*out));
  }
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];   // dequantized reconstruction
      best_node = node->prev;
    }
    return (nz != 0);
  }
}
769
770
#undef NODE
771
772
//------------------------------------------------------------------------------
773
// Performs: difference, transform, quantize, back-transform, add
774
// all at once. Output is the reconstructed block in *yuv_out, and the
775
// quantized levels in *levels.
776
777
// Reconstruct the whole 16x16 luma macroblock for intra16 mode 'mode':
// forward-transform the residual vs. the prediction, quantize the DC plane
// through the WHT and the AC coefficients (with optional trellis), then
// inverse-transform back into 'yuv_out'. Returns the non-zero bitmap
// (one bit per 4x4 block, DC-WHT flag in bit 24).
static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                              VP8ModeScore* WEBP_RESTRICT const rd,
                              uint8_t* WEBP_RESTRICT const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  // The 16 DC coefficients go through a second-stage Walsh-Hadamard
  // transform and are quantized with the y2 matrix.
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        // Context is the number of non-zero neighbors (top + left).
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero = TrellisQuantizeBlock(
            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
            dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}
828
829
// Reconstruct one 4x4 luma block for intra4 mode 'mode': forward-transform
// the residual vs. the prediction, quantize (with optional trellis),
// inverse-transform and add back into 'yuv_out'. Quantized levels are stored
// in 'levels'; returns the non-zero flag.
static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                             int16_t levels[16],
                             const uint8_t* WEBP_RESTRICT const src,
                             uint8_t* WEBP_RESTRICT const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    // i4_ is the sub-block index: column in the low 2 bits, row above.
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}
852
853
//------------------------------------------------------------------------------
854
// DC-error diffusion
855
856
// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
#define C1 7      // fraction of error sent to the 4x4 block below
#define C2 8      // fraction of error sent to the 4x4 block on the right
#define DSHIFT 4  // C1 + C2 + 1 == 1 << DSHIFT would be exact diffusion
#define DSCALE 1  // storage descaling, needed to make the error fit int8_t
862
863
// Quantize as usual, but also compute and return the quantization error.
// Error is already divided by DSHIFT.
// On return, *v holds the dequantized (reconstructed) value.
static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
                          const VP8Matrix* WEBP_RESTRICT const mtx) {
  int V = *v;
  const int sign = (V < 0);
  if (sign) V = -V;   // work on the magnitude; re-apply the sign at the end
  if (V > (int)mtx->zthresh_[0]) {
    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
    const int err = (V - qV);
    *v = sign ? -qV : qV;
    return (sign ? -err : err) >> DSCALE;
  }
  // Below the zero-threshold: the value quantizes to zero and the whole
  // magnitude is the error.
  *v = 0;
  return (sign ? -V : V) >> DSCALE;
}
879
880
// Apply DC error-diffusion to the four 4x4 sub-blocks of each chroma plane
// (tmp[0..3] = U, tmp[4..7] = V): each DC receives a weighted fraction of the
// neighbors' quantization errors before being quantized itself. The errors of
// the last three sub-blocks are saved in rd->derr[] so StoreDiffusionErrors()
// can later propagate them to the next macroblock.
static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
                            const VP8Matrix* WEBP_RESTRICT const mtx,
                            int16_t tmp[][16],
                            VP8ModeScore* WEBP_RESTRICT const rd) {
  // | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
  // left[1] | tmp[2]   tmp[3]        err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {  // ch=0: U plane, ch=1: V plane
    const int8_t* const top = it->top_derr_[it->x_][ch];
    const int8_t* const left = it->left_derr_[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];
    int err0, err1, err2, err3;
    // NOTE: the four quantizations below must run in this exact order, since
    // each err feeds the diffusion of the following sub-blocks.
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}
913
914
static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
915
const VP8ModeScore* WEBP_RESTRICT const rd) {
916
int ch;
917
for (ch = 0; ch <= 1; ++ch) {
918
int8_t* const top = it->top_derr_[it->x_][ch];
919
int8_t* const left = it->left_derr_[ch];
920
left[0] = rd->derr[ch][0]; // restore err1
921
left[1] = 3 * rd->derr[ch][2] >> 2; // ... 3/4th of err3
922
top[0] = rd->derr[ch][1]; // ... err2
923
top[1] = rd->derr[ch][2] - left[1]; // ... 1/4th of err3.
924
}
925
}
926
927
#undef C1
928
#undef C2
929
#undef DSHIFT
930
#undef DSCALE
931
932
//------------------------------------------------------------------------------
933
934
// Quantize and reconstruct the 16x8 chroma area (four 4x4 U blocks followed by
// four 4x4 V blocks) for the given chroma prediction 'mode'.
// Returns the per-block non-zero flags packed into bits 16..23 (nz << 16).
static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
                         VP8ModeScore* WEBP_RESTRICT const rd,
                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  // Forward transform, two 4x4 blocks at a time.
  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  // Apply DC error-diffusion before quantization, when enabled.
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {
    // Trellis path (disabled by default: DO_TRELLIS_UV == 0).
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {  // ch=0: U, ch=2: V
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero = TrellisQuantizeBlock(
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
              dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  // Inverse transform back into the output buffer.
  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}
975
976
//------------------------------------------------------------------------------
977
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lower RD-cost = Rate + lambda * Distortion.
979
980
// Record in dqm->max_edge_ the largest of the first three AC coefficients,
// used as an estimate of the average delta between sub-4x4 blocks.
static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  int max_v = abs(DCs[1]);
  const int cand1 = abs(DCs[2]);
  const int cand2 = abs(DCs[4]);
  if (cand1 > max_v) max_v = cand1;
  if (cand2 > max_v) max_v = cand2;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}
990
991
// Exchange two VP8ModeScore pointers.
static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const saved = *b;
  *b = *a;
  *a = saved;
}
996
997
// Exchange two byte-buffer pointers.
static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const saved = *b;
  *b = *a;
  *a = saved;
}
1002
1003
// Promote the scratch reconstruction buffer (yuv_out2_) to be the current
// output by swapping the iterator's two output-buffer pointers.
static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}
1006
1007
// Evaluate all Intra16 prediction modes by full reconstruction and RD-score,
// keeping the best one in *rd. The reconstruction of the winning mode ends up
// in it->yuv_out_ thanks to the SwapOut() double-buffering.
static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                            VP8ModeScore* WEBP_RESTRICT rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      // New best: swap score holders and output buffers so that rd_best /
      // it->yuv_out_ always track the winning mode so far.
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);  // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}
1064
1065
//------------------------------------------------------------------------------
1066
1067
// return the cost array corresponding to the surrounding prediction modes.
1068
// Return the intra4 mode-cost row selected by the modes of the top and left
// neighboring 4x4 blocks (falling back to the prediction map it->preds_ at
// the macroblock border).
static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int col = it->i4_ & 3;
  const int row = it->i4_ >> 2;
  int left, top;
  if (col == 0) {
    left = it->preds_[row * preds_w - 1];  // from the previous macroblock
  } else {
    left = modes[it->i4_ - 1];
  }
  if (row == 0) {
    top = it->preds_[col - preds_w];       // from the macroblock above
  } else {
    top = modes[it->i4_ - 4];
  }
  return VP8FixedCostsI4[top][left];
}
1076
1077
// Evaluate Intra4x4 coding for the whole macroblock, block by block, picking
// the best sub-mode for each of the 16 blocks by RD-score. Early-outs:
// returns 0 (keep intra16) as soon as the accumulated score exceeds the
// intra16 score in *rd, or when the mode-signaling cost exceeds
// enc->max_i4_header_bits_. Returns 1 when intra4 wins, with *rd updated.
static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                          VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;  // intra4 disabled by configuration
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;  // scratch buffer.

    InitScore(&rd_i4);
    MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty, to avoid flat area to be mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check (cheap partial score before paying for the rate)
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // Ping-pong the scratch/best buffers so best_block always holds the
        // reconstruction of the winning mode so far.
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;  // intra4 already costlier than intra16: bail out
    }
    total_header_bits += (int)rd_i4.H;  // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;  // signaling cost budget exceeded
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;  // select intra4x4 over intra16x16
}
1171
1172
//------------------------------------------------------------------------------
1173
1174
// Evaluate all chroma prediction modes by full reconstruction and RD-score,
// select the best one, and leave its reconstruction in it->yuv_out_.
static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
                       VP8ModeScore* WEBP_RESTRICT const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;  // not calling TDisto here: it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    // Penalize complex modes on flat blocks, same idea as for luma.
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      // 'dst' now tracks the buffer holding the best reconstruction.
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {  // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}
1223
1224
//------------------------------------------------------------------------------
1225
// Final reconstruction and quantization.
1226
1227
// Reconstruct the macroblock using the already-decided modes (intra16 or
// intra4, plus chroma) without re-examining the mode decision.
// The combined non-zero flags are stored into rd->nz.
static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
                           VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    // intra4: reconstruct the 16 sub-blocks one by one, reading each block's
    // mode back from the prediction map.
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}
1251
1252
// Refine intra16/intra4 sub-modes based on distortion only (not rate).
// Fast mode decision used by the lower rd_opt levels: modes are scored with
// SSE * RD_DISTO_MULT plus a fixed-cost term weighted by empirical lambdas,
// instead of a full rate computation. Reconstructs the chosen modes into
// it->yuv_out_ and fills rd->nz / rd->score.
static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* WEBP_RESTRICT const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empiric constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {  // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x_ == 0 || it->y_ == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x_ == 0) ? 0 : 2;
        try_both_modes = 0;  // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);  // the i4 reconstruction was done in yuv_out2_
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}
1365
1366
//------------------------------------------------------------------------------
1367
// Entry point
1368
1369
// Entry point: decide the final coding modes for the current macroblock and
// reconstruct it, using either full RD-optimization (rd_opt > RD_OPT_NONE)
// or the faster distortion-based refinement.
// Returns 1 if the macroblock has no non-zero coefficients (skipped).
int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
                VP8ModeScore* WEBP_RESTRICT const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions needs to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {  // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
    // For method <= 1, we don't re-examine the decision but just go ahead with
    // quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}
1404
1405