GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/libwebp/src/enc/quant_enc.c
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal ([email protected])

#include <assert.h>
#include <math.h>
#include <stdlib.h>  // for abs()

#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"

#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64       // neutral value for susceptibility
#define MIN_ALPHA 30       // lowest usable value for susceptibility
#define MAX_ALPHA 100      // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9      // Scaling constant between the sns value and the QP
                           // power-law modulation. Must be strictly less than 1.

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 10      // I16 mode
#define FLATNESS_LIMIT_I4  3       // I4 mode
#define FLATNESS_LIMIT_UV  2       // UV mode
#define FLATNESS_PENALTY   140     // roughly ~1 bit per block

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)

#define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
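
// Illustrative sketch (not part of the build): MULT_8B(a, b) is a fixed-point
// multiply that treats 'b' as an 8-bit fraction and rounds to nearest, i.e.
// round(a * b / 256). It is used below to scale the TDisto term by 'tlambda'.
#if 0
#include <stdio.h>
int main(void) {
  // e.g. tlambda = 100, distortion = 1000 -> 100 * 1000 / 256 ~= 391
  printf("%d\n", MULT_8B(100, 1000));   // prints 391
  return 0;
}
#endif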

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif   // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static const uint8_t kDcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 10,
  11, 12, 13, 14, 15, 16, 17, 17,
  18, 19, 20, 20, 21, 21, 22, 22,
  23, 23, 24, 25, 25, 26, 27, 28,
  29, 30, 31, 32, 33, 34, 35, 36,
  37, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 46, 47, 48, 49, 50,
  51, 52, 53, 54, 55, 56, 57, 58,
  59, 60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 72, 73, 74,
  75, 76, 76, 77, 78, 79, 80, 81,
  82, 83, 84, 85, 86, 87, 88, 89,
  91, 93, 95, 96, 98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 11,
  12, 13, 14, 15, 16, 17, 18, 19,
  20, 21, 22, 23, 24, 25, 26, 27,
  28, 29, 30, 31, 32, 33, 34, 35,
  36, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 47, 48, 49, 50, 51,
  52, 53, 54, 55, 56, 57, 58, 60,
  62, 64, 66, 68, 70, 72, 74, 76,
  78, 80, 82, 84, 86, 88, 90, 92,
  94, 96, 98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8, 8, 9, 10, 12, 13, 15, 17,
  18, 20, 21, 23, 24, 26, 27, 29,
  31, 32, 34, 35, 37, 38, 40, 41,
  43, 44, 46, 48, 49, 51, 52, 54,
  55, 57, 58, 60, 62, 63, 65, 66,
  68, 69, 71, 72, 74, 75, 77, 79,
  80, 82, 83, 85, 86, 88, 89, 93,
  96, 99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11   // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0, 30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
  }
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {   // we only use sharpening for AC luma coeffs
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen_[i] = 0;
    }
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}
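
// Illustrative sketch (not part of the build): the zthresh_ computation above
// relies on the QFIX/BIAS/QUANTDIV fixed-point helpers from vp8i_enc.h. The
// standalone replica below (the QFIX value and the macro shapes here are
// assumptions) demonstrates the documented invariant: biased quantization
// yields zero exactly when coeff <= zthresh.
#if 0
#include <assert.h>
#include <stdint.h>
#define QFIX 17
#define BIAS(b) ((b) << (QFIX - 8))
#define QUANTDIV(n, iQ, B) ((int)(((n) * (uint64_t)(iQ) + (B)) >> QFIX))
static void CheckZthresh(uint32_t Q, int bias8) {
  const uint32_t iq = (1u << QFIX) / Q;               // reciprocal, as in ExpandMatrix()
  const uint32_t bias = BIAS(bias8);
  const uint32_t zthresh = ((1u << QFIX) - 1 - bias) / iq;
  assert(QUANTDIV(zthresh, iq, bias) == 0);           // at the threshold: still zero
  assert(QUANTDIV(zthresh + 1, iq, bias) > 0);        // just above: non-zero
}
#endif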

static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
      (enc->method_ >= 4) ? enc->config_->sns_strength
                          : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q_i4  = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv  = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_         = (3 * q_i16 * q_i16);
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}
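
// Illustrative sketch (not part of the build): the strength mapping above,
// with VP8FilterStrengthFromDelta() replaced by an assumed base_strength
// value, just to show the scaling and the FSTRENGTH_CUTOFF / 63 clamping.
#if 0
static int FilterStrengthSketch(int base_strength, int filter_strength,
                                int beta) {
  const int level0 = 5 * filter_strength;              // [0..500]
  const int f = base_strength * level0 / (256 + beta);
  return (f < 2) ? 0 : (f > 63) ? 63 : f;              // FSTRENGTH_CUTOFF == 2
}
// e.g. base_strength = 40, '-f 50', beta = 64 -> 40 * 250 / 320 = 31
#endif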

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a linear piece-wise function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}
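
// Illustrative sketch (not part of the build): following the mapping above
// through VP8SetSegmentParams() below, quality 75 gives linear_c = 0.5,
// c = 0.5^(1/3) ~= 0.794, and (with expn ~= 1) a quantizer of about
// 127 * (1 - 0.794) ~= 26.
#if 0
#include <math.h>
#include <stdio.h>
int main(void) {
  const double quality = 75.;
  const double Q = quality / 100.;
  const double linear_c = (Q < 0.75) ? Q * (2. / 3.) : 2. * Q - 1.;
  const double c = pow(linear_c, 1 / 3.);
  printf("q = %d\n", (int)(127. * (1. - c)));   // prints: q = 26
  return 0;
}
#endif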

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {   // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {   // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}
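
// Illustrative sketch (not part of the build): the deduplication above, on a
// plain array of (quant, fstrength) pairs. Segments 0 and 2 collide, so the
// map becomes {0, 1, 0, 2} and only three segments survive.
#if 0
#include <assert.h>
typedef struct { int quant, fstrength; } Seg;
int main(void) {
  const Seg dqm[4] = { {20, 10}, {30, 12}, {20, 10}, {40, 15} };
  int map[4] = { 0, 1, 2, 3 };
  Seg out[4];
  int n = 1, s1, s2;
  out[0] = dqm[0];
  for (s1 = 1; s1 < 4; ++s1) {
    int found = 0;
    for (s2 = 0; s2 < n; ++s2) {
      if (dqm[s1].quant == out[s2].quant &&
          dqm[s1].fstrength == out[s2].fstrength) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;              // as above: s2 == n when no match was found
    if (!found) out[n++] = dqm[s1];
  }
  assert(n == 3 && map[2] == 0 && map[3] == 2);
  return 0;
}
#endif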

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to account for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite a bit more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;        // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, if needed

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);   // finalize quantization matrices
}
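
// Illustrative sketch (not part of the build): the dq_uv_ac mapping above,
// for an assumed uv_alpha_ of 100 and sns_strength of 50:
//   (100 - 64) * (6 - (-4)) / (100 - 30) = 5, then 5 * 50 / 100 = 2,
// which already lies inside [MIN_DQ_UV, MAX_DQ_UV] = [-4, 6].
#if 0
#include <assert.h>
int main(void) {
  int dq_uv_ac = (100 - 64) * (6 - (-4)) / (100 - 30);   // 360 / 70 == 5
  dq_uv_ac = dq_uv_ac * 50 / 100;                        // rescale -> 2
  assert(dq_uv_ac == 2);
  return 0;
}
#endif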

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

const uint16_t VP8Scan[16] = {   // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,  4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D  = 0;
  rd->SD = 0;
  rd->R  = 0;
  rd->H  = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
  dst->H  = src->H;
  dst->nz = src->nz;   // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
  dst->H  += src->H;
  dst->nz |= src->nz;   // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Trellis node
typedef struct {
  int8_t prev;     // best previous node
  int8_t sign;     // sign of coeff_i
  int16_t level;   // level
} Node;

// Score state
typedef struct {
  score_t score;           // partial RD score
  const uint16_t* costs;   // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}
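
// Illustrative sketch (not part of the build): how the lambda weighting above
// arbitrates rate vs. distortion. With lambda = 100, spending 20 fewer bits of
// rate is worth up to 20 * 100 / RD_DISTO_MULT ~= 8 extra units of distortion.
#if 0
#include <assert.h>
typedef long long score_t;   // stand-in for the encoder's score_t
static score_t RDScoreTrellisSketch(int lambda, score_t rate,
                                    score_t distortion) {
  return rate * lambda + 256 /*RD_DISTO_MULT*/ * distortion;
}
int main(void) {
  // candidate A: cheaper but more distorted; candidate B: costlier, cleaner.
  const score_t a = RDScoreTrellisSketch(100, 30, 12);   // 3000 + 3072 = 6072
  const score_t b = RDScoreTrellisSketch(100, 50, 7);    // 5000 + 1792 = 6792
  assert(a < b);   // A wins at this lambda
  return 0;
}
#endif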

static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);   // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score = MAX_COST;
      int best_prev = 0;   // default, in case
      ss_cur[m].score = MAX_COST;
      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract from max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        const score_t score =
            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      // Store the best finding in the current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        const score_t score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;            // best eob position
          best_path[1] = m;            // best node index
          best_path[2] = best_prev;    // best predecessor
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

//------------------------------------------------------------------------------
// DC-error diffusion

// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid the 'rainbow' chessboard pattern of blocks at q~=0.
#define C1 7      // fraction of error sent to the 4x4 block below
#define C2 8      // fraction of error sent to the 4x4 block on the right
#define DSHIFT 4
#define DSCALE 1  // storage descaling, needed to make the error fit int8_t
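
// Illustrative sketch (not part of the build): C1 + C2 == 15, and an error is
// shifted by (DSHIFT - DSCALE) here plus DSCALE in QuantizeSingle(), so
// overall 15/16th of a DC error is diffused to the two neighbouring blocks.
// E.g. an error of 32 contributes 7 * 32 / 16 == 14 below and
// 8 * 32 / 16 == 16 to the right.
#if 0
#include <assert.h>
int main(void) {
  const int err = 32;
  const int below = (7 /*C1*/ * err) >> 4 /*DSHIFT*/;   // 14
  const int right = (8 /*C2*/ * err) >> 4 /*DSHIFT*/;   // 16
  assert(below + right == err * 15 / 16);
  return 0;
}
#endif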
838
839
// Quantize as usual, but also compute and return the quantization error.
840
// Error is already divided by DSHIFT.
841
static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
842
int V = *v;
843
const int sign = (V < 0);
844
if (sign) V = -V;
845
if (V > (int)mtx->zthresh_[0]) {
846
const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
847
const int err = (V - qV);
848
*v = sign ? -qV : qV;
849
return (sign ? -err : err) >> DSCALE;
850
}
851
*v = 0;
852
return (sign ? -V : V) >> DSCALE;
853
}
854
855
static void CorrectDCValues(const VP8EncIterator* const it,
856
const VP8Matrix* const mtx,
857
int16_t tmp[][16], VP8ModeScore* const rd) {
858
// | top[0] | top[1]
859
// --------+--------+---------
860
// left[0] | tmp[0] tmp[1] <-> err0 err1
861
// left[1] | tmp[2] tmp[3] err2 err3
862
//
863
// Final errors {err1,err2,err3} are preserved and later restored
864
// as top[]/left[] on the next block.
865
int ch;
866
for (ch = 0; ch <= 1; ++ch) {
867
const int8_t* const top = it->top_derr_[it->x_][ch];
868
const int8_t* const left = it->left_derr_[ch];
869
int16_t (* const c)[16] = &tmp[ch * 4];
870
int err0, err1, err2, err3;
871
c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
872
err0 = QuantizeSingle(&c[0][0], mtx);
873
c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
874
err1 = QuantizeSingle(&c[1][0], mtx);
875
c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
876
err2 = QuantizeSingle(&c[2][0], mtx);
877
c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
878
err3 = QuantizeSingle(&c[3][0], mtx);
879
// error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
880
// err >> DSCALE will fit in an int8_t type if DSCALE>=1.
881
assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
882
rd->derr[ch][0] = (int8_t)err1;
883
rd->derr[ch][1] = (int8_t)err2;
884
rd->derr[ch][2] = (int8_t)err3;
885
}
886
}
887
888
static void StoreDiffusionErrors(VP8EncIterator* const it,
889
const VP8ModeScore* const rd) {
890
int ch;
891
for (ch = 0; ch <= 1; ++ch) {
892
int8_t* const top = it->top_derr_[it->x_][ch];
893
int8_t* const left = it->left_derr_[ch];
894
left[0] = rd->derr[ch][0]; // restore err1
895
left[1] = 3 * rd->derr[ch][2] >> 2; // ... 3/4th of err3
896
top[0] = rd->derr[ch][1]; // ... err2
897
top[1] = rd->derr[ch][2] - left[1]; // ... 1/4th of err3.
898
}
899
}
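
// Illustrative sketch (not part of the build): the err3 split above hands 3/4
// of the bottom-right error to the 'left' store and the remainder to the
// 'top' store, so nothing is lost to rounding. E.g. for err3 == 10:
#if 0
#include <assert.h>
int main(void) {
  const int err3 = 10;
  const int left1 = 3 * err3 >> 2;   // 7
  const int top1 = err3 - left1;     // 3
  assert(left1 + top1 == err3);
  return 0;
}
#endif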

#undef C1
#undef C2
#undef DSHIFT
#undef DSCALE

//------------------------------------------------------------------------------

static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
                                   &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost (rate weighted against distortion by
// lambda, cf. SetRDScore()).

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine the average
  // delta between the sub-4x4 blocks.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[2]);
  const int v2 = abs(DCs[4]);
  int max_v = (v1 > v0) ? v1 : v0;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}

static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
  score_t score = 0;
  while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
    int i;
    for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
      score += (levels[i] != 0);
      if (score > thresh) return 0;
    }
    levels += 16;
  }
  return 1;
}
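
// Illustrative sketch (not part of the build): IsFlat() counts non-zero AC
// levels across the blocks and bails out once the count exceeds 'thresh'. A
// single 4x4 block with two non-zero ACs is "flat" under
// FLATNESS_LIMIT_UV == 2 but not under a threshold of 1.
#if 0
#include <assert.h>
static int IsFlatSketch(const short* levels, int thresh) {
  int score = 0, i;
  for (i = 1; i < 16; ++i) {   // omit DC, just like IsFlat()
    score += (levels[i] != 0);
    if (score > thresh) return 0;
  }
  return 1;
}
int main(void) {
  const short levels[16] = { 5, 1, 0, 0, 2, 0 };   // DC=5 ignored; two ACs set
  assert(IsFlatSketch(levels, 2) == 1);
  assert(IsFlatSketch(levels, 1) == 0);
  return 0;
}
#endif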

static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;   // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (mode > 0 &&
        IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
      // penalty to avoid flat areas being mispredicted by a complex mode
      rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // if we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record the max delta so we can later adjust the minimal
  // filtering strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}

static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;   // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;   // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;   // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;   // not calling TDisto here: it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {   // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empirical constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;   // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }
      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {   // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we don't re-examine the decision but just go
    // ahead with quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}