CoCalc -- quant

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/libwebp/src/enc/quant_enc.c
⁹⁹¹³ views
1
// Copyright 2011 Google Inc. All Rights Reserved.
2
//
3
// Use of this source code is governed by a BSD-style license
4
// that can be found in the COPYING file in the root of the source
5
// tree. An additional intellectual property rights grant can be found
6
// in the file PATENTS. All contributing project authors may
7
// be found in the AUTHORS file in the root of the source tree.
8
// -----------------------------------------------------------------------------
9
//
10
//   Quantization
11
//
12
// Author: Skal ([email protected])
13

14
#include <assert.h>
15
#include <math.h>
16
#include <stdlib.h>  // for abs()
17

18
#include "src/dsp/quant.h"
19
#include "src/enc/vp8i_enc.h"
20
#include "src/enc/cost_enc.h"
21

22
#define DO_TRELLIS_I4  1
23
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
24
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
25
#define USE_TDISTO 1
26

27
#define MID_ALPHA 64      // neutral value for susceptibility
28
#define MIN_ALPHA 30      // lowest usable value for susceptibility
29
#define MAX_ALPHA 100     // higher meaningful value for susceptibility
30

31
#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
32
                          // power-law modulation. Must be strictly less than 1.
33

34
// number of non-zero coeffs below which we consider the block very flat
35
// (and apply a penalty to complex predictions)
36
#define FLATNESS_LIMIT_I16 0       // I16 mode (special case)
37
#define FLATNESS_LIMIT_I4  3       // I4 mode
38
#define FLATNESS_LIMIT_UV  2       // UV mode
39
#define FLATNESS_PENALTY   140     // roughly ~1bit per block
40

41
#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
42

43
#define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
44

45
// #define DEBUG_BLOCK
46

47
//------------------------------------------------------------------------------
48

49
#if defined(DEBUG_BLOCK)
50

51
#include <stdio.h>
52
#include <stdlib.h>
53

54
static void PrintBlockInfo(const VP8EncIterator* const it,
55
                           const VP8ModeScore* const rd) {
56
  int i, j;
57
  const int is_i16 = (it->mb_->type_ == 1);
58
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
59
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
60
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
61
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
62
  printf("SOURCE / OUTPUT / ABS DELTA\n");
63
  for (j = 0; j < 16; ++j) {
64
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
65
    printf("     ");
66
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
67
    printf("     ");
68
    for (i = 0; i < 16; ++i) {
69
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
70
    }
71
    printf("\n");
72
  }
73
  printf("\n");   // newline before the U/V block
74
  for (j = 0; j < 8; ++j) {
75
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
76
    printf(" ");
77
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
78
    printf("    ");
79
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
80
    printf(" ");
81
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
82
    printf("   ");
83
    for (i = 0; i < 8; ++i) {
84
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
85
    }
86
    printf(" ");
87
    for (i = 8; i < 16; ++i) {
88
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
89
    }
90
    printf("\n");
91
  }
92
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
93
    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
94
    (int)rd->score);
95
  if (is_i16) {
96
    printf("Mode: %d\n", rd->mode_i16);
97
    printf("y_dc_levels:");
98
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
99
    printf("\n");
100
  } else {
101
    printf("Modes[16]: ");
102
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
103
    printf("\n");
104
  }
105
  printf("y_ac_levels:\n");
106
  for (j = 0; j < 16; ++j) {
107
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
108
      printf("%4d ", rd->y_ac_levels[j][i]);
109
    }
110
    printf("\n");
111
  }
112
  printf("\n");
113
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
114
  for (j = 0; j < 8; ++j) {
115
    for (i = 0; i < 16; ++i) {
116
      printf("%4d ", rd->uv_levels[j][i]);
117
    }
118
    printf("\n");
119
  }
120
}
121

122
#endif   // DEBUG_BLOCK
123

124
//------------------------------------------------------------------------------
125

126
static WEBP_INLINE int clip(int v, int m, int M) {
127
  return v < m ? m : v > M ? M : v;
128
}
129

130
static const uint8_t kZigzag[16] = {
131
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
132
};
133

134
static const uint8_t kDcTable[128] = {
135
  4,     5,   6,   7,   8,   9,  10,  10,
136
  11,   12,  13,  14,  15,  16,  17,  17,
137
  18,   19,  20,  20,  21,  21,  22,  22,
138
  23,   23,  24,  25,  25,  26,  27,  28,
139
  29,   30,  31,  32,  33,  34,  35,  36,
140
  37,   37,  38,  39,  40,  41,  42,  43,
141
  44,   45,  46,  46,  47,  48,  49,  50,
142
  51,   52,  53,  54,  55,  56,  57,  58,
143
  59,   60,  61,  62,  63,  64,  65,  66,
144
  67,   68,  69,  70,  71,  72,  73,  74,
145
  75,   76,  76,  77,  78,  79,  80,  81,
146
  82,   83,  84,  85,  86,  87,  88,  89,
147
  91,   93,  95,  96,  98, 100, 101, 102,
148
  104, 106, 108, 110, 112, 114, 116, 118,
149
  122, 124, 126, 128, 130, 132, 134, 136,
150
  138, 140, 143, 145, 148, 151, 154, 157
151
};
152

153
static const uint16_t kAcTable[128] = {
154
  4,     5,   6,   7,   8,   9,  10,  11,
155
  12,   13,  14,  15,  16,  17,  18,  19,
156
  20,   21,  22,  23,  24,  25,  26,  27,
157
  28,   29,  30,  31,  32,  33,  34,  35,
158
  36,   37,  38,  39,  40,  41,  42,  43,
159
  44,   45,  46,  47,  48,  49,  50,  51,
160
  52,   53,  54,  55,  56,  57,  58,  60,
161
  62,   64,  66,  68,  70,  72,  74,  76,
162
  78,   80,  82,  84,  86,  88,  90,  92,
163
  94,   96,  98, 100, 102, 104, 106, 108,
164
  110, 112, 114, 116, 119, 122, 125, 128,
165
  131, 134, 137, 140, 143, 146, 149, 152,
166
  155, 158, 161, 164, 167, 170, 173, 177,
167
  181, 185, 189, 193, 197, 201, 205, 209,
168
  213, 217, 221, 225, 229, 234, 239, 245,
169
  249, 254, 259, 264, 269, 274, 279, 284
170
};
171

172
static const uint16_t kAcTable2[128] = {
173
  8,     8,   9,  10,  12,  13,  15,  17,
174
  18,   20,  21,  23,  24,  26,  27,  29,
175
  31,   32,  34,  35,  37,  38,  40,  41,
176
  43,   44,  46,  48,  49,  51,  52,  54,
177
  55,   57,  58,  60,  62,  63,  65,  66,
178
  68,   69,  71,  72,  74,  75,  77,  79,
179
  80,   82,  83,  85,  86,  88,  89,  93,
180
  96,   99, 102, 105, 108, 111, 114, 117,
181
  120, 124, 127, 130, 133, 136, 139, 142,
182
  145, 148, 151, 155, 158, 161, 164, 167,
183
  170, 173, 176, 179, 184, 189, 193, 198,
184
  203, 207, 212, 217, 221, 226, 230, 235,
185
  240, 244, 249, 254, 258, 263, 268, 274,
186
  280, 286, 292, 299, 305, 311, 317, 323,
187
  330, 336, 342, 348, 354, 362, 370, 379,
188
  385, 393, 401, 409, 416, 424, 432, 440
189
};
190

191
static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
192
  { 96, 110 }, { 96, 108 }, { 110, 115 }
193
};
194

195
// Sharpening by (slightly) raising the hi-frequency coeffs.
196
// Hack-ish but helpful for mid-bitrate range. Use with care.
197
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
198
static const uint8_t kFreqSharpening[16] = {
199
  0,  30, 60, 90,
200
  30, 60, 90, 90,
201
  60, 90, 90, 90,
202
  90, 90, 90, 90
203
};
204

205
//------------------------------------------------------------------------------
206
// Initialize quantization parameters in VP8Matrix
207

208
// Returns the average quantizer
209
static int ExpandMatrix(VP8Matrix* const m, int type) {
210
  int i, sum;
211
  for (i = 0; i < 2; ++i) {
212
    const int is_ac_coeff = (i > 0);
213
    const int bias = kBiasMatrices[type][is_ac_coeff];
214
    m->iq_[i] = (1 << QFIX) / m->q_[i];
215
    m->bias_[i] = BIAS(bias);
216
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
217
    //   * zero if coeff <= zthresh
218
    //   * non-zero if coeff > zthresh
219
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
220
  }
221
  for (i = 2; i < 16; ++i) {
222
    m->q_[i] = m->q_[1];
223
    m->iq_[i] = m->iq_[1];
224
    m->bias_[i] = m->bias_[1];
225
    m->zthresh_[i] = m->zthresh_[1];
226
  }
227
  for (sum = 0, i = 0; i < 16; ++i) {
228
    if (type == 0) {  // we only use sharpening for AC luma coeffs
229
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
230
    } else {
231
      m->sharpen_[i] = 0;
232
    }
233
    sum += m->q_[i];
234
  }
235
  return (sum + 8) >> 4;
236
}
237

238
static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
239

240
static void SetupMatrices(VP8Encoder* enc) {
241
  int i;
242
  const int tlambda_scale =
243
    (enc->method_ >= 4) ? enc->config_->sns_strength
244
                        : 0;
245
  const int num_segments = enc->segment_hdr_.num_segments_;
246
  for (i = 0; i < num_segments; ++i) {
247
    VP8SegmentInfo* const m = &enc->dqm_[i];
248
    const int q = m->quant_;
249
    int q_i4, q_i16, q_uv;
250
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
251
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];
252

253
    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
254
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
255

256
    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
257
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
258

259
    q_i4  = ExpandMatrix(&m->y1_, 0);
260
    q_i16 = ExpandMatrix(&m->y2_, 1);
261
    q_uv  = ExpandMatrix(&m->uv_, 2);
262

263
    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
264
    m->lambda_i16_         = (3 * q_i16 * q_i16);
265
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
266
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
267
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
268
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
269
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
270
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;
271

272
    // none of these constants should be < 1
273
    CheckLambdaValue(&m->lambda_i4_);
274
    CheckLambdaValue(&m->lambda_i16_);
275
    CheckLambdaValue(&m->lambda_uv_);
276
    CheckLambdaValue(&m->lambda_mode_);
277
    CheckLambdaValue(&m->lambda_trellis_i4_);
278
    CheckLambdaValue(&m->lambda_trellis_i16_);
279
    CheckLambdaValue(&m->lambda_trellis_uv_);
280
    CheckLambdaValue(&m->tlambda_);
281

282
    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
283
    m->max_edge_  = 0;
284

285
    m->i4_penalty_ = 1000 * q_i4 * q_i4;
286
  }
287
}
288

289
//------------------------------------------------------------------------------
290
// Initialize filtering parameters
291

292
// Very small filter-strength values have close to no visual effect. So we can
293
// save a little decoding-CPU by turning filtering off for these.
294
#define FSTRENGTH_CUTOFF 2
295

296
static void SetupFilterStrength(VP8Encoder* const enc) {
297
  int i;
298
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
299
  const int level0 = 5 * enc->config_->filter_strength;
300
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
301
    VP8SegmentInfo* const m = &enc->dqm_[i];
302
    // We focus on the quantization of AC coeffs.
303
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
304
    const int base_strength =
305
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
306
    // Segments with lower complexity ('beta') will be less filtered.
307
    const int f = base_strength * level0 / (256 + m->beta_);
308
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
309
  }
310
  // We record the initial strength (mainly for the case of 1-segment only).
311
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
312
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
313
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
314
}
315

316
//------------------------------------------------------------------------------
317

318
// Note: if you change the values below, remember that the max range
319
// allowed by the syntax for DQ_UV is [-16,16].
320
#define MAX_DQ_UV (6)
321
#define MIN_DQ_UV (-4)
322

323
// We want to emulate jpeg-like behaviour where the expected "good" quality
324
// is around q=75. Internally, our "good" middle is around c=50. So we
325
// map accordingly using linear piece-wise function
326
static double QualityToCompression(double c) {
327
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
328
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
329
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
330
  // in the mid-quant range. So we scale the compressibility inversely to
331
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
332
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
333
  // more explicitly.
334
  const double v = pow(linear_c, 1 / 3.);
335
  return v;
336
}
337

338
static double QualityToJPEGCompression(double c, double alpha) {
339
  // We map the complexity 'alpha' and quality setting 'c' to a compression
340
  // exponent empirically matched to the compression curve of libjpeg6b.
341
  // On average, the WebP output size will be roughly similar to that of a
342
  // JPEG file compressed with same quality factor.
343
  const double amin = 0.30;
344
  const double amax = 0.85;
345
  const double exp_min = 0.4;
346
  const double exp_max = 0.9;
347
  const double slope = (exp_min - exp_max) / (amax - amin);
348
  // Linearly interpolate 'expn' from exp_min to exp_max
349
  // in the [amin, amax] range.
350
  const double expn = (alpha > amax) ? exp_min
351
                    : (alpha < amin) ? exp_max
352
                    : exp_max + slope * (alpha - amin);
353
  const double v = pow(c, expn);
354
  return v;
355
}
356

357
static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
358
                                 const VP8SegmentInfo* const S2) {
359
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
360
}
361

362
static void SimplifySegments(VP8Encoder* const enc) {
363
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
364
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
365
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
366
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
367
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
368
                               ? enc->segment_hdr_.num_segments_
369
                               : NUM_MB_SEGMENTS;
370
  int num_final_segments = 1;
371
  int s1, s2;
372
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
373
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
374
    int found = 0;
375
    // check if we already have similar segment
376
    for (s2 = 0; s2 < num_final_segments; ++s2) {
377
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
378
      if (SegmentsAreEquivalent(S1, S2)) {
379
        found = 1;
380
        break;
381
      }
382
    }
383
    map[s1] = s2;
384
    if (!found) {
385
      if (num_final_segments != s1) {
386
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
387
      }
388
      ++num_final_segments;
389
    }
390
  }
391
  if (num_final_segments < num_segments) {  // Remap
392
    int i = enc->mb_w_ * enc->mb_h_;
393
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
394
    enc->segment_hdr_.num_segments_ = num_final_segments;
395
    // Replicate the trailing segment infos (it's mostly cosmetics)
396
    for (i = num_final_segments; i < num_segments; ++i) {
397
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
398
    }
399
  }
400
}
401

402
void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
403
  int i;
404
  int dq_uv_ac, dq_uv_dc;
405
  const int num_segments = enc->segment_hdr_.num_segments_;
406
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
407
  const double Q = quality / 100.;
408
  const double c_base = enc->config_->emulate_jpeg_size ?
409
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
410
      QualityToCompression(Q);
411
  for (i = 0; i < num_segments; ++i) {
412
    // We modulate the base coefficient to accommodate for the quantization
413
    // susceptibility and allow denser segments to be quantized more.
414
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
415
    const double c = pow(c_base, expn);
416
    const int q = (int)(127. * (1. - c));
417
    assert(expn > 0.);
418
    enc->dqm_[i].quant_ = clip(q, 0, 127);
419
  }
420

421
  // purely indicative in the bitstream (except for the 1-segment case)
422
  enc->base_quant_ = enc->dqm_[0].quant_;
423

424
  // fill-in values for the unused segments (required by the syntax)
425
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
426
    enc->dqm_[i].quant_ = enc->base_quant_;
427
  }
428

429
  // uv_alpha_ is normally spread around ~60. The useful range is
430
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
431
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
432
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
433
                                          / (MAX_ALPHA - MIN_ALPHA);
434
  // we rescale by the user-defined strength of adaptation
435
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
436
  // and make it safe.
437
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
438
  // We also boost the dc-uv-quant a little, based on sns-strength, since
439
  // U/V channels are quite more reactive to high quants (flat DC-blocks
440
  // tend to appear, and are unpleasant).
441
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
442
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed
443

444
  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
445
  enc->dq_y2_dc_ = 0;
446
  enc->dq_y2_ac_ = 0;
447
  enc->dq_uv_dc_ = dq_uv_dc;
448
  enc->dq_uv_ac_ = dq_uv_ac;
449

450
  SetupFilterStrength(enc);   // initialize segments' filtering, eventually
451

452
  if (num_segments > 1) SimplifySegments(enc);
453

454
  SetupMatrices(enc);         // finalize quantization matrices
455
}
456

457
//------------------------------------------------------------------------------
458
// Form the predictions in cache
459

460
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
461
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
462
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
463

464
// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
465
static const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
466
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
467
};
468

469
void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
470
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
471
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
472
  VP8EncPredLuma16(it->yuv_p_, left, top);
473
}
474

475
void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
476
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
477
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
478
  VP8EncPredChroma8(it->yuv_p_, left, top);
479
}
480

481
// Form all the ten Intra4x4 predictions in the yuv_p_ cache
482
// for the 4x4 block it->i4_
483
static void MakeIntra4Preds(const VP8EncIterator* const it) {
484
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
485
}
486

487
//------------------------------------------------------------------------------
488
// Quantize
489

490
// Layout:
491
// +----+----+
492
// |YYYY|UUVV| 0
493
// |YYYY|UUVV| 4
494
// |YYYY|....| 8
495
// |YYYY|....| 12
496
// +----+----+
497

498
const uint16_t VP8Scan[16] = {  // Luma
499
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
500
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
501
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
502
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
503
};
504

505
static const uint16_t VP8ScanUV[4 + 4] = {
506
  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
507
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
508
};
509

510
//------------------------------------------------------------------------------
511
// Distortion measurement
512

513
static const uint16_t kWeightY[16] = {
514
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
515
};
516

517
static const uint16_t kWeightTrellis[16] = {
518
#if USE_TDISTO == 0
519
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
520
#else
521
  30, 27, 19, 11,
522
  27, 24, 17, 10,
523
  19, 17, 12,  8,
524
  11, 10,  8,  6
525
#endif
526
};
527

528
// Init/Copy the common fields in score.
529
static void InitScore(VP8ModeScore* const rd) {
530
  rd->D  = 0;
531
  rd->SD = 0;
532
  rd->R  = 0;
533
  rd->H  = 0;
534
  rd->nz = 0;
535
  rd->score = MAX_COST;
536
}
537

538
static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
539
                      const VP8ModeScore* WEBP_RESTRICT const src) {
540
  dst->D  = src->D;
541
  dst->SD = src->SD;
542
  dst->R  = src->R;
543
  dst->H  = src->H;
544
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
545
  dst->score = src->score;
546
}
547

548
static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
549
                     const VP8ModeScore* WEBP_RESTRICT const src) {
550
  dst->D  += src->D;
551
  dst->SD += src->SD;
552
  dst->R  += src->R;
553
  dst->H  += src->H;
554
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
555
  dst->score += src->score;
556
}
557

558
//------------------------------------------------------------------------------
559
// Performs trellis-optimized quantization.
560

561
// Prevents Visual Studio debugger from using this Node struct in place of the Godot Node class.
562
#define Node Node_libwebp_quant
563

564
// Trellis node
565
typedef struct {
566
  int8_t prev;            // best previous node
567
  int8_t sign;            // sign of coeff_i
568
  int16_t level;          // level
569
} Node;
570

571
// Score state
572
typedef struct {
573
  score_t score;          // partial RD score
574
  const uint16_t* costs;  // shortcut to cost tables
575
} ScoreState;
576

577
// If a coefficient was quantized to a value Q (using a neutral bias),
578
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
579
// We don't test negative values though.
580
#define MIN_DELTA 0   // how much lower level to try
581
#define MAX_DELTA 1   // how much higher
582
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
583
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
584
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
585

586
static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
587
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
588
}
589

590
static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
591
                                          score_t distortion) {
592
  return rate * lambda + RD_DISTO_MULT * distortion;
593
}
594

595
// Coefficient type.
596
enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
597

598
static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
599
                                int16_t in[16], int16_t out[16],
600
                                int ctx0, int coeff_type,
601
                                const VP8Matrix* WEBP_RESTRICT const mtx,
602
                                int lambda) {
603
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
604
  CostArrayPtr const costs =
605
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
606
  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
607
  Node nodes[16][NUM_NODES];
608
  ScoreState score_states[2][NUM_NODES];
609
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
610
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
611
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
612
  score_t best_score;
613
  int n, m, p, last;
614

615
  {
616
    score_t cost;
617
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
618
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];
619

620
    // compute the position of the last interesting coefficient
621
    last = first - 1;
622
    for (n = 15; n >= first; --n) {
623
      const int j = kZigzag[n];
624
      const int err = in[j] * in[j];
625
      if (err > thresh) {
626
        last = n;
627
        break;
628
      }
629
    }
630
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
631
    // to last + 1 (inclusive) without losing much.
632
    if (last < 15) ++last;
633

634
    // compute 'skip' score. This is the max score one can do.
635
    cost = VP8BitCost(0, last_proba);
636
    best_score = RDScoreTrellis(lambda, cost, 0);
637

638
    // initialize source node.
639
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
640
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
641
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
642
      ss_cur[m].costs = costs[first][ctx0];
643
    }
644
  }
645

646
  // traverse trellis.
647
  for (n = first; n <= last; ++n) {
648
    const int j = kZigzag[n];
649
    const uint32_t Q  = mtx->q_[j];
650
    const uint32_t iQ = mtx->iq_[j];
651
    const uint32_t B = BIAS(0x00);     // neutral bias
652
    // note: it's important to take sign of the _original_ coeff,
653
    // so we don't have to consider level < 0 afterward.
654
    const int sign = (in[j] < 0);
655
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
656
    int level0 = QUANTDIV(coeff0, iQ, B);
657
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
658
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
659
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
660

661
    {   // Swap current and previous score states
662
      ScoreState* const tmp = ss_cur;
663
      ss_cur = ss_prev;
664
      ss_prev = tmp;
665
    }
666

667
    // test all alternate level values around level0.
668
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
669
      Node* const cur = &NODE(n, m);
670
      const int level = level0 + m;
671
      const int ctx = (level > 2) ? 2 : level;
672
      const int band = VP8EncBands[n + 1];
673
      score_t base_score;
674
      score_t best_cur_score;
675
      int best_prev;
676
      score_t cost, score;
677

678
      ss_cur[m].costs = costs[n + 1][ctx];
679
      if (level < 0 || level > thresh_level) {
680
        ss_cur[m].score = MAX_COST;
681
        // Node is dead.
682
        continue;
683
      }
684

685
      {
686
        // Compute delta_error = how much coding this level will
687
        // subtract to max_error as distortion.
688
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
689
        const int new_error = coeff0 - level * Q;
690
        const int delta_error =
691
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
692
        base_score = RDScoreTrellis(lambda, 0, delta_error);
693
      }
694

695
      // Inspect all possible non-dead predecessors. Retain only the best one.
696
      // The base_score is added to all scores so it is only added for the final
697
      // value after the loop.
698
      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
699
      best_cur_score =
700
          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
701
      best_prev = -MIN_DELTA;
702
      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
703
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
704
        // eliminated since their score can't be better than the current best.
705
        cost = VP8LevelCost(ss_prev[p].costs, level);
706
        // Examine node assuming it's a non-terminal one.
707
        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
708
        if (score < best_cur_score) {
709
          best_cur_score = score;
710
          best_prev = p;
711
        }
712
      }
713
      best_cur_score += base_score;
714
      // Store best finding in current node.
715
      cur->sign = sign;
716
      cur->level = level;
717
      cur->prev = best_prev;
718
      ss_cur[m].score = best_cur_score;
719

720
      // Now, record best terminal node (and thus best entry in the graph).
721
      if (level != 0 && best_cur_score < best_score) {
722
        const score_t last_pos_cost =
723
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
724
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
725
        score = best_cur_score + last_pos_score;
726
        if (score < best_score) {
727
          best_score = score;
728
          best_path[0] = n;                     // best eob position
729
          best_path[1] = m;                     // best node index
730
          best_path[2] = best_prev;             // best predecessor
731
        }
732
      }
733
    }
734
  }
735

736
  // Fresh start
737
  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
738
  if (coeff_type == TYPE_I16_AC) {
739
    memset(in + 1, 0, 15 * sizeof(*in));
740
    memset(out + 1, 0, 15 * sizeof(*out));
741
  } else {
742
    memset(in, 0, 16 * sizeof(*in));
743
    memset(out, 0, 16 * sizeof(*out));
744
  }
745
  if (best_path[0] == -1) {
746
    return 0;  // skip!
747
  }
748

749
  {
750
    // Unwind the best path.
751
    // Note: best-prev on terminal node is not necessarily equal to the
752
    // best_prev for non-terminal. So we patch best_path[2] in.
753
    int nz = 0;
754
    int best_node = best_path[1];
755
    n = best_path[0];
756
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
757

758
    for (; n >= first; --n) {
759
      const Node* const node = &NODE(n, best_node);
760
      const int j = kZigzag[n];
761
      out[n] = node->sign ? -node->level : node->level;
762
      nz |= node->level;
763
      in[j] = out[n] * mtx->q_[j];
764
      best_node = node->prev;
765
    }
766
    return (nz != 0);
767
  }
768
}
769

770
#undef NODE
771

772
//------------------------------------------------------------------------------
773
// Performs: difference, transform, quantize, back-transform, add
774
// all at once. Output is the reconstructed block in *yuv_out, and the
775
// quantized levels in *levels.
776

777
static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
778
                              VP8ModeScore* WEBP_RESTRICT const rd,
779
                              uint8_t* WEBP_RESTRICT const yuv_out,
780
                              int mode) {
781
  const VP8Encoder* const enc = it->enc_;
782
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
783
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
784
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
785
  int nz = 0;
786
  int n;
787
  int16_t tmp[16][16], dc_tmp[16];
788

789
  for (n = 0; n < 16; n += 2) {
790
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
791
  }
792
  VP8FTransformWHT(tmp[0], dc_tmp);
793
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
794

795
  if (DO_TRELLIS_I16 && it->do_trellis_) {
796
    int x, y;
797
    VP8IteratorNzToBytes(it);
798
    for (y = 0, n = 0; y < 4; ++y) {
799
      for (x = 0; x < 4; ++x, ++n) {
800
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
801
        const int non_zero = TrellisQuantizeBlock(
802
            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
803
            dqm->lambda_trellis_i16_);
804
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
805
        rd->y_ac_levels[n][0] = 0;
806
        nz |= non_zero << n;
807
      }
808
    }
809
  } else {
810
    for (n = 0; n < 16; n += 2) {
811
      // Zero-out the first coeff, so that: a) nz is correct below, and
812
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
813
      tmp[n][0] = tmp[n + 1][0] = 0;
814
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
815
      assert(rd->y_ac_levels[n + 0][0] == 0);
816
      assert(rd->y_ac_levels[n + 1][0] == 0);
817
    }
818
  }
819

820
  // Transform back
821
  VP8TransformWHT(dc_tmp, tmp[0]);
822
  for (n = 0; n < 16; n += 2) {
823
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
824
  }
825

826
  return nz;
827
}
828

829
static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
830
                             int16_t levels[16],
831
                             const uint8_t* WEBP_RESTRICT const src,
832
                             uint8_t* WEBP_RESTRICT const yuv_out,
833
                             int mode) {
834
  const VP8Encoder* const enc = it->enc_;
835
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
836
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
837
  int nz = 0;
838
  int16_t tmp[16];
839

840
  VP8FTransform(src, ref, tmp);
841
  if (DO_TRELLIS_I4 && it->do_trellis_) {
842
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
843
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
844
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
845
                              dqm->lambda_trellis_i4_);
846
  } else {
847
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
848
  }
849
  VP8ITransform(ref, tmp, yuv_out, 0);
850
  return nz;
851
}
852

853
//------------------------------------------------------------------------------
854
// DC-error diffusion
855

856
// Diffusion weights. We under-correct a bit (15/16th of the error is actually
857
// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
858
#define C1 7    // fraction of error sent to the 4x4 block below
859
#define C2 8    // fraction of error sent to the 4x4 block on the right
860
#define DSHIFT 4
861
#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
862

863
// Quantize as usual, but also compute and return the quantization error.
864
// Error is already divided by DSHIFT.
865
static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
866
                          const VP8Matrix* WEBP_RESTRICT const mtx) {
867
  int V = *v;
868
  const int sign = (V < 0);
869
  if (sign) V = -V;
870
  if (V > (int)mtx->zthresh_[0]) {
871
    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
872
    const int err = (V - qV);
873
    *v = sign ? -qV : qV;
874
    return (sign ? -err : err) >> DSCALE;
875
  }
876
  *v = 0;
877
  return (sign ? -V : V) >> DSCALE;
878
}
879

880
static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
881
                            const VP8Matrix* WEBP_RESTRICT const mtx,
882
                            int16_t tmp[][16],
883
                            VP8ModeScore* WEBP_RESTRICT const rd) {
884
  //         | top[0] | top[1]
885
  // --------+--------+---------
886
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
887
  // left[1] | tmp[2]   tmp[3]        err2 err3
888
  //
889
  // Final errors {err1,err2,err3} are preserved and later restored
890
  // as top[]/left[] on the next block.
891
  int ch;
892
  for (ch = 0; ch <= 1; ++ch) {
893
    const int8_t* const top = it->top_derr_[it->x_][ch];
894
    const int8_t* const left = it->left_derr_[ch];
895
    int16_t (* const c)[16] = &tmp[ch * 4];
896
    int err0, err1, err2, err3;
897
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
898
    err0 = QuantizeSingle(&c[0][0], mtx);
899
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
900
    err1 = QuantizeSingle(&c[1][0], mtx);
901
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
902
    err2 = QuantizeSingle(&c[2][0], mtx);
903
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
904
    err3 = QuantizeSingle(&c[3][0], mtx);
905
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
906
    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
907
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
908
    rd->derr[ch][0] = (int8_t)err1;
909
    rd->derr[ch][1] = (int8_t)err2;
910
    rd->derr[ch][2] = (int8_t)err3;
911
  }
912
}
913

914
static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
915
                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
916
  int ch;
917
  for (ch = 0; ch <= 1; ++ch) {
918
    int8_t* const top = it->top_derr_[it->x_][ch];
919
    int8_t* const left = it->left_derr_[ch];
920
    left[0] = rd->derr[ch][0];            // restore err1
921
    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
922
    top[0]  = rd->derr[ch][1];            //     ... err2
923
    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
924
  }
925
}
926

927
#undef C1
928
#undef C2
929
#undef DSHIFT
930
#undef DSCALE
931

932
//------------------------------------------------------------------------------
933

934
static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
935
                         VP8ModeScore* WEBP_RESTRICT const rd,
936
                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
937
  const VP8Encoder* const enc = it->enc_;
938
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
939
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
940
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
941
  int nz = 0;
942
  int n;
943
  int16_t tmp[8][16];
944

945
  for (n = 0; n < 8; n += 2) {
946
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
947
  }
948
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
949

950
  if (DO_TRELLIS_UV && it->do_trellis_) {
951
    int ch, x, y;
952
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
953
      for (y = 0; y < 2; ++y) {
954
        for (x = 0; x < 2; ++x, ++n) {
955
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
956
          const int non_zero = TrellisQuantizeBlock(
957
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
958
              dqm->lambda_trellis_uv_);
959
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
960
          nz |= non_zero << n;
961
        }
962
      }
963
    }
964
  } else {
965
    for (n = 0; n < 8; n += 2) {
966
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
967
    }
968
  }
969

970
  for (n = 0; n < 8; n += 2) {
971
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
972
  }
973
  return (nz << 16);
974
}
975

976
//------------------------------------------------------------------------------
977
// RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
978
// Pick the mode is lower RD-cost = Rate + lambda * Distortion.
979

980
static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
981
  // We look at the first three AC coefficients to determine what is the average
982
  // delta between each sub-4x4 block.
983
  const int v0 = abs(DCs[1]);
984
  const int v1 = abs(DCs[2]);
985
  const int v2 = abs(DCs[4]);
986
  int max_v = (v1 > v0) ? v1 : v0;
987
  max_v = (v2 > max_v) ? v2 : max_v;
988
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
989
}
990

991
static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
992
  VP8ModeScore* const tmp = *a;
993
  *a = *b;
994
  *b = tmp;
995
}
996

997
static void SwapPtr(uint8_t** a, uint8_t** b) {
998
  uint8_t* const tmp = *a;
999
  *a = *b;
1000
  *b = tmp;
1001
}
1002

1003
static void SwapOut(VP8EncIterator* const it) {
1004
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
1005
}
1006

1007
static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
1008
                            VP8ModeScore* WEBP_RESTRICT rd) {
1009
  const int kNumBlocks = 16;
1010
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1011
  const int lambda = dqm->lambda_i16_;
1012
  const int tlambda = dqm->tlambda_;
1013
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
1014
  VP8ModeScore rd_tmp;
1015
  VP8ModeScore* rd_cur = &rd_tmp;
1016
  VP8ModeScore* rd_best = rd;
1017
  int mode;
1018
  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);
1019

1020
  rd->mode_i16 = -1;
1021
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1022
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
1023
    rd_cur->mode_i16 = mode;
1024

1025
    // Reconstruct
1026
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
1027

1028
    // Measure RD-score
1029
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
1030
    rd_cur->SD =
1031
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
1032
    rd_cur->H = VP8FixedCostsI16[mode];
1033
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
1034
    if (is_flat) {
1035
      // refine the first impression (which was in pixel space)
1036
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
1037
      if (is_flat) {
1038
        // Block is very flat. We put emphasis on the distortion being very low!
1039
        rd_cur->D *= 2;
1040
        rd_cur->SD *= 2;
1041
      }
1042
    }
1043

1044
    // Since we always examine Intra16 first, we can overwrite *rd directly.
1045
    SetRDScore(lambda, rd_cur);
1046
    if (mode == 0 || rd_cur->score < rd_best->score) {
1047
      SwapModeScore(&rd_cur, &rd_best);
1048
      SwapOut(it);
1049
    }
1050
  }
1051
  if (rd_best != rd) {
1052
    memcpy(rd, rd_best, sizeof(*rd));
1053
  }
1054
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
1055
  VP8SetIntra16Mode(it, rd->mode_i16);
1056

1057
  // we have a blocky macroblock (only DCs are non-zero) with fairly high
1058
  // distortion, record max delta so we can later adjust the minimal filtering
1059
  // strength needed to smooth these blocks out.
1060
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
1061
    StoreMaxDelta(dqm, rd->y_dc_levels);
1062
  }
1063
}
1064

1065
//------------------------------------------------------------------------------
1066

1067
// return the cost array corresponding to the surrounding prediction modes.
1068
static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
1069
                                     const uint8_t modes[16]) {
1070
  const int preds_w = it->enc_->preds_w_;
1071
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
1072
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
1073
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
1074
  return VP8FixedCostsI4[top][left];
1075
}
1076

1077
static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
1078
                          VP8ModeScore* WEBP_RESTRICT const rd) {
1079
  const VP8Encoder* const enc = it->enc_;
1080
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
1081
  const int lambda = dqm->lambda_i4_;
1082
  const int tlambda = dqm->tlambda_;
1083
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
1084
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
1085
  int total_header_bits = 0;
1086
  VP8ModeScore rd_best;
1087

1088
  if (enc->max_i4_header_bits_ == 0) {
1089
    return 0;
1090
  }
1091

1092
  InitScore(&rd_best);
1093
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
1094
  SetRDScore(dqm->lambda_mode_, &rd_best);
1095
  VP8IteratorStartI4(it);
1096
  do {
1097
    const int kNumBlocks = 1;
1098
    VP8ModeScore rd_i4;
1099
    int mode;
1100
    int best_mode = -1;
1101
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
1102
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1103
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
1104
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.
1105

1106
    InitScore(&rd_i4);
1107
    MakeIntra4Preds(it);
1108
    for (mode = 0; mode < NUM_BMODES; ++mode) {
1109
      VP8ModeScore rd_tmp;
1110
      int16_t tmp_levels[16];
1111

1112
      // Reconstruct
1113
      rd_tmp.nz =
1114
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
1115

1116
      // Compute RD-score
1117
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
1118
      rd_tmp.SD =
1119
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
1120
                  : 0;
1121
      rd_tmp.H = mode_costs[mode];
1122

1123
      // Add flatness penalty, to avoid flat area to be mispredicted
1124
      // by a complex mode.
1125
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
1126
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
1127
      } else {
1128
        rd_tmp.R = 0;
1129
      }
1130

1131
      // early-out check
1132
      SetRDScore(lambda, &rd_tmp);
1133
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;
1134

1135
      // finish computing score
1136
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
1137
      SetRDScore(lambda, &rd_tmp);
1138

1139
      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
1140
        CopyScore(&rd_i4, &rd_tmp);
1141
        best_mode = mode;
1142
        SwapPtr(&tmp_dst, &best_block);
1143
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
1144
               sizeof(rd_best.y_ac_levels[it->i4_]));
1145
      }
1146
    }
1147
    SetRDScore(dqm->lambda_mode_, &rd_i4);
1148
    AddScore(&rd_best, &rd_i4);
1149
    if (rd_best.score >= rd->score) {
1150
      return 0;
1151
    }
1152
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
1153
    if (total_header_bits > enc->max_i4_header_bits_) {
1154
      return 0;
1155
    }
1156
    // Copy selected samples if not in the right place already.
1157
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
1158
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
1159
    }
1160
    rd->modes_i4[it->i4_] = best_mode;
1161
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
1162
  } while (VP8IteratorRotateI4(it, best_blocks));
1163

1164
  // finalize state
1165
  CopyScore(rd, &rd_best);
1166
  VP8SetIntra4Mode(it, rd->modes_i4);
1167
  SwapOut(it);
1168
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
1169
  return 1;   // select intra4x4 over intra16x16
1170
}
1171

1172
//------------------------------------------------------------------------------
1173

1174
static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
1175
                       VP8ModeScore* WEBP_RESTRICT const rd) {
1176
  const int kNumBlocks = 8;
1177
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1178
  const int lambda = dqm->lambda_uv_;
1179
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1180
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
1181
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
1182
  uint8_t* dst = dst0;
1183
  VP8ModeScore rd_best;
1184
  int mode;
1185

1186
  rd->mode_uv = -1;
1187
  InitScore(&rd_best);
1188
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1189
    VP8ModeScore rd_uv;
1190

1191
    // Reconstruct
1192
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
1193

1194
    // Compute RD-score
1195
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
1196
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
1197
    rd_uv.H  = VP8FixedCostsUV[mode];
1198
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
1199
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
1200
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
1201
    }
1202

1203
    SetRDScore(lambda, &rd_uv);
1204
    if (mode == 0 || rd_uv.score < rd_best.score) {
1205
      CopyScore(&rd_best, &rd_uv);
1206
      rd->mode_uv = mode;
1207
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
1208
      if (it->top_derr_ != NULL) {
1209
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
1210
      }
1211
      SwapPtr(&dst, &tmp_dst);
1212
    }
1213
  }
1214
  VP8SetIntraUVMode(it, rd->mode_uv);
1215
  AddScore(rd, &rd_best);
1216
  if (dst != dst0) {   // copy 16x8 block if needed
1217
    VP8Copy16x8(dst, dst0);
1218
  }
1219
  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
1220
    StoreDiffusionErrors(it, rd);
1221
  }
1222
}
1223

1224
//------------------------------------------------------------------------------
1225
// Final reconstruction and quantization.
1226

1227
static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
1228
                           VP8ModeScore* WEBP_RESTRICT const rd) {
1229
  const VP8Encoder* const enc = it->enc_;
1230
  const int is_i16 = (it->mb_->type_ == 1);
1231
  int nz = 0;
1232

1233
  if (is_i16) {
1234
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
1235
  } else {
1236
    VP8IteratorStartI4(it);
1237
    do {
1238
      const int mode =
1239
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
1240
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1241
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
1242
      MakeIntra4Preds(it);
1243
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1244
                              src, dst, mode) << it->i4_;
1245
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
1246
  }
1247

1248
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1249
  rd->nz = nz;
1250
}
1251

1252
// Refine intra16/intra4 sub-modes based on distortion only (not rate).
1253
static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
1254
                                  int try_both_modes, int refine_uv_mode,
1255
                                  VP8ModeScore* WEBP_RESTRICT const rd) {
1256
  score_t best_score = MAX_COST;
1257
  int nz = 0;
1258
  int mode;
1259
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);
1260

1261
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1262
  // Some empiric constants, of approximate order of magnitude.
1263
  const int lambda_d_i16 = 106;
1264
  const int lambda_d_i4 = 11;
1265
  const int lambda_d_uv = 120;
1266
  score_t score_i4 = dqm->i4_penalty_;
1267
  score_t i4_bit_sum = 0;
1268
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
1269
                                           : MAX_COST;  // no early-out allowed
1270

1271
  if (is_i16) {   // First, evaluate Intra16 distortion
1272
    int best_mode = -1;
1273
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
1274
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1275
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1276
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
1277
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
1278
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
1279
        continue;
1280
      }
1281

1282
      if (score < best_score) {
1283
        best_mode = mode;
1284
        best_score = score;
1285
      }
1286
    }
1287
    if (it->x_ == 0 || it->y_ == 0) {
1288
      // avoid starting a checkerboard resonance from the border. See bug #432.
1289
      if (IsFlatSource16(src)) {
1290
        best_mode = (it->x_ == 0) ? 0 : 2;
1291
        try_both_modes = 0;  // stick to i16
1292
      }
1293
    }
1294
    VP8SetIntra16Mode(it, best_mode);
1295
    // we'll reconstruct later, if i16 mode actually gets selected
1296
  }
1297

1298
  // Next, evaluate Intra4
1299
  if (try_both_modes || !is_i16) {
1300
    // We don't evaluate the rate here, but just account for it through a
1301
    // constant penalty (i4 mode usually needs more bits compared to i16).
1302
    is_i16 = 0;
1303
    VP8IteratorStartI4(it);
1304
    do {
1305
      int best_i4_mode = -1;
1306
      score_t best_i4_score = MAX_COST;
1307
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1308
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1309

1310
      MakeIntra4Preds(it);
1311
      for (mode = 0; mode < NUM_BMODES; ++mode) {
1312
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1313
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
1314
                            + mode_costs[mode] * lambda_d_i4;
1315
        if (score < best_i4_score) {
1316
          best_i4_mode = mode;
1317
          best_i4_score = score;
1318
        }
1319
      }
1320
      i4_bit_sum += mode_costs[best_i4_mode];
1321
      rd->modes_i4[it->i4_] = best_i4_mode;
1322
      score_i4 += best_i4_score;
1323
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
1324
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
1325
        is_i16 = 1;
1326
        break;
1327
      } else {  // reconstruct partial block inside yuv_out2_ buffer
1328
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
1329
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1330
                                src, tmp_dst, best_i4_mode) << it->i4_;
1331
      }
1332
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
1333
  }
1334

1335
  // Final reconstruction, depending on which mode is selected.
1336
  if (!is_i16) {
1337
    VP8SetIntra4Mode(it, rd->modes_i4);
1338
    SwapOut(it);
1339
    best_score = score_i4;
1340
  } else {
1341
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
1342
  }
1343

1344
  // ... and UV!
1345
  if (refine_uv_mode) {
1346
    int best_mode = -1;
1347
    score_t best_uv_score = MAX_COST;
1348
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1349
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1350
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
1351
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
1352
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
1353
      if (score < best_uv_score) {
1354
        best_mode = mode;
1355
        best_uv_score = score;
1356
      }
1357
    }
1358
    VP8SetIntraUVMode(it, best_mode);
1359
  }
1360
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1361

1362
  rd->nz = nz;
1363
  rd->score = best_score;
1364
}
1365

1366
//------------------------------------------------------------------------------
1367
// Entry point
1368

1369
int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
1370
                VP8ModeScore* WEBP_RESTRICT const rd,
1371
                VP8RDLevel rd_opt) {
1372
  int is_skipped;
1373
  const int method = it->enc_->method_;
1374

1375
  InitScore(rd);
1376

1377
  // We can perform predictions for Luma16x16 and Chroma8x8 already.
1378
  // Luma4x4 predictions needs to be done as-we-go.
1379
  VP8MakeLuma16Preds(it);
1380
  VP8MakeChroma8Preds(it);
1381

1382
  if (rd_opt > RD_OPT_NONE) {
1383
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1384
    PickBestIntra16(it, rd);
1385
    if (method >= 2) {
1386
      PickBestIntra4(it, rd);
1387
    }
1388
    PickBestUV(it, rd);
1389
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
1390
      it->do_trellis_ = 1;
1391
      SimpleQuantize(it, rd);
1392
    }
1393
  } else {
1394
    // At this point we have heuristically decided intra16 / intra4.
1395
    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
1396
    // For method <= 1, we don't re-examine the decision but just go ahead with
1397
    // quantization/reconstruction.
1398
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
1399
  }
1400
  is_skipped = (rd->nz == 0);
1401
  VP8SetSkip(it, is_skipped);
1402
  return is_skipped;
1403
}
1404

1405
Product

Resources

Company