CoCalc -- quant

GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/libwebp/src/enc/quant_enc.c
¹⁶³⁴⁴ views
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
//   Quantization
//
// Author: Skal ([email protected])

14
#include <assert.h>
15
#include <math.h>
16
#include <stdlib.h>  // for abs()
17

18
#include "src/enc/vp8i_enc.h"
19
#include "src/enc/cost_enc.h"
20

21
// Compile-time switches enabling trellis quantization per coefficient plane.
#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
// When 0, kWeightTrellis[] is flat (all 16); otherwise frequency-weighted.
#define USE_TDISTO 1

#define MID_ALPHA 64      // neutral value for susceptibility
#define MIN_ALPHA 30      // lowest usable value for susceptibility
#define MAX_ALPHA 100     // higher meaningful value for susceptibility

#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                          // power-law modulation. Must be strictly less than 1.

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 10      // I16 mode
#define FLATNESS_LIMIT_I4  3       // I4 mode
#define FLATNESS_LIMIT_UV  2       // UV mode
#define FLATNESS_PENALTY   140     // roughly ~1bit per block

// Rounded fixed-point product: (a * b) / 256, rounded to nearest.
#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)

#define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
43

44
// #define DEBUG_BLOCK
45

46
//------------------------------------------------------------------------------
47

48
#if defined(DEBUG_BLOCK)
49

50
#include <stdio.h>
51
#include <stdlib.h>
52

53
static void PrintBlockInfo(const VP8EncIterator* const it,
54
                           const VP8ModeScore* const rd) {
55
  int i, j;
56
  const int is_i16 = (it->mb_->type_ == 1);
57
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
58
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
59
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
60
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
61
  printf("SOURCE / OUTPUT / ABS DELTA\n");
62
  for (j = 0; j < 16; ++j) {
63
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
64
    printf("     ");
65
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
66
    printf("     ");
67
    for (i = 0; i < 16; ++i) {
68
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
69
    }
70
    printf("\n");
71
  }
72
  printf("\n");   // newline before the U/V block
73
  for (j = 0; j < 8; ++j) {
74
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
75
    printf(" ");
76
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
77
    printf("    ");
78
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
79
    printf(" ");
80
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
81
    printf("   ");
82
    for (i = 0; i < 8; ++i) {
83
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
84
    }
85
    printf(" ");
86
    for (i = 8; i < 16; ++i) {
87
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
88
    }
89
    printf("\n");
90
  }
91
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
92
    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
93
    (int)rd->score);
94
  if (is_i16) {
95
    printf("Mode: %d\n", rd->mode_i16);
96
    printf("y_dc_levels:");
97
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
98
    printf("\n");
99
  } else {
100
    printf("Modes[16]: ");
101
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
102
    printf("\n");
103
  }
104
  printf("y_ac_levels:\n");
105
  for (j = 0; j < 16; ++j) {
106
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
107
      printf("%4d ", rd->y_ac_levels[j][i]);
108
    }
109
    printf("\n");
110
  }
111
  printf("\n");
112
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
113
  for (j = 0; j < 8; ++j) {
114
    for (i = 0; i < 16; ++i) {
115
      printf("%4d ", rd->uv_levels[j][i]);
116
    }
117
    printf("\n");
118
  }
119
}
120

121
#endif   // DEBUG_BLOCK
122

123
//------------------------------------------------------------------------------
124

125
// Clamps 'v' to the [m, M] range. The lower bound is tested first, so 'm' is
// returned whenever v < m.
static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}
128

129
// kZigzag[n] is the index, within a 16-coefficient block, of the n-th
// coefficient in zigzag scan order.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

// Quantizer steps for DC coefficients, indexed by a quantizer index in [0..127].
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

// Quantizer steps for AC coefficients, indexed by a quantizer index in [0..127].
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

// Alternate AC quantizer steps (used for the 'y2' matrix), indexed by a
// quantizer index in [0..127].
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

// Rounding biases, one row per matrix type.
static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
203

204
//------------------------------------------------------------------------------
205
// Initialize quantization parameters in VP8Matrix
206

207
// Returns the average quantizer
208
static int ExpandMatrix(VP8Matrix* const m, int type) {
209
  int i, sum;
210
  for (i = 0; i < 2; ++i) {
211
    const int is_ac_coeff = (i > 0);
212
    const int bias = kBiasMatrices[type][is_ac_coeff];
213
    m->iq_[i] = (1 << QFIX) / m->q_[i];
214
    m->bias_[i] = BIAS(bias);
215
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
216
    //   * zero if coeff <= zthresh
217
    //   * non-zero if coeff > zthresh
218
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
219
  }
220
  for (i = 2; i < 16; ++i) {
221
    m->q_[i] = m->q_[1];
222
    m->iq_[i] = m->iq_[1];
223
    m->bias_[i] = m->bias_[1];
224
    m->zthresh_[i] = m->zthresh_[1];
225
  }
226
  for (sum = 0, i = 0; i < 16; ++i) {
227
    if (type == 0) {  // we only use sharpening for AC luma coeffs
228
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
229
    } else {
230
      m->sharpen_[i] = 0;
231
    }
232
    sum += m->q_[i];
233
  }
234
  return (sum + 8) >> 4;
235
}
236

237
// Raises '*v' to 1 when it is smaller; larger values are left untouched.
static void CheckLambdaValue(int* const v) {
  if (*v < 1) {
    *v = 1;
  }
}
238

239
// For each of the 'num_segments_' active segments: looks up the DC/AC
// quantizer steps of the y1, y2 and uv matrices from the segment's quant_
// index (plus the encoder-wide dq_* offsets), expands the matrices, and
// derives the lambdas, min_disto_, max_edge_ and i4_penalty_ from the
// resulting average quantizers.
static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
    (enc->method_ >= 4) ? enc->config_->sns_strength
                        : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    // NOTE(review): the upper bound here is 117, not 127, which caps the
    // U/V DC step at kDcTable[117] = 132. Presumably deliberate; confirm
    // against the bitstream specification.
    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q_i4  = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv  = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_         = (3 * q_i16 * q_i16);
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}
287

288
//------------------------------------------------------------------------------
289
// Initialize filtering parameters
290

291
// Very small filter-strength values have close to no visual effect. So we can
292
// save a little decoding-CPU by turning filtering off for these.
293
#define FSTRENGTH_CUTOFF 2
294

295
static void SetupFilterStrength(VP8Encoder* const enc) {
296
  int i;
297
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
298
  const int level0 = 5 * enc->config_->filter_strength;
299
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
300
    VP8SegmentInfo* const m = &enc->dqm_[i];
301
    // We focus on the quantization of AC coeffs.
302
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
303
    const int base_strength =
304
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
305
    // Segments with lower complexity ('beta') will be less filtered.
306
    const int f = base_strength * level0 / (256 + m->beta_);
307
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
308
  }
309
  // We record the initial strength (mainly for the case of 1-segment only).
310
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
311
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
312
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
313
}
314

315
//------------------------------------------------------------------------------
316

317
// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)
321

322
// We want to emulate jpeg-like behaviour where the expected "good" quality
323
// is around q=75. Internally, our "good" middle is around c=50. So we
324
// map accordingly using linear piece-wise function
325
static double QualityToCompression(double c) {
326
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
327
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
328
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
329
  // in the mid-quant range. So we scale the compressibility inversely to
330
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
331
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
332
  // more explicitly.
333
  const double v = pow(linear_c, 1 / 3.);
334
  return v;
335
}
336

337
static double QualityToJPEGCompression(double c, double alpha) {
338
  // We map the complexity 'alpha' and quality setting 'c' to a compression
339
  // exponent empirically matched to the compression curve of libjpeg6b.
340
  // On average, the WebP output size will be roughly similar to that of a
341
  // JPEG file compressed with same quality factor.
342
  const double amin = 0.30;
343
  const double amax = 0.85;
344
  const double exp_min = 0.4;
345
  const double exp_max = 0.9;
346
  const double slope = (exp_min - exp_max) / (amax - amin);
347
  // Linearly interpolate 'expn' from exp_min to exp_max
348
  // in the [amin, amax] range.
349
  const double expn = (alpha > amax) ? exp_min
350
                    : (alpha < amin) ? exp_max
351
                    : exp_max + slope * (alpha - amin);
352
  const double v = pow(c, expn);
353
  return v;
354
}
355

356
static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
357
                                 const VP8SegmentInfo* const S2) {
358
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
359
}
360

361
// Merges segments that are equivalent (see SegmentsAreEquivalent()).
// The distinct segments are compacted at the front of enc->dqm_[]. If any
// segment was merged, the per-macroblock segment ids are remapped and
// segment_hdr_.num_segments_ is lowered accordingly.
static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    // If no match was found, s2 == num_final_segments here, which is the slot
    // the segment is about to be moved to.
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}
400

401
// Sets up the quantization and filtering parameters of all segments.
//  enc:     encoder whose dqm_[], base_quant_, dq_* offsets, filter header
//           and segment header are updated
//  quality: quality setting; it is divided by 100 before use
// Steps: compute the quantizer index of each active segment, fill in the
// unused ones, derive the U/V quantizer offsets, set up filter strengths,
// merge equivalent segments (if more than one) and build the matrices.
void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, eventually

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}
455

456
//------------------------------------------------------------------------------
457
// Form the predictions in cache
458

459
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
460
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
461
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
462

463
// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
464
const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
465
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
466
};
467

468
void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
469
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
470
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
471
  VP8EncPredLuma16(it->yuv_p_, left, top);
472
}
473

474
void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
475
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
476
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
477
  VP8EncPredChroma8(it->yuv_p_, left, top);
478
}
479

480
void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
481
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
482
}
483

484
//------------------------------------------------------------------------------
485
// Quantize
486

487
// Layout:
488
// +----+----+
489
// |YYYY|UUVV| 0
490
// |YYYY|UUVV| 4
491
// |YYYY|....| 8
492
// |YYYY|....| 12
493
// +----+----+
494

495
const uint16_t VP8Scan[16] = {  // Luma
496
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
497
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
498
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
499
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
500
};
501

502
static const uint16_t VP8ScanUV[4 + 4] = {
503
  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
504
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
505
};
506

507
//------------------------------------------------------------------------------
508
// Distortion measurement
509

510
// Per-coefficient weights for a 4x4 block, largest at index 0.
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

// Per-coefficient distortion weights used by the trellis quantizer.
// Flat (all 16) when USE_TDISTO is 0, frequency-weighted otherwise.
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};
524

525
// Init/Copy the common fields in score.
526
static void InitScore(VP8ModeScore* const rd) {
527
  rd->D  = 0;
528
  rd->SD = 0;
529
  rd->R  = 0;
530
  rd->H  = 0;
531
  rd->nz = 0;
532
  rd->score = MAX_COST;
533
}
534

535
// Copies D, SD, R, H, nz and score from 'src' to 'dst'. The other fields of
// 'dst' are left untouched.
static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
  dst->H  = src->H;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}
543

544
// Accumulates 'src' into 'dst': D, SD, R, H and score are summed, while the
// nz bits are OR-ed together.
static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
  dst->H  += src->H;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
}
552

553
//------------------------------------------------------------------------------
554
// Performs trellis-optimized quantization.
555

556
// Trellis node
557
typedef struct {
558
  int8_t prev;            // best previous node
559
  int8_t sign;            // sign of coeff_i
560
  int16_t level;          // level
561
} Node;
562

563
// Score state
564
typedef struct {
565
  score_t score;          // partial RD score
566
  const uint16_t* costs;  // shortcut to cost tables
567
} ScoreState;
568

569
// If a coefficient was quantized to a value Q (using a neutral bias),
570
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
571
// We don't test negative values though.
572
#define MIN_DELTA 0   // how much lower level to try
573
#define MAX_DELTA 1   // how much higher
574
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
575
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
576
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
577

578
// Computes rd->score from the other fields of 'rd':
//   (R + H) * lambda + RD_DISTO_MULT * (D + SD)
static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}
581

582
// Returns the rate-distortion score used by the trellis:
//   rate * lambda + RD_DISTO_MULT * distortion
static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}
586

587
static int TrellisQuantizeBlock(const VP8Encoder* const enc,
588
                                int16_t in[16], int16_t out[16],
589
                                int ctx0, int coeff_type,
590
                                const VP8Matrix* const mtx,
591
                                int lambda) {
592
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
593
  CostArrayPtr const costs =
594
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
595
  const int first = (coeff_type == 0) ? 1 : 0;
596
  Node nodes[16][NUM_NODES];
597
  ScoreState score_states[2][NUM_NODES];
598
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
599
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
600
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
601
  score_t best_score;
602
  int n, m, p, last;
603

604
  {
605
    score_t cost;
606
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
607
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];
608

609
    // compute the position of the last interesting coefficient
610
    last = first - 1;
611
    for (n = 15; n >= first; --n) {
612
      const int j = kZigzag[n];
613
      const int err = in[j] * in[j];
614
      if (err > thresh) {
615
        last = n;
616
        break;
617
      }
618
    }
619
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
620
    // to last + 1 (inclusive) without losing much.
621
    if (last < 15) ++last;
622

623
    // compute 'skip' score. This is the max score one can do.
624
    cost = VP8BitCost(0, last_proba);
625
    best_score = RDScoreTrellis(lambda, cost, 0);
626

627
    // initialize source node.
628
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
629
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
630
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
631
      ss_cur[m].costs = costs[first][ctx0];
632
    }
633
  }
634

635
  // traverse trellis.
636
  for (n = first; n <= last; ++n) {
637
    const int j = kZigzag[n];
638
    const uint32_t Q  = mtx->q_[j];
639
    const uint32_t iQ = mtx->iq_[j];
640
    const uint32_t B = BIAS(0x00);     // neutral bias
641
    // note: it's important to take sign of the _original_ coeff,
642
    // so we don't have to consider level < 0 afterward.
643
    const int sign = (in[j] < 0);
644
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
645
    int level0 = QUANTDIV(coeff0, iQ, B);
646
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
647
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
648
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
649

650
    {   // Swap current and previous score states
651
      ScoreState* const tmp = ss_cur;
652
      ss_cur = ss_prev;
653
      ss_prev = tmp;
654
    }
655

656
    // test all alternate level values around level0.
657
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
658
      Node* const cur = &NODE(n, m);
659
      int level = level0 + m;
660
      const int ctx = (level > 2) ? 2 : level;
661
      const int band = VP8EncBands[n + 1];
662
      score_t base_score;
663
      score_t best_cur_score = MAX_COST;
664
      int best_prev = 0;   // default, in case
665

666
      ss_cur[m].score = MAX_COST;
667
      ss_cur[m].costs = costs[n + 1][ctx];
668
      if (level < 0 || level > thresh_level) {
669
        // Node is dead.
670
        continue;
671
      }
672

673
      {
674
        // Compute delta_error = how much coding this level will
675
        // subtract to max_error as distortion.
676
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
677
        const int new_error = coeff0 - level * Q;
678
        const int delta_error =
679
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
680
        base_score = RDScoreTrellis(lambda, 0, delta_error);
681
      }
682

683
      // Inspect all possible non-dead predecessors. Retain only the best one.
684
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
685
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
686
        // eliminated since their score can't be better than the current best.
687
        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
688
        // Examine node assuming it's a non-terminal one.
689
        const score_t score =
690
            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
691
        if (score < best_cur_score) {
692
          best_cur_score = score;
693
          best_prev = p;
694
        }
695
      }
696
      // Store best finding in current node.
697
      cur->sign = sign;
698
      cur->level = level;
699
      cur->prev = best_prev;
700
      ss_cur[m].score = best_cur_score;
701

702
      // Now, record best terminal node (and thus best entry in the graph).
703
      if (level != 0) {
704
        const score_t last_pos_cost =
705
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
706
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
707
        const score_t score = best_cur_score + last_pos_score;
708
        if (score < best_score) {
709
          best_score = score;
710
          best_path[0] = n;                     // best eob position
711
          best_path[1] = m;                     // best node index
712
          best_path[2] = best_prev;             // best predecessor
713
        }
714
      }
715
    }
716
  }
717

718
  // Fresh start
719
  memset(in + first, 0, (16 - first) * sizeof(*in));
720
  memset(out + first, 0, (16 - first) * sizeof(*out));
721
  if (best_path[0] == -1) {
722
    return 0;   // skip!
723
  }
724

725
  {
726
    // Unwind the best path.
727
    // Note: best-prev on terminal node is not necessarily equal to the
728
    // best_prev for non-terminal. So we patch best_path[2] in.
729
    int nz = 0;
730
    int best_node = best_path[1];
731
    n = best_path[0];
732
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
733

734
    for (; n >= first; --n) {
735
      const Node* const node = &NODE(n, best_node);
736
      const int j = kZigzag[n];
737
      out[n] = node->sign ? -node->level : node->level;
738
      nz |= node->level;
739
      in[j] = out[n] * mtx->q_[j];
740
      best_node = node->prev;
741
    }
742
    return (nz != 0);
743
  }
744
}
745

746
#undef NODE
747

748
//------------------------------------------------------------------------------
749
// Performs: difference, transform, quantize, back-transform, add
750
// all at once. Output is the reconstructed block in *yuv_out, and the
751
// quantized levels in *levels.
752

753
static int ReconstructIntra16(VP8EncIterator* const it,
754
                              VP8ModeScore* const rd,
755
                              uint8_t* const yuv_out,
756
                              int mode) {
757
  const VP8Encoder* const enc = it->enc_;
758
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
759
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
760
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
761
  int nz = 0;
762
  int n;
763
  int16_t tmp[16][16], dc_tmp[16];
764

765
  for (n = 0; n < 16; n += 2) {
766
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
767
  }
768
  VP8FTransformWHT(tmp[0], dc_tmp);
769
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
770

771
  if (DO_TRELLIS_I16 && it->do_trellis_) {
772
    int x, y;
773
    VP8IteratorNzToBytes(it);
774
    for (y = 0, n = 0; y < 4; ++y) {
775
      for (x = 0; x < 4; ++x, ++n) {
776
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
777
        const int non_zero =
778
            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
779
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
780
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
781
        rd->y_ac_levels[n][0] = 0;
782
        nz |= non_zero << n;
783
      }
784
    }
785
  } else {
786
    for (n = 0; n < 16; n += 2) {
787
      // Zero-out the first coeff, so that: a) nz is correct below, and
788
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
789
      tmp[n][0] = tmp[n + 1][0] = 0;
790
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
791
      assert(rd->y_ac_levels[n + 0][0] == 0);
792
      assert(rd->y_ac_levels[n + 1][0] == 0);
793
    }
794
  }
795

796
  // Transform back
797
  VP8TransformWHT(dc_tmp, tmp[0]);
798
  for (n = 0; n < 16; n += 2) {
799
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
800
  }
801

802
  return nz;
803
}
804

805
// Transforms, quantizes and reconstructs a single 4x4 luma block for intra4
// prediction 'mode'. 'src' points to the source samples of the block, the
// quantized levels are stored in 'levels' and the reconstructed samples in
// 'yuv_out'. Returns the value reported by the quantizer that was used.
static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  int16_t coeffs[16];
  int non_zero;

  VP8FTransform(src, ref, coeffs);

  if (DO_TRELLIS_I4 && it->do_trellis_) {
    // The context is built from the non-zero flags of the top/left neighbours.
    const int x = it->i4_ & 3;
    const int y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    non_zero = TrellisQuantizeBlock(enc, coeffs, levels, ctx, 3, &dqm->y1_,
                                    dqm->lambda_trellis_i4_);
  } else {
    non_zero = VP8EncQuantizeBlock(coeffs, levels, &dqm->y1_);
  }

  VP8ITransform(ref, coeffs, yuv_out, 0);
  return non_zero;
}
828

829
//------------------------------------------------------------------------------
830
// DC-error diffusion
831

832
// Fixed-point parameters of the DC-error diffusion.
// The two weights add up to 15 and are applied with a total right-shift of
// DSHIFT bits, i.e. only 15/16th of the error is actually diffused. This slight
// under-correction avoids 'rainbow' chessboard pattern of blocks at q~=0.
#define DSHIFT 4   // the weights below are expressed in 1/16th
#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
#define C1 7       // weight of the error sent to the 4x4 block below
#define C2 8       // weight of the error sent to the 4x4 block on the right
838

839
// Quantize as usual, but also compute and return the quantization error.
840
// Error is already divided by DSHIFT.
841
static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
842
  int V = *v;
843
  const int sign = (V < 0);
844
  if (sign) V = -V;
845
  if (V > (int)mtx->zthresh_[0]) {
846
    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
847
    const int err = (V - qV);
848
    *v = sign ? -qV : qV;
849
    return (sign ? -err : err) >> DSCALE;
850
  }
851
  *v = 0;
852
  return (sign ? -V : V) >> DSCALE;
853
}
854

855
static void CorrectDCValues(const VP8EncIterator* const it,
856
                            const VP8Matrix* const mtx,
857
                            int16_t tmp[][16], VP8ModeScore* const rd) {
858
  //         | top[0] | top[1]
859
  // --------+--------+---------
860
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
861
  // left[1] | tmp[2]   tmp[3]        err2 err3
862
  //
863
  // Final errors {err1,err2,err3} are preserved and later restored
864
  // as top[]/left[] on the next block.
865
  int ch;
866
  for (ch = 0; ch <= 1; ++ch) {
867
    const int8_t* const top = it->top_derr_[it->x_][ch];
868
    const int8_t* const left = it->left_derr_[ch];
869
    int16_t (* const c)[16] = &tmp[ch * 4];
870
    int err0, err1, err2, err3;
871
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
872
    err0 = QuantizeSingle(&c[0][0], mtx);
873
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
874
    err1 = QuantizeSingle(&c[1][0], mtx);
875
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
876
    err2 = QuantizeSingle(&c[2][0], mtx);
877
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
878
    err3 = QuantizeSingle(&c[3][0], mtx);
879
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
880
    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
881
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
882
    rd->derr[ch][0] = (int8_t)err1;
883
    rd->derr[ch][1] = (int8_t)err2;
884
    rd->derr[ch][2] = (int8_t)err3;
885
  }
886
}
887

888
static void StoreDiffusionErrors(VP8EncIterator* const it,
889
                                 const VP8ModeScore* const rd) {
890
  int ch;
891
  for (ch = 0; ch <= 1; ++ch) {
892
    int8_t* const top = it->top_derr_[it->x_][ch];
893
    int8_t* const left = it->left_derr_[ch];
894
    left[0] = rd->derr[ch][0];            // restore err1
895
    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
896
    top[0]  = rd->derr[ch][1];            //     ... err2
897
    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
898
  }
899
}
900

901
// The diffusion parameters are private to the functions above.
#undef DSCALE
#undef DSHIFT
#undef C2
#undef C1
905

906
//------------------------------------------------------------------------------
907

908
// Transforms, quantizes and reconstructs the eight chroma blocks for the
// prediction 'mode'. The quantized levels are stored in rd->uv_levels[] and
// the reconstructed samples in 'yuv_out'. When it->top_derr_ is set, the first
// coefficient of each block goes through error diffusion beforehand.
// Returns the non-zero map of the blocks, shifted left by 16 bits.
static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  int16_t coeffs[8][16];
  int nz_map = 0;
  int blk;

  // Forward transform of (source - prediction), stepping two blocks at a time.
  for (blk = 0; blk < 8; blk += 2) {
    VP8FTransform2(src + VP8ScanUV[blk], ref + VP8ScanUV[blk], coeffs[blk]);
  }
  if (it->top_derr_ != NULL) {
    CorrectDCValues(it, &dqm->uv_, coeffs, rd);
  }

  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    blk = 0;
    for (ch = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++blk) {
          // The chroma contexts are stored from index 4 onward.
          const int top_idx = 4 + ch + x;
          const int left_idx = 4 + ch + y;
          const int ctx = it->top_nz_[top_idx] + it->left_nz_[left_idx];
          const int non_zero =
              TrellisQuantizeBlock(enc, coeffs[blk], rd->uv_levels[blk], ctx,
                                   2, &dqm->uv_, dqm->lambda_trellis_uv_);
          it->left_nz_[left_idx] = non_zero;
          it->top_nz_[top_idx] = non_zero;
          nz_map |= non_zero << blk;
        }
      }
    }
  } else {
    for (blk = 0; blk < 8; blk += 2) {
      nz_map |=
          VP8EncQuantize2Blocks(coeffs[blk], rd->uv_levels[blk], &dqm->uv_)
              << blk;
    }
  }

  // Transform back, on top of the prediction.
  for (blk = 0; blk < 8; blk += 2) {
    VP8ITransform(ref + VP8ScanUV[blk], coeffs[blk],
                  yuv_out + VP8ScanUV[blk], 1);
  }
  return (nz_map << 16);
}
948

949
//------------------------------------------------------------------------------
950
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.
952

953
// Raises dqm->max_edge_ to the largest absolute value found among entries 1, 2
// and 4 of 'DCs', if that value exceeds the current one.
static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine what is the
  // average delta between each sub-4x4 block.
  static const int kIndices[3] = { 1, 2, 4 };
  int largest = 0;   // magnitudes are never negative
  int i;
  for (i = 0; i < 3; ++i) {
    const int magnitude = abs(DCs[kIndices[i]]);
    if (magnitude > largest) largest = magnitude;
  }
  if (largest > dqm->max_edge_) {
    dqm->max_edge_ = largest;
  }
}
963

964
// Exchanges the two VP8ModeScore pointers '*a' and '*b'.
static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const first = *a;
  VP8ModeScore* const second = *b;
  *a = second;
  *b = first;
}
969

970
// Exchanges the two byte pointers '*a' and '*b'.
static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const first = *a;
  uint8_t* const second = *b;
  *a = second;
  *b = first;
}
975

976
// Exchanges the iterator's two output buffers: yuv_out_ <-> yuv_out2_.
static void SwapOut(VP8EncIterator* const it) {
  uint8_t* const previous_out = it->yuv_out_;
  it->yuv_out_ = it->yuv_out2_;
  it->yuv_out2_ = previous_out;
}
979

980
// Counts the non-zero AC levels over 'num_blocks' consecutive blocks of 16
// levels each. Returns 0 as soon as the running count exceeds 'thresh', and 1
// if all blocks were scanned without that happening.
static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
  score_t num_non_zero = 0;
  int blk;
  // TODO(skal): refine positional scoring?
  for (blk = 0; blk < num_blocks; ++blk) {
    const int16_t* const coeffs = levels + 16 * blk;
    int i;
    // Start at 1: the DC is omitted, we're only interested in AC.
    for (i = 1; i < 16; ++i) {
      if (coeffs[i] != 0) ++num_non_zero;
      if (num_non_zero > thresh) return 0;
    }
  }
  return 1;
}
992

993
// Tries every intra16 prediction mode and keeps the one with the lowest score.
// On return, '*rd' holds the result for the selected mode (with its score
// recomputed using lambda_mode_), the iterator's intra16 mode is set
// accordingly and it->yuv_out_ points to the matching reconstruction.
static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  VP8ModeScore spare;
  VP8ModeScore* trial = &spare;   // where the mode being examined is evaluated
  VP8ModeScore* winner = rd;      // best result so far
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    // yuv_out2_ serves as scratch buffer. It must be re-read at each pass
    // since SwapOut() below exchanges it with yuv_out_.
    uint8_t* const scratch = it->yuv_out2_ + Y_OFF_ENC;
    trial->mode_i16 = mode;

    // Reconstruct
    trial->nz = ReconstructIntra16(it, trial, scratch, mode);

    // Measure RD-score
    trial->D = VP8SSE16x16(src, scratch);
    if (tlambda) {
      trial->SD = MULT_8B(tlambda, VP8TDisto16x16(src, scratch, kWeightY));
    } else {
      trial->SD = 0;
    }
    trial->H = VP8FixedCostsI16[mode];
    trial->R = VP8GetCostLuma16(it, trial);
    if (mode > 0 &&
        IsFlat(trial->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
      // penalty to avoid flat area to be mispredicted by complex mode
      trial->R += FLATNESS_PENALTY * kNumBlocks;
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, trial);
    if (mode == 0 || trial->score < winner->score) {
      // The trial becomes the new reference, and its samples the new output.
      SwapModeScore(&trial, &winner);
      SwapOut(it);
    }
  }

  // The winner may have ended up in the local storage: copy it over if so.
  if (winner != rd) {
    memcpy(rd, winner, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}
1044

1045
//------------------------------------------------------------------------------
1046

1047
// return the cost array corresponding to the surrounding prediction modes.
1048
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
1049
                                     const uint8_t modes[16]) {
1050
  const int preds_w = it->enc_->preds_w_;
1051
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
1052
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
1053
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
1054
  return VP8FixedCostsI4[top][left];
1055
}
1056

1057
// Evaluates intra4x4 coding of the macroblock, one 4x4 block at a time, and
// compares the accumulated score against the one already stored in '*rd'.
// Returns 1 if intra4x4 is selected: '*rd' then receives the new score, modes
// and levels, and the output buffers are swapped. Returns 0 otherwise, which
// happens when intra4 is disabled (max_i4_header_bits_ == 0), when the
// accumulated score reaches rd->score, or when the header bits exceed the
// allowed budget. Note that rd->modes_i4[] and the iterator's non-zero
// contexts are updated along the way, even when 0 is eventually returned.
static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  VP8ModeScore rd_total;          // accumulates the per-block winners
  int total_header_bits = 0;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_total);
  rd_total.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_total);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    // Two buffers are used in turn: one holds the best reconstruction found
    // so far, the other one is the scratch area for the next attempt.
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* scratch = it->yuv_p_ + I4TMP;
    VP8ModeScore rd_block;        // best result for this 4x4 block
    int best_mode = -1;
    int mode;

    InitScore(&rd_block);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_trial;
      int16_t trial_levels[16];

      // Reconstruct
      rd_trial.nz =
          ReconstructIntra4(it, trial_levels, src, scratch, mode) << it->i4_;

      // Compute RD-score, for now without the cost of the coefficients.
      rd_trial.D = VP8SSE4x4(src, scratch);
      if (tlambda) {
        rd_trial.SD = MULT_8B(tlambda, VP8TDisto4x4(src, scratch, kWeightY));
      } else {
        rd_trial.SD = 0;
      }
      rd_trial.H = mode_costs[mode];

      // The rate starts with the flatness penalty, if any.
      rd_trial.R =
          (mode > 0 && IsFlat(trial_levels, kNumBlocks, FLATNESS_LIMIT_I4))
              ? FLATNESS_PENALTY * kNumBlocks
              : 0;

      // Early-out check: only go on if this partial score can still beat the
      // current best one (or if there is no best one yet).
      SetRDScore(lambda, &rd_trial);
      if (best_mode < 0 || rd_trial.score < rd_block.score) {
        // finish computing score
        rd_trial.R += VP8GetCostLuma4(it, trial_levels);
        SetRDScore(lambda, &rd_trial);

        if (best_mode < 0 || rd_trial.score < rd_block.score) {
          CopyScore(&rd_block, &rd_trial);
          best_mode = mode;
          // The scratch buffer now contains the best samples: exchange roles.
          SwapPtr(&scratch, &best_block);
          memcpy(rd_total.y_ac_levels[it->i4_], trial_levels,
                 sizeof(rd_total.y_ac_levels[it->i4_]));
        }
      }
    }

    SetRDScore(dqm->lambda_mode_, &rd_block);
    AddScore(&rd_total, &rd_block);
    if (rd_total.score >= rd->score) {
      return 0;   // can't beat the score stored in *rd
    }
    total_header_bits += (int)rd_block.H;   // <- equal to mode_costs[best_mode]
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;   // too many bits spent on the modes
    }

    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    // Record the non-zero flag, as context for the following blocks.
    it->left_nz_[it->i4_ >> 2] = (rd_block.nz ? 1 : 0);
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2];
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_total);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_total.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
1149

1150
//------------------------------------------------------------------------------
1151

1152
// Tries every chroma prediction mode and keeps the one with the lowest score.
// The selected mode, its levels and (when error diffusion is active) its
// diffusion errors are stored in '*rd', and its score is added to the score
// of '*rd'. The matching reconstruction ends up in it->yuv_out_.
static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* const final_dst = it->yuv_out_ + U_OFF_ENC;
  uint8_t* best_dst = final_dst;                    // holds the best samples
  uint8_t* scratch = it->yuv_out2_ + U_OFF_ENC;     // scratch buffer
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_trial;

    // Reconstruct
    rd_trial.nz = ReconstructUV(it, &rd_trial, scratch, mode);

    // Compute RD-score
    rd_trial.D = VP8SSE16x8(src, scratch);
    rd_trial.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_trial.H = VP8FixedCostsUV[mode];
    rd_trial.R = VP8GetCostUV(it, &rd_trial);
    if (mode > 0 &&
        IsFlat(rd_trial.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_trial.R += FLATNESS_PENALTY * kNumBlocks;
    }
    SetRDScore(lambda, &rd_trial);

    if (mode == 0 || rd_trial.score < rd_best.score) {
      // New best mode: keep its score, levels and diffusion errors.
      CopyScore(&rd_best, &rd_trial);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_trial.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_trial.derr, sizeof(rd_trial.derr));
      }
      // The scratch buffer now contains the best samples: exchange roles.
      SwapPtr(&best_dst, &scratch);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);

  // Copy the 16x8 block if the best samples are not in the output buffer.
  if (best_dst != final_dst) {
    VP8Copy16x8(best_dst, final_dst);
  }
  // Store the diffusion errors for the next block.
  if (it->top_derr_ != NULL) {
    StoreDiffusionErrors(it, rd);
  }
}
1200

1201
//------------------------------------------------------------------------------
1202
// Final reconstruction and quantization.
1203

1204
// Quantizes and reconstructs the macroblock into it->yuv_out_ using the
// prediction modes already decided (read from it->preds_ for luma and from
// it->mb_->uv_mode_ for chroma). The levels and the combined non-zero map are
// stored in '*rd'.
static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i4 = (it->mb_->type_ != 1);
  int nz_map = 0;

  if (is_i4) {
    // Intra4: process the sub-blocks one at a time, each with its own mode.
    VP8IteratorStartI4(it);
    do {
      const int x = it->i4_ & 3;
      const int y = it->i4_ >> 2;
      const int mode = it->preds_[x + y * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz_map |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                  src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  } else {
    // Intra16: a single mode for the whole luma block.
    nz_map =
        ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  nz_map |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz_map;
}
1227

1228
// Refine intra16/intra4 sub-modes based on distortion only (not rate).
1229
static void RefineUsingDistortion(VP8EncIterator* const it,
1230
                                  int try_both_modes, int refine_uv_mode,
1231
                                  VP8ModeScore* const rd) {
1232
  score_t best_score = MAX_COST;
1233
  int nz = 0;
1234
  int mode;
1235
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);
1236

1237
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1238
  // Some empiric constants, of approximate order of magnitude.
1239
  const int lambda_d_i16 = 106;
1240
  const int lambda_d_i4 = 11;
1241
  const int lambda_d_uv = 120;
1242
  score_t score_i4 = dqm->i4_penalty_;
1243
  score_t i4_bit_sum = 0;
1244
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
1245
                                           : MAX_COST;  // no early-out allowed
1246

1247
  if (is_i16) {   // First, evaluate Intra16 distortion
1248
    int best_mode = -1;
1249
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
1250
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1251
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1252
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
1253
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
1254
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
1255
        continue;
1256
      }
1257
      if (score < best_score) {
1258
        best_mode = mode;
1259
        best_score = score;
1260
      }
1261
    }
1262
    VP8SetIntra16Mode(it, best_mode);
1263
    // we'll reconstruct later, if i16 mode actually gets selected
1264
  }
1265

1266
  // Next, evaluate Intra4
1267
  if (try_both_modes || !is_i16) {
1268
    // We don't evaluate the rate here, but just account for it through a
1269
    // constant penalty (i4 mode usually needs more bits compared to i16).
1270
    is_i16 = 0;
1271
    VP8IteratorStartI4(it);
1272
    do {
1273
      int best_i4_mode = -1;
1274
      score_t best_i4_score = MAX_COST;
1275
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1276
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1277

1278
      VP8MakeIntra4Preds(it);
1279
      for (mode = 0; mode < NUM_BMODES; ++mode) {
1280
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1281
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
1282
                            + mode_costs[mode] * lambda_d_i4;
1283
        if (score < best_i4_score) {
1284
          best_i4_mode = mode;
1285
          best_i4_score = score;
1286
        }
1287
      }
1288
      i4_bit_sum += mode_costs[best_i4_mode];
1289
      rd->modes_i4[it->i4_] = best_i4_mode;
1290
      score_i4 += best_i4_score;
1291
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
1292
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
1293
        is_i16 = 1;
1294
        break;
1295
      } else {  // reconstruct partial block inside yuv_out2_ buffer
1296
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
1297
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1298
                                src, tmp_dst, best_i4_mode) << it->i4_;
1299
      }
1300
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
1301
  }
1302

1303
  // Final reconstruction, depending on which mode is selected.
1304
  if (!is_i16) {
1305
    VP8SetIntra4Mode(it, rd->modes_i4);
1306
    SwapOut(it);
1307
    best_score = score_i4;
1308
  } else {
1309
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
1310
  }
1311

1312
  // ... and UV!
1313
  if (refine_uv_mode) {
1314
    int best_mode = -1;
1315
    score_t best_uv_score = MAX_COST;
1316
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1317
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1318
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
1319
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
1320
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
1321
      if (score < best_uv_score) {
1322
        best_mode = mode;
1323
        best_uv_score = score;
1324
      }
1325
    }
1326
    VP8SetIntraUVMode(it, best_mode);
1327
  }
1328
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1329

1330
  rd->nz = nz;
1331
  rd->score = best_score;
1332
}
1333

1334
//------------------------------------------------------------------------------
1335
// Entry point
1336

1337
int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1338
                VP8RDLevel rd_opt) {
1339
  int is_skipped;
1340
  const int method = it->enc_->method_;
1341

1342
  InitScore(rd);
1343

1344
  // We can perform predictions for Luma16x16 and Chroma8x8 already.
1345
  // Luma4x4 predictions needs to be done as-we-go.
1346
  VP8MakeLuma16Preds(it);
1347
  VP8MakeChroma8Preds(it);
1348

1349
  if (rd_opt > RD_OPT_NONE) {
1350
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1351
    PickBestIntra16(it, rd);
1352
    if (method >= 2) {
1353
      PickBestIntra4(it, rd);
1354
    }
1355
    PickBestUV(it, rd);
1356
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
1357
      it->do_trellis_ = 1;
1358
      SimpleQuantize(it, rd);
1359
    }
1360
  } else {
1361
    // At this point we have heuristically decided intra16 / intra4.
1362
    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
1363
    // For method <= 1, we don't re-examine the decision but just go ahead with
1364
    // quantization/reconstruction.
1365
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
1366
  }
1367
  is_skipped = (rd->nz == 0);
1368
  VP8SetSkip(it, is_skipped);
1369
  return is_skipped;
1370
}
1371

1372
Product

Resources

Company