/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009,2025           *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************

  function: mode selection code

 ********************************************************************/
#include <limits.h>
#include <string.h>
#include "encint.h"
#include "modedec.h"
#if defined(OC_COLLECT_METRICS)
# include "collect.c"
#endif
23
24
25
26
typedef struct oc_rd_metric oc_rd_metric;
27
typedef struct oc_mode_choice oc_mode_choice;
28
29
30
31
/*There are 8 possible schemes used to encode macro block modes.
32
Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
33
The same set of Huffman codes is used for each of these 7 schemes, but the
34
mode assigned to each codeword varies.
35
Scheme 0 writes a custom mapping from codeword to MB mode to the bitstream,
36
while schemes 1-6 have a fixed mapping.
37
Scheme 7 just encodes each mode directly in 3 bits.*/
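/*For reference: the shared Huffman alphabet is maximally skewed, so the mode
a scheme ranks at position k costs nominally k+1 bits for k<7 and 7 bits for
k==7 (the lengths looked up in OC_MODE_BITS), versus a flat 3 bits per mode
for scheme 7.
A skewed scheme therefore only pays off when its ranking closely matches the
actual distribution of selected modes.*/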
38
39
/*The mode orderings for the various mode coding schemes.
40
Scheme 0 uses a custom alphabet, which is not stored in this table.
41
This is the inverse of the equivalent table OC_MODE_ALPHABETS in the
42
decoder.*/
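/*Each row is indexed by mode and stores that mode's rank (codeword index)
under the corresponding scheme, while the header comments list the modes in
rank order.
E.g., in the first row ("L P M N I G GM 4") the last-MV mode gets rank 0 and
the shortest codeword, which is recorded by storing 0 at that mode's index.*/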
43
static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={
44
/*Last MV dominates.*/
45
/*L P M N I G GM 4*/
46
{3,4,2,0,1,5,6,7},
47
/*L P N M I G GM 4*/
48
{2,4,3,0,1,5,6,7},
49
/*L M P N I G GM 4*/
50
{3,4,1,0,2,5,6,7},
51
/*L M N P I G GM 4*/
52
{2,4,1,0,3,5,6,7},
53
/*No MV dominates.*/
54
/*N L P M I G GM 4*/
55
{0,4,3,1,2,5,6,7},
56
/*N G L P M I GM 4*/
57
{0,5,4,2,3,1,6,7},
58
/*Default ordering.*/
59
/*N I M L P G GM 4*/
60
{0,1,2,3,4,5,6,7}
61
};
62
63
64
65
/*Initialize the mode scheme chooser.
66
This need only be called once per encoder.*/
67
void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
68
int si;
69
_chooser->mode_ranks[0]=_chooser->scheme0_ranks;
70
for(si=1;si<8;si++)_chooser->mode_ranks[si]=OC_MODE_RANKS[si-1];
71
}
72
73
/*Reset the mode scheme chooser.
74
This needs to be called once for each frame, including the first.*/
75
static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
76
int si;
77
memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
78
/*Scheme 0 starts with 24 bits to store the mode list in.*/
79
_chooser->scheme_bits[0]=24;
80
memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
81
for(si=0;si<8;si++){
82
/*Scheme 7 should always start first, and scheme 0 should always start
83
last.*/
84
_chooser->scheme_list[si]=7-si;
85
_chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si;
86
}
87
}
88
89
/*Return the cost of coding _mb_mode in the specified scheme.*/
90
static int oc_mode_scheme_chooser_scheme_mb_cost(
91
const oc_mode_scheme_chooser *_chooser,int _scheme,int _mb_mode){
92
int codebook;
93
int ri;
94
codebook=_scheme+1>>3;
95
/*For any scheme except 0, we can just use the bit cost of the mode's rank
96
in that scheme.*/
97
ri=_chooser->mode_ranks[_scheme][_mb_mode];
98
if(_scheme==0){
99
int mc;
100
/*For scheme 0, incrementing the mode count could potentially change the
101
mode's rank.
102
Find the index where the mode would be moved to in the optimal list,
103
and use its bit cost instead of the one for the mode's current
104
position in the list.*/
105
/*We don't actually reorder the list; this is for computing opportunity
106
cost, not an update.*/
107
mc=_chooser->mode_counts[_mb_mode];
108
while(ri>0&&mc>=_chooser->mode_counts[_chooser->scheme0_list[ri-1]])ri--;
109
}
110
return OC_MODE_BITS[codebook][ri];
111
}
112
113
/*This is the real purpose of this data structure: not actually selecting a
114
mode scheme, but estimating the cost of coding a given mode given all the
115
modes selected so far.
116
This is done via opportunity cost: the cost is defined as the number of bits
117
required to encode all the modes selected so far including the current one
118
using the best possible scheme, minus the number of bits required to encode
119
all the modes selected so far not including the current one using the best
120
possible scheme.
121
The computational expense of doing this probably makes it overkill.
122
Just be happy we take a greedy approach instead of trying to solve the
123
global mode-selection problem (which is NP-hard).
124
_mb_mode: The mode to determine the cost of.
125
Return: The number of bits required to code this mode.*/
126
static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
127
int _mb_mode){
128
int scheme0;
129
int scheme1;
130
int best_bits;
131
int mode_bits;
132
int si;
133
int scheme0_bits;
134
int scheme1_bits;
135
scheme0=_chooser->scheme_list[0];
136
scheme1=_chooser->scheme_list[1];
137
scheme0_bits=_chooser->scheme_bits[scheme0];
138
scheme1_bits=_chooser->scheme_bits[scheme1];
139
mode_bits=oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme0,_mb_mode);
140
/*Typical case: If the difference between the best scheme and the next best
141
is greater than 6 bits, then adding just one mode cannot change which
142
scheme we use.*/
143
if(scheme1_bits-scheme0_bits>6)return mode_bits;
144
/*Otherwise, check to see if adding this mode selects a different scheme as
145
the best.*/
146
si=1;
147
best_bits=scheme0_bits+mode_bits;
148
do{
149
int cur_bits;
150
cur_bits=scheme1_bits+
151
oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme1,_mb_mode);
152
if(cur_bits<best_bits)best_bits=cur_bits;
153
if(++si>=8)break;
154
scheme1=_chooser->scheme_list[si];
155
scheme1_bits=_chooser->scheme_bits[scheme1];
156
}
157
while(scheme1_bits-scheme0_bits<=6);
158
return best_bits-scheme0_bits;
159
}
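/*In effect the value returned above is
min over schemes s of (scheme_bits[s]+bits_s(_mb_mode)) - scheme_bits[best],
and only schemes within 6 bits of the current best need to be examined, since
one mode's cost differs between schemes by at most the gap between the longest
and shortest codewords.*/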
160
161
/*Incrementally update the mode counts and per-scheme bit counts and re-order
162
the scheme lists once a mode has been selected.
163
_mb_mode: The mode that was chosen.*/
164
static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
165
int _mb_mode){
166
int ri;
167
int si;
168
_chooser->mode_counts[_mb_mode]++;
169
/*Re-order the scheme0 mode list if necessary.*/
170
for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0;ri--){
171
int pmode;
172
pmode=_chooser->scheme0_list[ri-1];
173
if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mb_mode])break;
174
/*Reorder the mode ranking.*/
175
_chooser->scheme0_ranks[pmode]++;
176
_chooser->scheme0_list[ri]=pmode;
177
}
178
_chooser->scheme0_ranks[_mb_mode]=ri;
179
_chooser->scheme0_list[ri]=_mb_mode;
180
/*Now add the bit cost for the mode to each scheme.*/
181
for(si=0;si<8;si++){
182
_chooser->scheme_bits[si]+=
183
OC_MODE_BITS[si+1>>3][_chooser->mode_ranks[si][_mb_mode]];
184
}
185
/*Finally, re-order the list of schemes.*/
186
for(si=1;si<8;si++){
187
int sj;
188
int scheme0;
189
int bits0;
190
sj=si;
191
scheme0=_chooser->scheme_list[si];
192
bits0=_chooser->scheme_bits[scheme0];
193
do{
194
int scheme1;
195
scheme1=_chooser->scheme_list[sj-1];
196
if(bits0>=_chooser->scheme_bits[scheme1])break;
197
_chooser->scheme_list[sj]=scheme1;
198
}
199
while(--sj>0);
200
_chooser->scheme_list[sj]=scheme0;
201
}
202
}
203
204
205
206
/*The number of bits required to encode a super block run.
207
_run_count: The desired run count; must be positive and less than 4130.*/
208
static int oc_sb_run_bits(int _run_count){
209
int i;
210
for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
211
return OC_SB_RUN_CODE_NBITS[i];
212
}
213
214
/*The number of bits required to encode a block run.
215
_run_count: The desired run count; must be positive and less than 30.*/
216
static int oc_block_run_bits(int _run_count){
217
return OC_BLOCK_RUN_CODE_NBITS[_run_count-1];
218
}
219
220
221
222
static void oc_fr_state_init(oc_fr_state *_fr){
223
_fr->bits=0;
224
_fr->sb_partial_count=0;
225
_fr->sb_full_count=0;
226
_fr->b_coded_count_prev=0;
227
_fr->b_coded_count=0;
228
_fr->b_count=0;
229
_fr->sb_prefer_partial=0;
230
_fr->sb_bits=0;
231
_fr->sb_partial=-1;
232
_fr->sb_full=-1;
233
_fr->b_coded_prev=-1;
234
_fr->b_coded=-1;
235
}
236
237
238
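/*Estimate the change in coded-block-flag bits from finishing the current
super block with the given partial/full flags: the cost of extending (or, if
the flag value changes or the run has hit the 4129 cap, restarting) the
partially-coded SB run, plus, for a non-partial SB, the same for the
fully-coded SB run.*/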
static int oc_fr_state_sb_cost(const oc_fr_state *_fr,
239
int _sb_partial,int _sb_full){
240
int bits;
241
int sb_partial_count;
242
int sb_full_count;
243
bits=0;
244
sb_partial_count=_fr->sb_partial_count;
245
/*Extend the sb_partial run, or start a new one.*/
246
if(_fr->sb_partial==_sb_partial){
247
if(sb_partial_count>=4129){
248
bits++;
249
sb_partial_count=0;
250
}
251
else bits-=oc_sb_run_bits(sb_partial_count);
252
}
253
else sb_partial_count=0;
254
bits+=oc_sb_run_bits(++sb_partial_count);
255
if(!_sb_partial){
256
/*Extend the sb_full run, or start a new one.*/
257
sb_full_count=_fr->sb_full_count;
258
if(_fr->sb_full==_sb_full){
259
if(sb_full_count>=4129){
260
bits++;
261
sb_full_count=0;
262
}
263
else bits-=oc_sb_run_bits(sb_full_count);
264
}
265
else sb_full_count=0;
266
bits+=oc_sb_run_bits(++sb_full_count);
267
}
268
return bits;
269
}
270
271
static void oc_fr_state_advance_sb(oc_fr_state *_fr,
272
int _sb_partial,int _sb_full){
273
int sb_partial_count;
274
int sb_full_count;
275
sb_partial_count=_fr->sb_partial_count;
276
if(_fr->sb_partial!=_sb_partial||sb_partial_count>=4129)sb_partial_count=0;
277
sb_partial_count++;
278
if(!_sb_partial){
279
sb_full_count=_fr->sb_full_count;
280
if(_fr->sb_full!=_sb_full||sb_full_count>=4129)sb_full_count=0;
281
sb_full_count++;
282
_fr->sb_full_count=sb_full_count;
283
_fr->sb_full=_sb_full;
284
/*Roll back the partial block state.*/
285
_fr->b_coded=_fr->b_coded_prev;
286
_fr->b_coded_count=_fr->b_coded_count_prev;
287
}
288
else{
289
/*Commit back the partial block state.*/
290
_fr->b_coded_prev=_fr->b_coded;
291
_fr->b_coded_count_prev=_fr->b_coded_count;
292
}
293
_fr->sb_partial_count=sb_partial_count;
294
_fr->sb_partial=_sb_partial;
295
_fr->b_count=0;
296
_fr->sb_prefer_partial=0;
297
_fr->sb_bits=0;
298
}
299
300
/*Commit the state of the current super block and advance to the next.*/
301
static void oc_fr_state_flush_sb(oc_fr_state *_fr){
302
int sb_partial;
303
int sb_full;
304
int b_coded_count;
305
int b_count;
306
b_count=_fr->b_count;
307
b_coded_count=_fr->b_coded_count;
308
sb_full=_fr->b_coded;
309
sb_partial=b_coded_count<b_count;
310
if(!sb_partial){
311
/*If the super block is fully coded/uncoded...*/
312
if(_fr->sb_prefer_partial){
313
/*So far coding this super block as partial was cheaper anyway.*/
314
if(b_coded_count>15||_fr->b_coded_prev<0){
315
int sb_bits;
316
/*If the block run is too long, this will limit how far it can be
317
extended into the next partial super block.
318
If we need to extend it farther, we don't want to have to roll all
319
the way back here (since there could be many full SBs between now
320
and then), so we disallow this.
321
Similarly, if this is the start of a stripe, we don't know the
length of the outstanding block run from the previous stripe.*/
323
sb_bits=oc_fr_state_sb_cost(_fr,sb_partial,sb_full);
324
_fr->bits+=sb_bits-_fr->sb_bits;
325
_fr->sb_bits=sb_bits;
326
}
327
else sb_partial=1;
328
}
329
}
330
oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
331
}
332
333
static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){
334
ptrdiff_t bits;
335
int sb_bits;
336
int b_coded_count;
337
int b_count;
338
int sb_prefer_partial;
339
sb_bits=_fr->sb_bits;
340
bits=_fr->bits-sb_bits;
341
b_count=_fr->b_count;
342
b_coded_count=_fr->b_coded_count;
343
sb_prefer_partial=_fr->sb_prefer_partial;
344
if(b_coded_count>=b_count){
345
int sb_partial_bits;
346
/*This super block is currently fully coded/uncoded.*/
347
if(b_count<=0){
348
/*This is the first block in this SB.*/
349
b_count=1;
350
/*Check to see whether it's cheaper to code it partially or fully.*/
351
if(_fr->b_coded==_b_coded){
352
sb_partial_bits=-oc_block_run_bits(b_coded_count);
353
sb_partial_bits+=oc_block_run_bits(++b_coded_count);
354
}
355
else{
356
b_coded_count=1;
357
sb_partial_bits=2;
358
}
359
sb_partial_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
360
sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
361
sb_prefer_partial=sb_partial_bits<sb_bits;
362
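/*Branchless select: equivalent to if(sb_prefer_partial)sb_bits=sb_partial_bits;*/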
sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
363
}
364
else if(_fr->b_coded==_b_coded){
365
b_coded_count++;
366
if(++b_count<16){
367
if(sb_prefer_partial){
368
/*Check to see if it's cheaper to code it fully.*/
369
sb_partial_bits=sb_bits;
370
sb_partial_bits+=oc_block_run_bits(b_coded_count);
371
if(b_coded_count>0){
372
sb_partial_bits-=oc_block_run_bits(b_coded_count-1);
373
}
374
sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
375
sb_prefer_partial=sb_partial_bits<sb_bits;
376
sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
377
}
378
/*There's no need to check the converse (whether it's cheaper to code
379
this SB partially if we were coding it fully), since the cost to
380
code a SB partially can only increase as we add more blocks, whereas
381
the cost to code it fully stays constant.*/
382
}
383
else{
384
/*If we get to the end and this SB is still full, then force it to be
385
coded full.
386
Otherwise we might not be able to extend the block run far enough
387
into the next partial SB.*/
388
if(sb_prefer_partial){
389
sb_prefer_partial=0;
390
sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
391
}
392
}
393
}
394
else{
395
/*This SB was full, but now must be made partial.*/
396
if(!sb_prefer_partial){
397
sb_bits=oc_block_run_bits(b_coded_count);
398
if(b_coded_count>b_count){
399
sb_bits-=oc_block_run_bits(b_coded_count-b_count);
400
}
401
sb_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
402
}
403
b_count++;
404
b_coded_count=1;
405
sb_prefer_partial=1;
406
sb_bits+=2;
407
}
408
}
409
else{
410
b_count++;
411
if(_fr->b_coded==_b_coded)sb_bits-=oc_block_run_bits(b_coded_count);
412
else b_coded_count=0;
413
sb_bits+=oc_block_run_bits(++b_coded_count);
414
}
415
_fr->bits=bits+sb_bits;
416
_fr->b_coded_count=b_coded_count;
417
_fr->b_coded=_b_coded;
418
_fr->b_count=b_count;
419
_fr->sb_prefer_partial=sb_prefer_partial;
420
_fr->sb_bits=sb_bits;
421
}
422
423
static void oc_fr_skip_block(oc_fr_state *_fr){
424
oc_fr_state_advance_block(_fr,0);
425
}
426
427
static void oc_fr_code_block(oc_fr_state *_fr){
428
oc_fr_state_advance_block(_fr,1);
429
}
430
431
static int oc_fr_cost1(const oc_fr_state *_fr){
432
oc_fr_state tmp;
433
ptrdiff_t bits;
434
*&tmp=*_fr;
435
oc_fr_skip_block(&tmp);
436
bits=tmp.bits;
437
*&tmp=*_fr;
438
oc_fr_code_block(&tmp);
439
return (int)(tmp.bits-bits);
440
}
441
442
static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
443
oc_fr_state tmp;
444
*&tmp=*_pre;
445
oc_fr_skip_block(&tmp);
446
oc_fr_skip_block(&tmp);
447
oc_fr_skip_block(&tmp);
448
oc_fr_skip_block(&tmp);
449
return (int)(_post->bits-tmp.bits);
450
}
451
452
453
454
static void oc_qii_state_init(oc_qii_state *_qs){
455
_qs->bits=0;
456
_qs->qi01_count=0;
457
_qs->qi01=-1;
458
_qs->qi12_count=0;
459
_qs->qi12=-1;
460
}
461
462
463
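/*Advance the quantizer-index coding state by one block.
This tracks the two flag runs used to code per-block qi indices: qi01
distinguishes index 0 from indices 1 and 2, and, for non-zero indices, qi12
distinguishes index 1 from index 2; each flag is coded with the same run
codes as the super block flags, with runs capped at 4129.*/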
static void oc_qii_state_advance(oc_qii_state *_qd,
464
const oc_qii_state *_qs,int _qii){
465
ptrdiff_t bits;
466
int qi01;
467
int qi01_count;
468
int qi12;
469
int qi12_count;
470
bits=_qs->bits;
471
qi01=_qii+1>>1;
472
qi01_count=_qs->qi01_count;
473
if(qi01==_qs->qi01){
474
if(qi01_count>=4129){
475
bits++;
476
qi01_count=0;
477
}
478
else bits-=oc_sb_run_bits(qi01_count);
479
}
480
else qi01_count=0;
481
qi01_count++;
482
bits+=oc_sb_run_bits(qi01_count);
483
qi12_count=_qs->qi12_count;
484
if(_qii){
485
qi12=_qii>>1;
486
if(qi12==_qs->qi12){
487
if(qi12_count>=4129){
488
bits++;
489
qi12_count=0;
490
}
491
else bits-=oc_sb_run_bits(qi12_count);
492
}
493
else qi12_count=0;
494
qi12_count++;
495
bits+=oc_sb_run_bits(qi12_count);
496
}
497
else qi12=_qs->qi12;
498
_qd->bits=bits;
499
_qd->qi01=qi01;
500
_qd->qi01_count=qi01_count;
501
_qd->qi12=qi12;
502
_qd->qi12_count=qi12_count;
503
}
504
505
506
507
static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
508
ptrdiff_t *coded_fragis;
509
unsigned mcu_nvsbs;
510
ptrdiff_t mcu_nfrags;
511
int flimit;
512
int hdec;
513
int vdec;
514
int pli;
515
int nqis;
516
int qii;
517
int qi0;
518
int qti;
519
/*Initialize the per-plane coded block flag trackers.
520
These are used for bit-estimation purposes only; the real flag bits span
521
all three planes, so we can't compute them in parallel.*/
522
for(pli=0;pli<3;pli++)oc_fr_state_init(_pipe->fr+pli);
523
for(pli=0;pli<3;pli++)oc_qii_state_init(_pipe->qs+pli);
524
/*Set up the per-plane skip SSD storage pointers.*/
525
mcu_nvsbs=_enc->mcu_nvsbs;
526
mcu_nfrags=mcu_nvsbs*_enc->state.fplanes[0].nhsbs*16;
527
hdec=!(_enc->state.info.pixel_fmt&1);
528
vdec=!(_enc->state.info.pixel_fmt&2);
529
_pipe->skip_ssd[0]=_enc->mcu_skip_ssd;
530
_pipe->skip_ssd[1]=_pipe->skip_ssd[0]+mcu_nfrags;
531
_pipe->skip_ssd[2]=_pipe->skip_ssd[1]+(mcu_nfrags>>hdec+vdec);
532
/*Set up per-plane pointers to the coded and uncoded fragment lists.
Unlike the decoder, each plane's coded and uncoded fragment list is kept
534
separate during the analysis stage; we only make the coded list for all
535
three planes contiguous right before the final packet is output
536
(destroying the uncoded lists, which are no longer needed).*/
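/*Within each plane's region, the coded list grows forward from the start of
the region while the uncoded list grows backward from its end, so the two
never collide.*/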
537
coded_fragis=_enc->state.coded_fragis;
538
for(pli=0;pli<3;pli++){
539
_pipe->coded_fragis[pli]=coded_fragis;
540
coded_fragis+=_enc->state.fplanes[pli].nfrags;
541
_pipe->uncoded_fragis[pli]=coded_fragis;
542
}
543
memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
544
memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
545
/*Set up condensed quantizer tables.*/
546
qi0=_enc->state.qis[0];
547
nqis=_enc->state.nqis;
548
for(pli=0;pli<3;pli++){
549
for(qii=0;qii<nqis;qii++){
550
int qi;
551
qi=_enc->state.qis[qii];
552
for(qti=0;qti<2;qti++){
553
/*Set the DC coefficient in the dequantization table.*/
554
_enc->state.dequant_tables[qi][pli][qti][0]=
555
_enc->dequant_dc[qi0][pli][qti];
556
_enc->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
557
/*Copy over the quantization table.*/
558
memcpy(_enc->enquant[pli][qii][qti],_enc->enquant_tables[qi][pli][qti],
559
_enc->opt_data.enquant_table_size);
560
}
561
}
562
}
563
/*Fix up the DC coefficients in the quantization tables.*/
564
oc_enc_enquant_table_fixup(_enc,_enc->enquant,nqis);
565
/*Initialize the tokenization state.*/
566
for(pli=0;pli<3;pli++){
567
_pipe->ndct_tokens1[pli]=0;
568
_pipe->eob_run1[pli]=0;
569
}
570
/*Initialize the bounding value array for the loop filter.*/
571
flimit=_enc->state.loop_filter_limits[_enc->state.qis[0]];
572
_pipe->loop_filter=flimit!=0;
573
if(flimit!=0)oc_loop_filter_init(&_enc->state,_pipe->bounding_values,flimit);
574
/*Clear the temporary DCT scratch space.*/
575
memset(_pipe->dct_data,0,sizeof(_pipe->dct_data));
576
}
577
578
/*Sets the current MCU stripe to super block row _sby.
579
Return: A non-zero value if this was the last MCU.*/
580
static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc,
581
oc_enc_pipeline_state *_pipe,int _sby){
582
const oc_fragment_plane *fplane;
583
unsigned mcu_nvsbs;
584
int sby_end;
585
int notdone;
586
int vdec;
587
int pli;
588
mcu_nvsbs=_enc->mcu_nvsbs;
589
sby_end=_enc->state.fplanes[0].nvsbs;
590
notdone=_sby+mcu_nvsbs<sby_end;
591
if(notdone)sby_end=_sby+mcu_nvsbs;
592
vdec=0;
593
for(pli=0;pli<3;pli++){
594
fplane=_enc->state.fplanes+pli;
595
_pipe->sbi0[pli]=fplane->sboffset+(_sby>>vdec)*fplane->nhsbs;
596
_pipe->fragy0[pli]=_sby<<2-vdec;
597
_pipe->froffset[pli]=fplane->froffset
598
+_pipe->fragy0[pli]*(ptrdiff_t)fplane->nhfrags;
599
if(notdone){
600
_pipe->sbi_end[pli]=fplane->sboffset+(sby_end>>vdec)*fplane->nhsbs;
601
_pipe->fragy_end[pli]=sby_end<<2-vdec;
602
}
603
else{
604
_pipe->sbi_end[pli]=fplane->sboffset+fplane->nsbs;
605
_pipe->fragy_end[pli]=fplane->nvfrags;
606
}
607
vdec=!(_enc->state.info.pixel_fmt&2);
608
}
609
return notdone;
610
}
611
612
static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
613
oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
614
/*Copy over all the uncoded fragments from this plane and advance the uncoded
615
fragment list.*/
616
if(_pipe->nuncoded_fragis[_pli]>0){
617
_pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
618
oc_frag_copy_list(&_enc->state,
619
_enc->state.ref_frame_data[OC_FRAME_SELF],
620
_enc->state.ref_frame_data[OC_FRAME_PREV],
621
_enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
622
_pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs);
623
_pipe->nuncoded_fragis[_pli]=0;
624
}
625
/*Perform DC prediction.*/
626
oc_enc_pred_dc_frag_rows(_enc,_pli,
627
_pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
628
/*Finish DC tokenization.*/
629
oc_enc_tokenize_dc_frag_list(_enc,_pli,
630
_pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli],
631
_pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]);
632
_pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1];
633
_pipe->eob_run1[_pli]=_enc->eob_run[_pli][1];
634
/*And advance the coded fragment list.*/
635
_enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
636
_pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
637
_pipe->ncoded_fragis[_pli]=0;
638
/*Apply the loop filter if necessary.*/
639
if(_pipe->loop_filter){
640
oc_state_loop_filter_frag_rows(&_enc->state,
641
_pipe->bounding_values,OC_FRAME_SELF,_pli,
642
_pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
643
}
644
else _sdelay=_edelay=0;
645
/*To fill borders, we have an additional two pixel delay, since a fragment
646
in the next row could filter its top edge, using two pixels from a
647
fragment in this row.
648
But there's no reason to delay a full fragment between the two.*/
649
oc_state_borders_fill_rows(&_enc->state,
650
_enc->state.ref_frame_idx[OC_FRAME_SELF],_pli,
651
(_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
652
(_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
653
}
654
655
656
657
/*Cost information about the coded blocks in a MB.*/
658
struct oc_rd_metric{
659
int uncoded_ac_ssd;
660
int coded_ac_ssd;
661
int ac_bits;
662
int dc_flag;
663
};
664
665
666
667
static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
668
oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
669
unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
670
oc_fr_state *_fr,oc_token_checkpoint **_stack){
671
ogg_int16_t *data;
672
ogg_int16_t *dct;
673
ogg_int16_t *idct;
674
oc_qii_state qs;
675
const ogg_uint16_t *dequant;
676
ogg_uint16_t dequant_dc;
677
ptrdiff_t frag_offs;
678
int ystride;
679
const unsigned char *src;
680
const unsigned char *ref;
681
unsigned char *dst;
682
int nonzero;
683
unsigned uncoded_ssd;
684
unsigned coded_ssd;
685
oc_token_checkpoint *checkpoint;
686
oc_fragment *frags;
687
int mb_mode;
688
int refi;
689
int mv_offs[2];
690
int nmv_offs;
691
int ac_bits;
692
int borderi;
693
int nqis;
694
int qti;
695
int qii;
696
int dc;
697
nqis=_enc->state.nqis;
698
frags=_enc->state.frags;
699
frag_offs=_enc->state.frag_buf_offs[_fragi];
700
ystride=_enc->state.ref_ystride[_pli];
701
src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
702
borderi=frags[_fragi].borderi;
703
qii=frags[_fragi].qii;
704
data=_enc->pipe.dct_data;
705
dct=data+64;
706
idct=data+128;
707
if(qii&~3){
708
#if !defined(OC_COLLECT_METRICS)
709
if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
710
/*Enable early skip detection.*/
711
frags[_fragi].coded=0;
712
frags[_fragi].refi=OC_FRAME_NONE;
713
oc_fr_skip_block(_fr);
714
return 0;
715
}
716
#endif
717
/*Try and code this block anyway.*/
718
qii&=3;
719
}
720
refi=frags[_fragi].refi;
721
mb_mode=frags[_fragi].mb_mode;
722
ref=_enc->state.ref_frame_data[refi]+frag_offs;
723
dst=_enc->state.ref_frame_data[OC_FRAME_SELF]+frag_offs;
724
/*Motion compensation:*/
725
switch(mb_mode){
726
case OC_MODE_INTRA:{
727
nmv_offs=0;
728
oc_enc_frag_sub_128(_enc,data,src,ystride);
729
}break;
730
case OC_MODE_GOLDEN_NOMV:
731
case OC_MODE_INTER_NOMV:{
732
nmv_offs=1;
733
mv_offs[0]=0;
734
oc_enc_frag_sub(_enc,data,src,ref,ystride);
735
}break;
736
default:{
737
const oc_mv *frag_mvs;
738
frag_mvs=_enc->state.frag_mvs;
739
nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,
740
_pli,frag_mvs[_fragi]);
741
if(nmv_offs>1){
742
oc_enc_frag_copy2(_enc,dst,
743
ref+mv_offs[0],ref+mv_offs[1],ystride);
744
oc_enc_frag_sub(_enc,data,src,dst,ystride);
745
}
746
else oc_enc_frag_sub(_enc,data,src,ref+mv_offs[0],ystride);
747
}break;
748
}
749
#if defined(OC_COLLECT_METRICS)
750
{
751
unsigned sad;
752
unsigned satd;
753
switch(nmv_offs){
754
case 0:{
755
sad=oc_enc_frag_intra_sad(_enc,src,ystride);
756
satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride);
757
}break;
758
case 1:{
759
sad=oc_enc_frag_sad_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
760
satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride);
761
satd+=abs(dc);
762
}break;
763
default:{
764
sad=oc_enc_frag_sad_thresh(_enc,src,dst,ystride,UINT_MAX);
765
satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride);
766
satd+=abs(dc);
767
}break;
768
}
769
_enc->frag_sad[_fragi]=sad;
770
_enc->frag_satd[_fragi]=satd;
771
}
772
#endif
773
/*Transform:*/
774
oc_enc_fdct8x8(_enc,dct,data);
775
/*Quantize:*/
776
qti=mb_mode!=OC_MODE_INTRA;
777
dequant=_enc->dequant[_pli][qii][qti];
778
nonzero=oc_enc_quantize(_enc,data,dct,dequant,_enc->enquant[_pli][qii][qti]);
779
dc=data[0];
780
/*Tokenize.*/
781
checkpoint=*_stack;
782
if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
783
ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct,
784
nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
785
}
786
else{
787
ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct,
788
nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
789
}
790
/*Reconstruct.
791
TODO: nonzero may need to be adjusted after tokenization.*/
792
dequant_dc=dequant[0];
793
if(nonzero==0){
794
ogg_int16_t p;
795
int ci;
796
int qi01;
797
int qi12;
798
/*We round this dequant product (and not any of the others) because there's
799
no iDCT rounding.*/
800
p=(ogg_int16_t)(dc*(ogg_int32_t)dequant_dc+15>>5);
801
/*LOOP VECTORIZES.*/
802
for(ci=0;ci<64;ci++)data[ci]=p;
803
/*We didn't code any AC coefficients, so don't change the quantizer.*/
804
qi01=_pipe->qs[_pli].qi01;
805
qi12=_pipe->qs[_pli].qi12;
806
if(qi01>0)qii=1+qi12;
807
else if(qi01>=0)qii=0;
808
}
809
else{
810
idct[0]=dc*dequant_dc;
811
/*Note: This clears idct[] back to zero for the next block.*/
812
oc_idct8x8(&_enc->state,data,idct,nonzero+1);
813
}
814
frags[_fragi].qii=qii;
815
if(nqis>1){
816
oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
817
ac_bits+=qs.bits-_pipe->qs[_pli].bits;
818
}
819
if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
820
else{
821
oc_enc_frag_recon_inter(_enc,dst,
822
nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
823
}
824
/*If _fr is NULL, then this is an INTRA frame, and we can't skip blocks.*/
825
#if !defined(OC_COLLECT_METRICS)
826
if(_fr!=NULL)
827
#endif
828
{
829
/*In retrospect, should we have skipped this block?*/
830
if(borderi<0){
831
coded_ssd=oc_enc_frag_ssd(_enc,src,dst,ystride);
832
}
833
else{
834
coded_ssd=oc_enc_frag_border_ssd(_enc,src,dst,ystride,
835
_enc->state.borders[borderi].mask);
836
}
837
/*Scale to match DCT domain.*/
838
coded_ssd<<=4;
839
#if defined(OC_COLLECT_METRICS)
840
_enc->frag_ssd[_fragi]=coded_ssd;
841
}
842
if(_fr!=NULL){
843
#endif
844
coded_ssd=OC_RD_SCALE(coded_ssd,_rd_scale);
845
uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]];
846
if(uncoded_ssd<UINT_MAX&&
847
/*Don't allow luma blocks to be skipped in 4MV mode when VP3 compatibility
848
is enabled.*/
849
(!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){
850
int overhead_bits;
851
overhead_bits=oc_fr_cost1(_fr);
852
/*Although the fragment coding overhead determination is accurate, it is
853
greedy, using very coarse-grained local information.
854
Allowing it to mildly discourage coding turns out to be beneficial, but
855
it's not clear that allowing it to encourage coding through negative
856
coding overhead deltas is useful.
857
For that reason, we disallow negative coding overheads.*/
858
if(overhead_bits<0)overhead_bits=0;
859
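/*Skip the block if doing so is no worse in rate-distortion terms: i.e., if
the SSD of simply repeating the reference is at most the coded SSD plus
lambda times the estimated flag and AC token bits.*/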
if(uncoded_ssd<=coded_ssd+(overhead_bits+ac_bits)*_enc->lambda){
860
/*Hm, not worth it; roll back.*/
861
oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
862
*_stack=checkpoint;
863
frags[_fragi].coded=0;
864
frags[_fragi].refi=OC_FRAME_NONE;
865
oc_fr_skip_block(_fr);
866
return 0;
867
}
868
}
869
else _mo->dc_flag=1;
870
_mo->uncoded_ac_ssd+=uncoded_ssd;
871
_mo->coded_ac_ssd+=coded_ssd;
872
_mo->ac_bits+=ac_bits;
873
oc_fr_code_block(_fr);
874
}
875
/*GCC 4.4.4 generates a warning here because it can't tell that
876
the init code in the nqis check above will run anytime this
877
line runs.*/
878
if(nqis>1)*(_pipe->qs+_pli)=*&qs;
879
frags[_fragi].dc=dc;
880
frags[_fragi].coded=1;
881
return 1;
882
}
883
884
static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc,
885
oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead,
886
const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
887
/*Worst case token stack usage for 4 fragments.*/
888
oc_token_checkpoint stack[64*4];
889
oc_token_checkpoint *stackptr;
890
const oc_sb_map *sb_maps;
891
signed char *mb_modes;
892
oc_fragment *frags;
893
ptrdiff_t *coded_fragis;
894
ptrdiff_t ncoded_fragis;
895
ptrdiff_t *uncoded_fragis;
896
ptrdiff_t nuncoded_fragis;
897
oc_rd_metric mo;
898
oc_fr_state fr_checkpoint;
899
oc_qii_state qs_checkpoint;
900
int mb_mode;
901
int refi;
902
int ncoded;
903
ptrdiff_t fragi;
904
int bi;
905
*&fr_checkpoint=*(_pipe->fr+0);
906
*&qs_checkpoint=*(_pipe->qs+0);
907
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
908
mb_modes=_enc->state.mb_modes;
909
frags=_enc->state.frags;
910
coded_fragis=_pipe->coded_fragis[0];
911
ncoded_fragis=_pipe->ncoded_fragis[0];
912
uncoded_fragis=_pipe->uncoded_fragis[0];
913
nuncoded_fragis=_pipe->nuncoded_fragis[0];
914
mb_mode=mb_modes[_mbi];
915
refi=OC_FRAME_FOR_MODE(mb_mode);
916
ncoded=0;
917
stackptr=stack;
918
memset(&mo,0,sizeof(mo));
919
for(bi=0;bi<4;bi++){
920
fragi=sb_maps[_mbi>>2][_mbi&3][bi];
921
frags[fragi].refi=refi;
922
frags[fragi].mb_mode=mb_mode;
923
if(oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
924
_rd_scale[bi],_rd_iscale[bi],&mo,_pipe->fr+0,&stackptr)){
925
coded_fragis[ncoded_fragis++]=fragi;
926
ncoded++;
927
}
928
else *(uncoded_fragis-++nuncoded_fragis)=fragi;
929
}
930
if(ncoded>0&&!mo.dc_flag){
931
int cost;
932
/*Some individual blocks were worth coding.
933
See if that's still true when accounting for mode and MV overhead.*/
934
cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
935
+oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead);
936
if(mo.uncoded_ac_ssd<=cost){
937
/*Taking macroblock overhead into account, it is not worth coding this
938
MB.*/
939
oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
940
*(_pipe->fr+0)=*&fr_checkpoint;
941
*(_pipe->qs+0)=*&qs_checkpoint;
942
for(bi=0;bi<4;bi++){
943
fragi=sb_maps[_mbi>>2][_mbi&3][bi];
944
if(frags[fragi].coded){
945
*(uncoded_fragis-++nuncoded_fragis)=fragi;
946
frags[fragi].coded=0;
947
frags[fragi].refi=OC_FRAME_NONE;
948
}
949
oc_fr_skip_block(_pipe->fr+0);
950
}
951
ncoded_fragis-=ncoded;
952
ncoded=0;
953
}
954
}
955
/*If no luma blocks coded, the mode is forced.*/
956
if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
957
/*Assume that a 1MV with a single coded block is always cheaper than a 4MV
958
with a single coded block.
959
This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
960
skipped blocks, while a 1MV does not.*/
961
else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){
962
mb_modes[_mbi]=OC_MODE_INTER_MV;
963
}
964
_pipe->ncoded_fragis[0]=ncoded_fragis;
965
_pipe->nuncoded_fragis[0]=nuncoded_fragis;
966
return ncoded;
967
}
968
969
static void oc_enc_sb_transform_quantize_inter_chroma(oc_enc_ctx *_enc,
970
oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
971
const ogg_uint16_t *mcu_rd_scale;
972
const ogg_uint16_t *mcu_rd_iscale;
973
const oc_sb_map *sb_maps;
974
oc_sb_flags *sb_flags;
975
oc_fr_state *fr;
976
ptrdiff_t *coded_fragis;
977
ptrdiff_t ncoded_fragis;
978
ptrdiff_t *uncoded_fragis;
979
ptrdiff_t nuncoded_fragis;
980
ptrdiff_t froffset;
981
int sbi;
982
fr=_pipe->fr+_pli;
983
mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
984
mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
985
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
986
sb_flags=_enc->state.sb_flags;
987
coded_fragis=_pipe->coded_fragis[_pli];
988
ncoded_fragis=_pipe->ncoded_fragis[_pli];
989
uncoded_fragis=_pipe->uncoded_fragis[_pli];
990
nuncoded_fragis=_pipe->nuncoded_fragis[_pli];
991
froffset=_pipe->froffset[_pli];
992
for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
993
/*Worst case token stack usage for 1 fragment.*/
994
oc_token_checkpoint stack[64];
995
oc_rd_metric mo;
996
int quadi;
997
int bi;
998
memset(&mo,0,sizeof(mo));
999
for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
1000
ptrdiff_t fragi;
1001
fragi=sb_maps[sbi][quadi][bi];
1002
if(fragi>=0){
1003
oc_token_checkpoint *stackptr;
1004
unsigned rd_scale;
1005
unsigned rd_iscale;
1006
rd_scale=mcu_rd_scale[fragi-froffset];
1007
rd_iscale=mcu_rd_iscale[fragi-froffset];
1008
stackptr=stack;
1009
if(oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
1010
rd_scale,rd_iscale,&mo,fr,&stackptr)){
1011
coded_fragis[ncoded_fragis++]=fragi;
1012
}
1013
else *(uncoded_fragis-++nuncoded_fragis)=fragi;
1014
}
1015
}
1016
oc_fr_state_flush_sb(fr);
1017
sb_flags[sbi].coded_fully=fr->sb_full;
1018
sb_flags[sbi].coded_partially=fr->sb_partial;
1019
}
1020
_pipe->ncoded_fragis[_pli]=ncoded_fragis;
1021
_pipe->nuncoded_fragis[_pli]=nuncoded_fragis;
1022
}
1023
1024
/*Mode decision is done by exhaustively examining all potential choices.
1025
Obviously, doing the motion compensation, fDCT, tokenization, and then
1026
counting the bits each token uses is computationally expensive.
1027
Theora's EOB runs can also split the cost of these tokens across multiple
1028
fragments, and naturally we don't know what the optimal choice of Huffman
1029
codes will be until we know all the tokens we're going to encode in all the
1030
fragments.
1031
So we use a simple approach to estimating the bit cost and distortion of each
1032
mode based upon the SATD value of the residual before coding.
1033
The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
1034
the process (modified somewhat from that of the paper) is very simple.
1035
We build a non-linear regression of the mappings from
1036
(pre-transform+quantization) SATD to (post-transform+quantization) bits and
1037
SSD for each qi.
1038
A separate set of mappings is kept for each quantization type and color
1039
plane.
1040
The mappings are constructed by partitioning the SATD values into a small
1041
number of bins (currently 24) and using a linear regression in each bin
1042
(as opposed to the 0th-order regression used by Kim).
1043
The bit counts and SSD measurements are obtained by examining actual encoded
1044
frames, with appropriate lambda values and optimal Huffman codes selected.
1045
EOB bits are assigned to the fragment that started the EOB run (as opposed to
1046
dividing them among all the blocks in the run; the latter approach seems
1047
more theoretically correct, but Monty's testing showed a small improvement
1048
with the former, though that may have been merely statistical noise).
1049
1050
@ARTICLE{Kim03,
1051
author="Hyun Mun Kim",
1052
title="Adaptive Rate Control Using Nonlinear Regression",
1053
journal="IEEE Transactions on Circuits and Systems for Video Technology",
1054
volume=13,
1055
number=5,
1056
pages="432--439",
1057
month=May,
1058
year=2003
1059
}*/
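/*Concretely, given a block's SATD, oc_dct_cost2() below finds the bin
containing it and interpolates linearly between that bin's (rate,rmse) entry
and the next one's:
rate ~ y0+dy*dx>>shift, rmse ~ z0+dz*dx>>shift,
where dx is the SATD offset within the bin and shift is the bin width in
bits; the SSD estimate is then the square of the interpolated RMSE.*/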
1060
1061
/*Computes (_ssd+_lambda*_rate)/(1<<OC_BIT_SCALE) with rounding, avoiding
1062
overflow for large lambda values.*/
1063
#define OC_MODE_RD_COST(_ssd,_rate,_lambda) \
1064
((_ssd)>>OC_BIT_SCALE)+((_rate)>>OC_BIT_SCALE)*(_lambda) \
1065
+(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \
1066
+((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE)
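/*The macro splits _ssd and _rate into their high parts (above OC_BIT_SCALE
bits) and low parts, applies _lambda to each part separately, and only shifts
the low-order contribution down with rounding, so the full product
_rate*_lambda is never formed at full precision.*/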
1067
1068
static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){
1069
#if !defined(OC_COLLECT_METRICS)
1070
const
1071
#endif
1072
oc_mode_rd (*oc_mode_rd_table)[3][2][OC_COMP_BINS]=
1073
_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_MODE_RD_SATD:OC_MODE_RD_SAD;
1074
int qii;
1075
#if defined(OC_COLLECT_METRICS)
1076
oc_enc_mode_metrics_load(_enc);
1077
#endif
1078
for(qii=0;qii<_enc->state.nqis;qii++){
1079
int qi;
1080
int pli;
1081
qi=_enc->state.qis[qii];
1082
for(pli=0;pli<3;pli++){
1083
int qti;
1084
for(qti=0;qti<2;qti++){
1085
int log_plq;
1086
int modeline;
1087
int bin;
1088
int dx;
1089
int dq;
1090
log_plq=_enc->log_plq[qi][pli][qti];
1091
/*Find the pair of rows in the mode table that bracket this quantizer.
1092
If it falls outside the range the table covers, then we just use a
1093
pair on the edge for linear extrapolation.*/
1094
for(modeline=0;modeline<OC_LOGQ_BINS-1&&
1095
OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++);
1096
/*Interpolate a row for this quantizer.*/
1097
dx=OC_MODE_LOGQ[modeline][pli][qti]-log_plq;
1098
dq=OC_MODE_LOGQ[modeline][pli][qti]-OC_MODE_LOGQ[modeline+1][pli][qti];
1099
if(dq==0)dq=1;
1100
for(bin=0;bin<OC_COMP_BINS;bin++){
1101
int y0;
1102
int z0;
1103
int dy;
1104
int dz;
1105
y0=oc_mode_rd_table[modeline][pli][qti][bin].rate;
1106
z0=oc_mode_rd_table[modeline][pli][qti][bin].rmse;
1107
dy=oc_mode_rd_table[modeline+1][pli][qti][bin].rate-y0;
1108
dz=oc_mode_rd_table[modeline+1][pli][qti][bin].rmse-z0;
1109
_enc->mode_rd[qii][pli][qti][bin].rate=
1110
(ogg_int16_t)OC_CLAMPI(-32768,y0+(dy*dx+(dq>>1))/dq,32767);
1111
_enc->mode_rd[qii][pli][qti][bin].rmse=
1112
(ogg_int16_t)OC_CLAMPI(-32768,z0+(dz*dx+(dq>>1))/dq,32767);
1113
}
1114
}
1115
}
1116
}
1117
}
1118
1119
/*Estimate the R-D cost of the DCT coefficients given the SATD of a block after
1120
prediction.*/
1121
static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd,
1122
int _qii,int _pli,int _qti,int _satd){
1123
unsigned rmse;
1124
int shift;
1125
int bin;
1126
int dx;
1127
int y0;
1128
int z0;
1129
int dy;
1130
int dz;
1131
/*SATD metrics for chroma planes vary much less than luma, so we scale them
1132
by 4 to distribute them into the mode decision bins more evenly.*/
1133
_satd<<=_pli+1&2;
1134
shift=_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_SATD_SHIFT:OC_SAD_SHIFT;
1135
bin=OC_MINI(_satd>>shift,OC_COMP_BINS-2);
1136
dx=_satd-(bin<<shift);
1137
y0=_enc->mode_rd[_qii][_pli][_qti][bin].rate;
1138
z0=_enc->mode_rd[_qii][_pli][_qti][bin].rmse;
1139
dy=_enc->mode_rd[_qii][_pli][_qti][bin+1].rate-y0;
1140
dz=_enc->mode_rd[_qii][_pli][_qti][bin+1].rmse-z0;
1141
rmse=OC_MAXI(z0+(dz*dx>>shift),0);
1142
*_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
1143
return OC_MAXI(y0+(dy*dx>>shift),0);
1144
}
1145
1146
/*activity_avg must be positive, or flat regions could get a zero weight, which
1147
confounds analysis.
1148
We set the minimum to this value so that it also avoids the need for divide
1149
by zero checks in oc_mb_masking().*/
1150
# define OC_ACTIVITY_AVG_MIN (1<<OC_RD_SCALE_BITS)
1151
1152
static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi,
1153
unsigned _activity[4]){
1154
const unsigned char *src;
1155
const ptrdiff_t *frag_buf_offs;
1156
const ptrdiff_t *sb_map;
1157
unsigned luma;
1158
int ystride;
1159
ptrdiff_t frag_offs;
1160
ptrdiff_t fragi;
1161
int bi;
1162
frag_buf_offs=_enc->state.frag_buf_offs;
1163
sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1164
src=_enc->state.ref_frame_data[OC_FRAME_IO];
1165
ystride=_enc->state.ref_ystride[0];
1166
luma=0;
1167
for(bi=0;bi<4;bi++){
1168
const unsigned char *s;
1169
unsigned x;
1170
unsigned x2;
1171
unsigned act;
1172
int i;
1173
int j;
1174
fragi=sb_map[bi];
1175
frag_offs=frag_buf_offs[fragi];
1176
/*TODO: This could be replaced with SATD^2, since we already have to
1177
compute SATD.*/
1178
x=x2=0;
1179
s=src+frag_offs;
1180
for(i=0;i<8;i++){
1181
for(j=0;j<8;j++){
1182
unsigned c;
1183
c=s[j];
1184
x+=c;
1185
x2+=c*c;
1186
}
1187
s+=ystride;
1188
}
1189
luma+=x;
1190
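/*Since x is the sum of the 64 pixel values and x2 the sum of their squares,
(x2<<6)-x*x equals 64*sum((c-mean)^2), i.e., 64^2 times the block variance.*/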
act=(x2<<6)-x*x;
1191
if(act<8<<12){
1192
/*The region is flat.*/
1193
act=OC_MINI(act,5<<12);
1194
}
1195
else{
1196
unsigned e1;
1197
unsigned e2;
1198
unsigned e3;
1199
unsigned e4;
1200
/*Test for an edge.
1201
TODO: There are probably much simpler ways to do this (e.g., it could
1202
probably be combined with the SATD calculation).
1203
Alternatively, we could split the block around the mean and compute the
1204
reduction in variance in each half.
1205
For a Gaussian source the reduction should be
1206
(1-2/pi) ~= 0.36338022763241865692446494650994.
1207
Significantly more reduction is a good indication of a bi-level image.
1208
This has the advantage of identifying, in addition to straight edges,
1209
small text regions, which would otherwise be classified as "texture".*/
1210
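/*e1..e4 accumulate the absolute responses of four Sobel-style 3x3 gradient
kernels (horizontal, vertical, and the two diagonals) over the block.*/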
e1=e2=e3=e4=0;
1211
s=src+frag_offs-1;
1212
for(i=0;i<8;i++){
1213
for(j=0;j<8;j++){
1214
e1+=abs((s[j+2]-s[j]<<1)+(s-ystride)[j+2]-(s-ystride)[j]
1215
+(s+ystride)[j+2]-(s+ystride)[j]);
1216
e2+=abs(((s+ystride)[j+1]-(s-ystride)[j+1]<<1)
1217
+(s+ystride)[j]-(s-ystride)[j]+(s+ystride)[j+2]-(s-ystride)[j+2]);
1218
e3+=abs(((s+ystride)[j+2]-(s-ystride)[j]<<1)
1219
+(s+ystride)[j+1]-s[j]+s[j+2]-(s-ystride)[j+1]);
1220
e4+=abs(((s+ystride)[j]-(s-ystride)[j+2]<<1)
1221
+(s+ystride)[j+1]-s[j+2]+s[j]-(s-ystride)[j+1]);
1222
}
1223
s+=ystride;
1224
}
1225
/*If the largest component of the edge energy is at least 40% of the
1226
total, then classify the block as an edge block.*/
1227
if(5*OC_MAXI(OC_MAXI(e1,e2),OC_MAXI(e3,e4))>2*(e1+e2+e3+e4)){
1228
/*act=act_th*(act/act_th)**0.7
1229
=exp(log(act_th)+0.7*(log(act)-log(act_th))).
1230
Here act_th=5.0 and 0x394A=oc_blog32_q10(5<<12).*/
1231
act=oc_bexp32_q10(0x394A+(7*(oc_blog32_q10(act)-0x394A+5)/10));
1232
}
1233
}
1234
_activity[bi]=act;
1235
}
1236
return luma;
1237
}
1238
1239
static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
1240
unsigned _activity[4],const unsigned _intra_satd[12]){
1241
int bi;
1242
for(bi=0;bi<4;bi++){
1243
unsigned act;
1244
act=(11*_intra_satd[bi]>>8)*_intra_satd[bi];
1245
if(act<8<<12){
1246
/*The region is flat.*/
1247
act=OC_MINI(act,5<<12);
1248
}
1249
_activity[bi]=act;
1250
}
1251
}
1252
1253
/*Compute the masking scales for the blocks in a macro block.
1254
All masking is computed from the luma blocks.
1255
We derive scaling factors for the chroma blocks from these, and use the same
1256
ones for all chroma blocks, regardless of the subsampling.
1257
It's possible for luma to be perfectly flat and yet have high chroma energy,
1258
but this is unlikely in non-artificial images, and not a case that has been
1259
addressed by any research to my knowledge.
1260
The output of the masking process is two scale factors, which are fed into
1261
the various R-D optimizations.
1262
The first, rd_scale, is applied to D in the equation
1263
D*rd_scale+lambda*R.
1264
This is the form that must be used to properly combine scores from multiple
1265
blocks, and can be interpreted as scaling distortions by their visibility.
1266
The inverse, rd_iscale, is applied to lambda in the equation
1267
D+rd_iscale*lambda*R.
1268
This is equivalent to the first form within a single block, but much faster
1269
to use when evaluating many possible distortions (e.g., during actual
1270
quantization, where separate distortions are evaluated for every
1271
coefficient).
1272
The two macros OC_RD_SCALE(rd_scale,d) and OC_RD_ISCALE(rd_iscale,lambda) are
1273
used to perform the multiplications with the proper re-scaling for the range
1274
of the scaling factors.
1275
Many researchers apply masking values directly to the quantizers used, and
1276
not to the R-D cost.
1277
Since we generally use MSE for D, rd_scale must use the square of their
1278
values to generate an equivalent effect.*/
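/*With luminance masking disabled below, the per-block scale computed here
reduces to roughly rd_scale ~ (act+4*act_avg)/(4*act+act_avg) (and rd_iscale
to its reciprocal), so distortion in blocks much busier than average is
discounted by up to a factor of 4 and distortion in unusually flat blocks is
inflated by up to the same factor.*/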
1279
static unsigned oc_mb_masking(unsigned _rd_scale[5],unsigned _rd_iscale[5],
1280
const ogg_uint16_t _chroma_rd_scale[2],const unsigned _activity[4],
1281
unsigned _activity_avg,unsigned _luma,unsigned _luma_avg){
1282
unsigned activity_sum;
1283
unsigned la;
1284
unsigned lb;
1285
unsigned d;
1286
int bi;
1287
int bi_min;
1288
int bi_min2;
1289
/*The ratio lb/la is meant to approximate
1290
((((_luma-16)/219)*(255/128))**0.649**0.4**2), which is the
1291
effective luminance masking from~\cite{LKW06} (including the self-masking
1292
deflator).
1293
The following actually turns out to be a pretty good approximation for
1294
_luma>75 or so.
1295
For smaller values luminance does not really follow Weber's Law anyway, and
1296
this approximation gives a much less aggressive bitrate boost in this
1297
region.
1298
Though some researchers claim that contrast sensitivity actually decreases
1299
for very low luminance values, in my experience excessive brightness on
1300
LCDs or buggy color conversions (e.g., treating Y' as full-range instead
1301
of the CCIR 601 range) make artifacts in such regions extremely visible.
1302
We substitute _luma_avg for 128 to allow the strength of the masking to
1303
vary with the actual average image luminance, within certain limits (the
1304
caller has clamped _luma_avg to the range [90,160], inclusive).
1305
@ARTICLE{LKW06,
1306
author="Zhen Liu and Lina J. Karam and Andrew B. Watson",
1307
title="{JPEG2000} Encoding With Perceptual Distortion Control",
1308
journal="{IEEE} Transactions on Image Processing",
1309
volume=15,
1310
number=7,
1311
pages="1763--1778",
1312
month=Jul,
1313
year=2006
1314
}*/
1315
#if 0
1316
la=_luma+4*_luma_avg;
1317
lb=4*_luma+_luma_avg;
1318
#else
1319
/*Disable luminance masking.*/
1320
la=lb=1;
1321
#endif
1322
activity_sum=0;
1323
for(bi=0;bi<4;bi++){
1324
unsigned a;
1325
unsigned b;
1326
activity_sum+=_activity[bi];
1327
/*Apply activity masking.*/
1328
a=_activity[bi]+4*_activity_avg;
1329
b=4*_activity[bi]+_activity_avg;
1330
d=OC_RD_SCALE(b,1);
1331
/*And luminance masking.*/
1332
d=(a+(d>>1))/d;
1333
_rd_scale[bi]=(d*la+(lb>>1))/lb;
1334
/*And now the inverse.*/
1335
d=OC_MAXI(OC_RD_ISCALE(a,1),1);
1336
d=(b+(d>>1))/d;
1337
_rd_iscale[bi]=(d*lb+(la>>1))/la;
1338
}
1339
/*Now compute scaling factors for chroma blocks.
1340
We start by finding the two smallest iscales from the luma blocks.*/
1341
bi_min=_rd_iscale[1]<_rd_iscale[0];
1342
bi_min2=1-bi_min;
1343
for(bi=2;bi<4;bi++){
1344
if(_rd_iscale[bi]<_rd_iscale[bi_min]){
1345
bi_min2=bi_min;
1346
bi_min=bi;
1347
}
1348
else if(_rd_iscale[bi]<_rd_iscale[bi_min2])bi_min2=bi;
1349
}
1350
/*If the minimum iscale is less than 1.0, use the second smallest instead,
1351
and force the value to at least 1.0 (inflating chroma is a waste).*/
1352
if(_rd_iscale[bi_min]<(1<<OC_RD_ISCALE_BITS))bi_min=bi_min2;
1353
d=OC_MINI(_rd_scale[bi_min],1<<OC_RD_SCALE_BITS);
1354
_rd_scale[4]=OC_RD_SCALE(d,_chroma_rd_scale[0]);
1355
d=OC_MAXI(_rd_iscale[bi_min],1<<OC_RD_ISCALE_BITS);
1356
_rd_iscale[4]=OC_RD_ISCALE(d,_chroma_rd_scale[1]);
1357
return activity_sum;
1358
}
1359
1360
static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
1361
unsigned _frag_satd[12]){
1362
const unsigned char *src;
1363
const ptrdiff_t *frag_buf_offs;
1364
const ptrdiff_t *sb_map;
1365
const oc_mb_map_plane *mb_map;
1366
const unsigned char *map_idxs;
1367
int map_nidxs;
1368
int mapii;
1369
int mapi;
1370
int ystride;
1371
int pli;
1372
int bi;
1373
ptrdiff_t fragi;
1374
ptrdiff_t frag_offs;
1375
unsigned luma;
1376
int dc;
1377
frag_buf_offs=_enc->state.frag_buf_offs;
1378
sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1379
src=_enc->state.ref_frame_data[OC_FRAME_IO];
1380
ystride=_enc->state.ref_ystride[0];
1381
luma=0;
1382
for(bi=0;bi<4;bi++){
1383
fragi=sb_map[bi];
1384
frag_offs=frag_buf_offs[fragi];
1385
_frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1386
luma+=dc;
1387
}
1388
mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
1389
map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1390
map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1391
/*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
1392
ystride=_enc->state.ref_ystride[1];
1393
for(mapii=4;mapii<map_nidxs;mapii++){
1394
mapi=map_idxs[mapii];
1395
pli=mapi>>2;
1396
bi=mapi&3;
1397
fragi=mb_map[pli][bi];
1398
frag_offs=frag_buf_offs[fragi];
1399
_frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1400
}
1401
return luma;
1402
}
1403
1404
/*Select luma block-level quantizers for a MB in an INTRA frame.*/
1405
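/*This is a small dynamic program: for each of the 4 luma blocks, in coding
order, and each candidate qii we keep the cheapest total cost of the blocks
so far given that choice, then pick the best final state and walk the prev[]
links backward to assign each block's qii.*/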
static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
1406
const oc_qii_state *_qs,unsigned _mbi,const unsigned _rd_scale[4]){
1407
const unsigned char *src;
1408
const ptrdiff_t *frag_buf_offs;
1409
const oc_sb_map *sb_maps;
1410
oc_fragment *frags;
1411
ptrdiff_t frag_offs;
1412
ptrdiff_t fragi;
1413
oc_qii_state qs[4][3];
1414
unsigned cost[4][3];
1415
unsigned ssd[4][3];
1416
unsigned rate[4][3];
1417
int prev[3][3];
1418
unsigned satd;
1419
int dc;
1420
unsigned best_cost;
1421
unsigned best_ssd;
1422
unsigned best_rate;
1423
int best_qii;
1424
int qii;
1425
int lambda;
1426
int ystride;
1427
int nqis;
1428
int bi;
1429
frag_buf_offs=_enc->state.frag_buf_offs;
1430
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1431
src=_enc->state.ref_frame_data[OC_FRAME_IO];
1432
ystride=_enc->state.ref_ystride[0];
1433
fragi=sb_maps[_mbi>>2][_mbi&3][0];
1434
frag_offs=frag_buf_offs[fragi];
1435
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1436
satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1437
}
1438
else{
1439
satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1440
}
1441
nqis=_enc->state.nqis;
1442
lambda=_enc->lambda;
1443
for(qii=0;qii<nqis;qii++){
1444
oc_qii_state_advance(qs[0]+qii,_qs,qii);
1445
rate[0][qii]=oc_dct_cost2(_enc,ssd[0]+qii,qii,0,0,satd)
1446
+(qs[0][qii].bits-_qs->bits<<OC_BIT_SCALE);
1447
ssd[0][qii]=OC_RD_SCALE(ssd[0][qii],_rd_scale[0]);
1448
cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda);
1449
}
1450
for(bi=1;bi<4;bi++){
1451
fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1452
frag_offs=frag_buf_offs[fragi];
1453
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1454
satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1455
}
1456
else{
1457
satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1458
}
1459
for(qii=0;qii<nqis;qii++){
1460
oc_qii_state qt[3];
1461
unsigned cur_ssd;
1462
unsigned cur_rate;
1463
int best_qij;
1464
int qij;
1465
oc_qii_state_advance(qt+0,qs[bi-1]+0,qii);
1466
cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,0,satd);
1467
cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
1468
best_ssd=ssd[bi-1][0]+cur_ssd;
1469
best_rate=rate[bi-1][0]+cur_rate
1470
+(qt[0].bits-qs[bi-1][0].bits<<OC_BIT_SCALE);
1471
best_cost=OC_MODE_RD_COST(best_ssd,best_rate,lambda);
1472
best_qij=0;
1473
for(qij=1;qij<nqis;qij++){
1474
unsigned chain_ssd;
1475
unsigned chain_rate;
1476
unsigned chain_cost;
1477
oc_qii_state_advance(qt+qij,qs[bi-1]+qij,qii);
1478
chain_ssd=ssd[bi-1][qij]+cur_ssd;
1479
chain_rate=rate[bi-1][qij]+cur_rate
1480
+(qt[qij].bits-qs[bi-1][qij].bits<<OC_BIT_SCALE);
1481
chain_cost=OC_MODE_RD_COST(chain_ssd,chain_rate,lambda);
1482
if(chain_cost<best_cost){
1483
best_cost=chain_cost;
1484
best_ssd=chain_ssd;
1485
best_rate=chain_rate;
1486
best_qij=qij;
1487
}
1488
}
1489
*(qs[bi]+qii)=*(qt+best_qij);
1490
cost[bi][qii]=best_cost;
1491
ssd[bi][qii]=best_ssd;
1492
rate[bi][qii]=best_rate;
1493
prev[bi-1][qii]=best_qij;
1494
}
1495
}
1496
best_qii=0;
1497
best_cost=cost[3][0];
1498
for(qii=1;qii<nqis;qii++){
1499
if(cost[3][qii]<best_cost){
1500
best_cost=cost[3][qii];
1501
best_qii=qii;
1502
}
1503
}
1504
frags=_enc->state.frags;
1505
for(bi=3;;){
1506
fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1507
frags[fragi].qii=best_qii;
1508
if(bi--<=0)break;
1509
best_qii=prev[bi][best_qii];
1510
}
1511
return best_cost;
1512
}
1513
1514
/*Select a block-level quantizer for a single chroma block in an INTRA frame.*/
1515
static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
1516
const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi,unsigned _rd_scale){
1517
const unsigned char *src;
1518
oc_fragment *frags;
1519
ptrdiff_t frag_offs;
1520
oc_qii_state qt[3];
1521
unsigned cost[3];
1522
unsigned satd;
1523
int dc;
1524
unsigned best_cost;
1525
int best_qii;
1526
int qii;
1527
int lambda;
1528
int ystride;
1529
int nqis;
1530
src=_enc->state.ref_frame_data[OC_FRAME_IO];
1531
ystride=_enc->state.ref_ystride[_pli];
1532
frag_offs=_enc->state.frag_buf_offs[_fragi];
1533
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1534
satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1535
}
1536
else{
1537
satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1538
}
1539
/*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
1540
worth spending the bits to change the AC quantizer.
1541
TODO: This may be worth revisiting when we separate out DC and AC
1542
predictions from SATD.*/
1543
#if 0
1544
nqis=_enc->state.nqis;
1545
#else
1546
nqis=1;
1547
#endif
1548
lambda=_enc->lambda;
1549
best_qii=0;
1550
for(qii=0;qii<nqis;qii++){
1551
unsigned cur_rate;
1552
unsigned cur_ssd;
1553
oc_qii_state_advance(qt+qii,_qs,qii);
1554
cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,_pli,0,satd)
1555
+(qt[qii].bits-_qs->bits<<OC_BIT_SCALE);
1556
cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
1557
cost[qii]=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda);
1558
}
1559
best_cost=cost[0];
1560
for(qii=1;qii<nqis;qii++){
1561
if(cost[qii]<best_cost){
1562
best_cost=cost[qii];
1563
best_qii=qii;
1564
}
1565
}
1566
frags=_enc->state.frags;
1567
frags[_fragi].qii=best_qii;
1568
return best_cost;
1569
}
1570
1571
static void oc_enc_mb_transform_quantize_intra_luma(oc_enc_ctx *_enc,
1572
oc_enc_pipeline_state *_pipe,unsigned _mbi,
1573
const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
1574
/*Worst case token stack usage for 4 fragments.*/
1575
oc_token_checkpoint stack[64*4];
1576
oc_token_checkpoint *stackptr;
1577
const oc_sb_map *sb_maps;
1578
oc_fragment *frags;
1579
ptrdiff_t *coded_fragis;
1580
ptrdiff_t ncoded_fragis;
1581
ptrdiff_t fragi;
1582
int bi;
1583
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1584
frags=_enc->state.frags;
1585
coded_fragis=_pipe->coded_fragis[0];
1586
ncoded_fragis=_pipe->ncoded_fragis[0];
1587
stackptr=stack;
1588
for(bi=0;bi<4;bi++){
1589
fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1590
frags[fragi].refi=OC_FRAME_SELF;
1591
frags[fragi].mb_mode=OC_MODE_INTRA;
1592
oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
1593
_rd_scale[bi],_rd_iscale[bi],NULL,NULL,&stackptr);
1594
coded_fragis[ncoded_fragis++]=fragi;
1595
}
1596
_pipe->ncoded_fragis[0]=ncoded_fragis;
1597
}
1598
1599
static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
1600
oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
1601
const ogg_uint16_t *mcu_rd_scale;
1602
const ogg_uint16_t *mcu_rd_iscale;
1603
const oc_sb_map *sb_maps;
1604
ptrdiff_t *coded_fragis;
1605
ptrdiff_t ncoded_fragis;
1606
ptrdiff_t froffset;
1607
int sbi;
1608
mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
1609
mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
1610
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1611
coded_fragis=_pipe->coded_fragis[_pli];
1612
ncoded_fragis=_pipe->ncoded_fragis[_pli];
1613
froffset=_pipe->froffset[_pli];
1614
for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
1615
/*Worst case token stack usage for 1 fragment.*/
1616
oc_token_checkpoint stack[64];
1617
int quadi;
1618
int bi;
1619
for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
1620
ptrdiff_t fragi;
1621
fragi=sb_maps[sbi][quadi][bi];
1622
if(fragi>=0){
1623
oc_token_checkpoint *stackptr;
1624
unsigned rd_scale;
1625
unsigned rd_iscale;
1626
rd_scale=mcu_rd_scale[fragi-froffset];
1627
rd_iscale=mcu_rd_iscale[fragi-froffset];
1628
oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi,rd_scale);
1629
stackptr=stack;
1630
oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
1631
rd_scale,rd_iscale,NULL,NULL,&stackptr);
1632
coded_fragis[ncoded_fragis++]=fragi;
1633
}
1634
}
1635
}
1636
_pipe->ncoded_fragis[_pli]=ncoded_fragis;
1637
}
1638
1639
/*Analysis stage for an INTRA frame.*/
1640
void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
1641
ogg_int64_t activity_sum;
1642
ogg_int64_t luma_sum;
1643
unsigned activity_avg;
1644
unsigned luma_avg;
1645
const ogg_uint16_t *chroma_rd_scale;
1646
ogg_uint16_t *mcu_rd_scale;
1647
ogg_uint16_t *mcu_rd_iscale;
1648
const unsigned char *map_idxs;
1649
int nmap_idxs;
1650
oc_sb_flags *sb_flags;
1651
signed char *mb_modes;
1652
const oc_mb_map *mb_maps;
1653
const oc_sb_map *sb_maps;
1654
oc_fragment *frags;
1655
unsigned stripe_sby;
1656
unsigned mcu_nvsbs;
1657
int notstart;
1658
int notdone;
1659
int refi;
1660
int pli;
1661
_enc->state.frame_type=OC_INTRA_FRAME;
1662
oc_enc_tokenize_start(_enc);
1663
oc_enc_pipeline_init(_enc,&_enc->pipe);
1664
oc_enc_mode_rd_init(_enc);
1665
activity_sum=luma_sum=0;
1666
activity_avg=_enc->activity_avg;
1667
luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
1668
chroma_rd_scale=_enc->chroma_rd_scale[OC_INTRA_FRAME][_enc->state.qis[0]];
1669
mcu_rd_scale=_enc->mcu_rd_scale;
1670
mcu_rd_iscale=_enc->mcu_rd_iscale;
1671
/*Choose MVs and MB modes and quantize and code luma.
1672
Must be done in Hilbert order.*/
1673
map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1674
nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1675
_enc->state.ncoded_fragis[0]=0;
1676
_enc->state.ncoded_fragis[1]=0;
1677
_enc->state.ncoded_fragis[2]=0;
1678
sb_flags=_enc->state.sb_flags;
1679
mb_modes=_enc->state.mb_modes;
1680
mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
1681
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1682
frags=_enc->state.frags;
1683
notstart=0;
1684
notdone=1;
1685
mcu_nvsbs=_enc->mcu_nvsbs;
1686
for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
1687
ptrdiff_t cfroffset;
1688
unsigned sbi;
1689
unsigned sbi_end;
1690
notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
1691
sbi_end=_enc->pipe.sbi_end[0];
1692
cfroffset=_enc->pipe.froffset[1];
1693
for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
1694
int quadi;
1695
/*Mode addressing is through Y plane, always 4 MB per SB.*/
1696
for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
1697
unsigned activity[4];
1698
unsigned rd_scale[5];
1699
unsigned rd_iscale[5];
1700
unsigned luma;
1701
unsigned mbi;
1702
int mapii;
1703
int mapi;
1704
int bi;
1705
ptrdiff_t fragi;
1706
mbi=sbi<<2|quadi;
1707
/*Activity masking.*/
1708
if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
1709
luma=oc_mb_activity(_enc,mbi,activity);
1710
}
1711
else{
1712
unsigned intra_satd[12];
1713
luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
1714
oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
1715
for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0;
1716
}
1717
activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
1718
chroma_rd_scale,activity,activity_avg,luma,luma_avg);
1719
luma_sum+=luma;
1720
/*Motion estimation:
1721
We do a basic 1MV search for all macroblocks, coded or not,
1722
keyframe or not, unless we aren't using motion estimation at all.*/
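/*The MVs found here are not used to code the keyframe itself (every block is
   coded INTRA); presumably the search keeps MV candidates fresh for the
   following delta frames, which is also why it is skipped when every frame is
   forced to be a keyframe.*/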
1723
if(!_recode&&_enc->state.curframe_num>0&&
1724
_enc->sp_level<OC_SP_LEVEL_NOMC&&_enc->keyframe_frequency_force>1){
1725
oc_mcenc_search(_enc,mbi);
1726
}
1727
if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
1728
oc_analyze_intra_mb_luma(_enc,_enc->pipe.qs+0,mbi,rd_scale);
1729
}
1730
mb_modes[mbi]=OC_MODE_INTRA;
1731
oc_enc_mb_transform_quantize_intra_luma(_enc,&_enc->pipe,
1732
mbi,rd_scale,rd_iscale);
1733
/*Propagate final MB mode and MVs to the chroma blocks.*/
1734
for(mapii=4;mapii<nmap_idxs;mapii++){
1735
mapi=map_idxs[mapii];
1736
pli=mapi>>2;
1737
bi=mapi&3;
1738
fragi=mb_maps[mbi][pli][bi];
1739
frags[fragi].refi=OC_FRAME_SELF;
1740
frags[fragi].mb_mode=OC_MODE_INTRA;
1741
}
1742
/*Save masking scale factors for chroma blocks.*/
1743
for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
1744
mapi=map_idxs[mapii];
1745
bi=mapi&3;
1746
fragi=mb_maps[mbi][1][bi];
1747
mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
1748
mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
1749
}
1750
}
1751
}
1752
oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
1753
/*Code chroma planes.*/
1754
for(pli=1;pli<3;pli++){
1755
oc_enc_sb_transform_quantize_intra_chroma(_enc,&_enc->pipe,
1756
pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
1757
oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
1758
}
1759
notstart=1;
1760
}
1761
/*Compute the average block activity and MB luma score for the frame.*/
1762
_enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
1763
(unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
1764
_enc->state.fplanes[0].nfrags));
1765
_enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
1766
/*Finish filling in the reference frame borders.*/
1767
refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
1768
for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
1769
_enc->state.ntotal_coded_fragis=_enc->state.nfrags;
1770
}
1771
1772
1773
1774
/*Cost information about a MB mode.*/
1775
struct oc_mode_choice{
1776
unsigned cost;
1777
unsigned ssd;
1778
unsigned rate;
1779
unsigned overhead;
1780
unsigned char qii[12];
1781
};
1782
1783
1784
1785
static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
1786
_modec->cost=OC_MODE_RD_COST(_modec->ssd,
1787
_modec->rate+_modec->overhead,_lambda);
1788
}
1789
1790
/*A set of skip SSD's to use to disable early skipping.*/
1791
static const unsigned OC_NOSKIP[12]={
1792
UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
1793
UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
1794
UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX
1795
};
1796
1797
/*The estimated number of bits used by a coded chroma block to specify the AC
1798
quantizer.
1799
TODO: Currently this is just 0.5*log2(3) (estimating about 50% compression);
1800
measurements suggest this is in the right ballpark, but it varies somewhat
1801
with lambda.*/
1802
#define OC_CHROMA_QII_RATE ((0xCAE00D1DU>>31-OC_BIT_SCALE)+1>>1)
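/*Note: 0xCAE00D1D is log2(3) in Q31 fixed point; shifting down to
   OC_BIT_SCALE fractional bits and then computing (x+1)>>1 gives a rounded
   0.5*log2(3), or roughly 0.79 bits.*/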
1803
1804
static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
1805
oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1806
const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
1807
const unsigned _rd_scale[4],int _qti){
1808
oc_fr_state fr;
1809
oc_qii_state qs;
1810
unsigned ssd;
1811
unsigned rate;
1812
unsigned satd;
1813
unsigned best_ssd;
1814
unsigned best_rate;
1815
int best_fri;
1816
int best_qii;
1817
int lambda;
1818
int nqis;
1819
int nskipped;
1820
int bi;
1821
lambda=_enc->lambda;
1822
nqis=_enc->state.nqis;
1823
/*We could do a trellis optimization here, but we don't make final skip
1824
decisions until after transform+quantization, so the result wouldn't be
1825
optimal anyway.
1826
Instead we just use a greedy approach; for most SATD values, the
1827
differences between the qiis are large enough to drown out the cost to
1828
code the flags, anyway.*/
1829
*&fr=*_fr;
1830
*&qs=*_qs;
1831
ssd=rate=nskipped=0;
1832
for(bi=0;bi<4;bi++){
1833
oc_fr_state ft[2];
1834
oc_qii_state qt[3];
1835
unsigned best_cost;
1836
unsigned cur_cost;
1837
unsigned cur_ssd;
1838
unsigned cur_rate;
1839
unsigned cur_overhead;
1840
int qii;
1841
satd=_frag_satd[bi];
1842
*(ft+0)=*&fr;
1843
oc_fr_code_block(ft+0);
1844
cur_overhead=ft[0].bits-fr.bits;
1845
best_rate=oc_dct_cost2(_enc,&best_ssd,0,0,_qti,satd)
1846
+(cur_overhead<<OC_BIT_SCALE);
1847
if(nqis>1){
1848
oc_qii_state_advance(qt+0,&qs,0);
1849
best_rate+=qt[0].bits-qs.bits<<OC_BIT_SCALE;
1850
}
1851
best_ssd=OC_RD_SCALE(best_ssd,_rd_scale[bi]);
1852
best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
1853
best_fri=0;
1854
best_qii=0;
1855
for(qii=1;qii<nqis;qii++){
1856
oc_qii_state_advance(qt+qii,&qs,qii);
1857
cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,_qti,satd)
1858
+(cur_overhead+qt[qii].bits-qs.bits<<OC_BIT_SCALE);
1859
cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
1860
cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
1861
if(cur_cost<best_cost){
1862
best_cost=cur_cost;
1863
best_ssd=cur_ssd;
1864
best_rate=cur_rate;
1865
best_qii=qii;
1866
}
1867
}
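/*Skipping is only considered when the precomputed skip SSD is small enough to
   scale without overflow (the UINT_MAX entries of OC_NOSKIP always fail this
   test, which is how early skipping is disabled), and nskipped<3 keeps at
   least one luma block coded, presumably because a candidate in which every
   block is skipped would correspond to an uncoded macro block.*/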
1868
if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)&&nskipped<3){
1869
*(ft+1)=*&fr;
1870
oc_fr_skip_block(ft+1);
1871
cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE;
1872
cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1873
cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda);
1874
if(cur_cost<=best_cost){
1875
best_ssd=cur_ssd;
1876
best_rate=cur_overhead;
1877
best_fri=1;
1878
best_qii+=4;
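/*Adding 4 pushes qii past any valid quantizer index; later code (e.g., the
   4MV cost function's check against nqis) uses this to detect blocks marked
   for skipping.*/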
1879
}
1880
}
1881
rate+=best_rate;
1882
ssd+=best_ssd;
1883
*&fr=*(ft+best_fri);
1884
if(best_fri==0)*&qs=*(qt+best_qii);
1885
else nskipped++;
1886
_modec->qii[bi]=best_qii;
1887
}
1888
_modec->ssd=ssd;
1889
_modec->rate=rate;
1890
}
1891
1892
static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
1893
oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1894
const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
1895
unsigned _rd_scale,int _qti){
1896
unsigned ssd;
1897
unsigned rate;
1898
unsigned satd;
1899
unsigned best_ssd;
1900
unsigned best_rate;
1901
int best_qii;
1902
unsigned cur_cost;
1903
unsigned cur_ssd;
1904
unsigned cur_rate;
1905
int lambda;
1906
int nblocks;
1907
int nqis;
1908
int pli;
1909
int bi;
1910
int qii;
1911
lambda=_enc->lambda;
1912
/*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
1913
worth spending the bits to change the AC quantizer.
1914
TODO: This may be worth revisiting when we separate out DC and AC
1915
predictions from SATD.*/
1916
#if 0
1917
nqis=_enc->state.nqis;
1918
#else
1919
nqis=1;
1920
#endif
1921
ssd=_modec->ssd;
1922
rate=_modec->rate;
1923
/*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded
1924
order, we assume a constant overhead for coded block and qii flags.*/
1925
nblocks=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1926
nblocks=(nblocks-4>>1)+4;
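/*nblocks now indexes one past the last Cb block; it is widened again after
   the Cb loop so that the second pass covers the Cr blocks.*/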
1927
bi=4;
1928
for(pli=1;pli<3;pli++){
1929
for(;bi<nblocks;bi++){
1930
unsigned best_cost;
1931
satd=_frag_satd[bi];
1932
best_rate=oc_dct_cost2(_enc,&best_ssd,0,pli,_qti,satd)
1933
+OC_CHROMA_QII_RATE;
1934
best_ssd=OC_RD_SCALE(best_ssd,_rd_scale);
1935
best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
1936
best_qii=0;
1937
for(qii=1;qii<nqis;qii++){
1938
cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,pli,_qti,satd)
1939
+OC_CHROMA_QII_RATE;
1940
cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
1941
cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
1942
if(cur_cost<best_cost){
1943
best_cost=cur_cost;
1944
best_ssd=cur_ssd;
1945
best_rate=cur_rate;
1946
best_qii=qii;
1947
}
1948
}
1949
if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)){
1950
cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1951
cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
1952
if(cur_cost<=best_cost){
1953
best_ssd=cur_ssd;
1954
best_rate=0;
1955
best_qii+=4;
1956
}
1957
}
1958
rate+=best_rate;
1959
ssd+=best_ssd;
1960
_modec->qii[bi]=best_qii;
1961
}
1962
nblocks=(nblocks-4<<1)+4;
1963
}
1964
_modec->ssd=ssd;
1965
_modec->rate=rate;
1966
}
1967
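/*Estimate the SSD cost of leaving each block of this macro block uncoded,
   i.e., of reusing the co-located block from the previous reconstructed frame,
   and store the results for the skip decisions made by the mode cost
   functions.*/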
1968
static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
1969
unsigned _mbi,const unsigned _rd_scale[4],unsigned _ssd[12]){
1970
const unsigned char *src;
1971
const unsigned char *ref;
1972
int ystride;
1973
const oc_fragment *frags;
1974
const ptrdiff_t *frag_buf_offs;
1975
const ptrdiff_t *sb_map;
1976
const oc_mb_map_plane *mb_map;
1977
const unsigned char *map_idxs;
1978
oc_mv *mvs;
1979
int map_nidxs;
1980
unsigned uncoded_ssd;
1981
int mapii;
1982
int mapi;
1983
int pli;
1984
int bi;
1985
ptrdiff_t fragi;
1986
ptrdiff_t frag_offs;
1987
int borderi;
1988
src=_enc->state.ref_frame_data[OC_FRAME_IO];
1989
ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
1990
ystride=_enc->state.ref_ystride[0];
1991
frags=_enc->state.frags;
1992
frag_buf_offs=_enc->state.frag_buf_offs;
1993
sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1994
mvs=_enc->mb_info[_mbi].block_mv;
1995
for(bi=0;bi<4;bi++){
1996
fragi=sb_map[bi];
1997
borderi=frags[fragi].borderi;
1998
frag_offs=frag_buf_offs[fragi];
1999
if(borderi<0){
2000
uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
2001
}
2002
else{
2003
uncoded_ssd=oc_enc_frag_border_ssd(_enc,
2004
src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
2005
}
2006
/*Scale to match DCT domain and RD.*/
2007
uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[bi]);
2008
/*Motion is a special case; if there is more than a full-pixel motion
2009
against the prior frame, penalize skipping.
2010
TODO: The factor of two here is a kludge, but it tested out better than a
2011
hard limit.*/
2012
if(mvs[bi]!=0)uncoded_ssd*=2;
2013
_pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=_ssd[bi]=uncoded_ssd;
2014
}
2015
mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2016
map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2017
map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2018
map_nidxs=(map_nidxs-4>>1)+4;
2019
mapii=4;
2020
mvs=_enc->mb_info[_mbi].unref_mv;
2021
for(pli=1;pli<3;pli++){
2022
ystride=_enc->state.ref_ystride[pli];
2023
for(;mapii<map_nidxs;mapii++){
2024
mapi=map_idxs[mapii];
2025
bi=mapi&3;
2026
fragi=mb_map[pli][bi];
2027
borderi=frags[fragi].borderi;
2028
frag_offs=frag_buf_offs[fragi];
2029
if(borderi<0){
2030
uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
2031
}
2032
else{
2033
uncoded_ssd=oc_enc_frag_border_ssd(_enc,
2034
src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
2035
}
2036
/*Scale to match DCT domain and RD.*/
2037
uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[4]);
2038
/*Motion is a special case; if there is more than a full-pixel motion
2039
against the prior frame, penalize skipping.
2040
TODO: The factor of two here is a kludge, but it tested out better than
2041
a hard limit.*/
2042
if(mvs[OC_FRAME_PREV]!=0)uncoded_ssd*=2;
2043
_pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=_ssd[mapii]=uncoded_ssd;
2044
}
2045
map_nidxs=(map_nidxs-4<<1)+4;
2046
}
2047
}
2048
2049
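/*Estimate the R-D cost of coding this macro block in INTRA mode, including
   the mode-signalling overhead reported by the scheme chooser.*/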
2050
static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2051
unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs,
2052
const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
2053
const unsigned _rd_scale[5]){
2054
oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,_rd_scale,0);
2055
oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2056
_frag_satd,_skip_ssd,_rd_scale[4],0);
2057
_modec->overhead=
2058
oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE;
2059
oc_mode_set_cost(_modec,_enc->lambda);
2060
}
2061
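/*Estimate the R-D cost of coding this macro block in a single-MV inter mode
   against the reference frame implied by _mb_mode.
  The SATD (or plain SAD at high speed levels) of the motion-compensated
   residual feeds the same luma/chroma R-D models used for INTRA.*/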
2062
static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2063
unsigned _mbi,int _mb_mode,oc_mv _mv,
2064
const oc_fr_state *_fr,const oc_qii_state *_qs,
2065
const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
2066
unsigned frag_satd[12];
2067
const unsigned char *src;
2068
const unsigned char *ref;
2069
int ystride;
2070
const ptrdiff_t *frag_buf_offs;
2071
const ptrdiff_t *sb_map;
2072
const oc_mb_map_plane *mb_map;
2073
const unsigned char *map_idxs;
2074
int map_nidxs;
2075
int mapii;
2076
int mapi;
2077
int mv_offs[2];
2078
int pli;
2079
int bi;
2080
ptrdiff_t fragi;
2081
ptrdiff_t frag_offs;
2082
int dc;
2083
src=_enc->state.ref_frame_data[OC_FRAME_IO];
2084
ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)];
2085
ystride=_enc->state.ref_ystride[0];
2086
frag_buf_offs=_enc->state.frag_buf_offs;
2087
sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
2088
_modec->rate=_modec->ssd=0;
2089
if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv)>1){
2090
for(bi=0;bi<4;bi++){
2091
fragi=sb_map[bi];
2092
frag_offs=frag_buf_offs[fragi];
2093
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2094
frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2095
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2096
frag_satd[bi]+=abs(dc);
2097
}
2098
else{
2099
frag_satd[bi]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
2100
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
2101
}
2102
}
2103
}
2104
else{
2105
for(bi=0;bi<4;bi++){
2106
fragi=sb_map[bi];
2107
frag_offs=frag_buf_offs[fragi];
2108
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2109
frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2110
ref+frag_offs+mv_offs[0],ystride);
2111
frag_satd[bi]+=abs(dc);
2112
}
2113
else{
2114
frag_satd[bi]=oc_enc_frag_sad(_enc,src+frag_offs,
2115
ref+frag_offs+mv_offs[0],ystride);
2116
}
2117
}
2118
}
2119
mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2120
map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2121
map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2122
/*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
2123
ystride=_enc->state.ref_ystride[1];
2124
if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,_mv)>1){
2125
for(mapii=4;mapii<map_nidxs;mapii++){
2126
mapi=map_idxs[mapii];
2127
pli=mapi>>2;
2128
bi=mapi&3;
2129
fragi=mb_map[pli][bi];
2130
frag_offs=frag_buf_offs[fragi];
2131
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2132
frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2133
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2134
frag_satd[mapii]+=abs(dc);
2135
}
2136
else{
2137
frag_satd[mapii]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
2138
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
2139
}
2140
}
2141
}
2142
else{
2143
for(mapii=4;mapii<map_nidxs;mapii++){
2144
mapi=map_idxs[mapii];
2145
pli=mapi>>2;
2146
bi=mapi&3;
2147
fragi=mb_map[pli][bi];
2148
frag_offs=frag_buf_offs[fragi];
2149
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2150
frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2151
ref+frag_offs+mv_offs[0],ystride);
2152
frag_satd[mapii]+=abs(dc);
2153
}
2154
else{
2155
frag_satd[mapii]=oc_enc_frag_sad(_enc,src+frag_offs,
2156
ref+frag_offs+mv_offs[0],ystride);
2157
}
2158
}
2159
}
2160
oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1);
2161
oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2162
frag_satd,_skip_ssd,_rd_scale[4],1);
2163
_modec->overhead=
2164
oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE;
2165
oc_mode_set_cost(_modec,_enc->lambda);
2166
}
2167
2168
static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2169
unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs,
2170
const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
2171
oc_cost_inter(_enc,_modec,_mbi,_mb_mode,0,_fr,_qs,_skip_ssd,_rd_scale);
2172
}
2173
2174
static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2175
unsigned _mbi,int _mb_mode,oc_mv _mv,
2176
const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12],
2177
const unsigned _rd_scale[5]){
2178
int bits0;
2179
oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd,_rd_scale);
2180
bits0=OC_MV_BITS[0][OC_MV_X(_mv)+31]+OC_MV_BITS[0][OC_MV_Y(_mv)+31];
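/*The overhead charged for the MV is its marginal cost: the increase in the
   running total of whichever MV coding scheme (component-wise variable-length
   codes vs. a fixed 12 bits per MV) is currently cheaper for the frame.*/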
2181
_modec->overhead+=OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+12)
2182
-OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
2183
oc_mode_set_cost(_modec,_enc->lambda);
2184
return bits0;
2185
}
2186
2187
/*A mapping from oc_mb_map (raster) ordering to oc_sb_map (Hilbert) ordering.*/
2188
static const unsigned char OC_MB_PHASE[4][4]={
2189
{0,1,3,2},{0,3,1,2},{0,3,1,2},{2,3,1,0}
2190
};
2191
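/*Estimate the R-D cost of coding this macro block with four separate block
   MVs.
  Luma SATDs are stored in coding (Hilbert) order via OC_MB_PHASE, chroma MVs
   are derived from the luma MVs, and MV bits are only charged for blocks that
   are not marked as skipped.*/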
2192
static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2193
unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs,
2194
const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
2195
unsigned frag_satd[12];
2196
oc_mv lbmvs[4];
2197
oc_mv cbmvs[4];
2198
const unsigned char *src;
2199
const unsigned char *ref;
2200
int ystride;
2201
const ptrdiff_t *frag_buf_offs;
2202
oc_mv *frag_mvs;
2203
const oc_mb_map_plane *mb_map;
2204
const unsigned char *map_idxs;
2205
int map_nidxs;
2206
int nqis;
2207
int mapii;
2208
int mapi;
2209
int mv_offs[2];
2210
int pli;
2211
int bi;
2212
ptrdiff_t fragi;
2213
ptrdiff_t frag_offs;
2214
int bits0;
2215
int bits1;
2216
unsigned satd;
2217
int dc;
2218
src=_enc->state.ref_frame_data[OC_FRAME_IO];
2219
ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
2220
ystride=_enc->state.ref_ystride[0];
2221
frag_buf_offs=_enc->state.frag_buf_offs;
2222
frag_mvs=_enc->state.frag_mvs;
2223
mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2224
_modec->rate=_modec->ssd=0;
2225
for(bi=0;bi<4;bi++){
2226
fragi=mb_map[0][bi];
2227
/*Save the block MVs as the current ones while we're here; we'll replace
2228
them if we don't ultimately choose 4MV mode.*/
2229
frag_mvs[fragi]=_mv[bi];
2230
frag_offs=frag_buf_offs[fragi];
2231
if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv[bi])>1){
2232
satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2233
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2234
}
2235
else{
2236
satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2237
ref+frag_offs+mv_offs[0],ystride);
2238
}
2239
frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+abs(dc);
2240
}
2241
oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
2242
_enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1);
2243
/*Figure out which blocks are being skipped and give them (0,0) MVs.*/
2244
bits0=0;
2245
bits1=0;
2246
nqis=_enc->state.nqis;
2247
for(bi=0;bi<4;bi++){
2248
if(_modec->qii[OC_MB_PHASE[_mbi&3][bi]]>=nqis)lbmvs[bi]=0;
2249
else{
2250
lbmvs[bi]=_mv[bi];
2251
bits0+=OC_MV_BITS[0][OC_MV_X(_mv[bi])+31]
2252
+OC_MV_BITS[0][OC_MV_Y(_mv[bi])+31];
2253
bits1+=12;
2254
}
2255
}
2256
(*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,lbmvs);
2257
map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2258
map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2259
/*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
2260
ystride=_enc->state.ref_ystride[1];
2261
for(mapii=4;mapii<map_nidxs;mapii++){
2262
mapi=map_idxs[mapii];
2263
pli=mapi>>2;
2264
bi=mapi&3;
2265
fragi=mb_map[pli][bi];
2266
frag_offs=frag_buf_offs[fragi];
2267
/*TODO: We could save half these calls by re-using the results for the Cb
2268
and Cr planes; is it worth it?*/
2269
if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,cbmvs[bi])>1){
2270
satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2271
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2272
}
2273
else{
2274
satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2275
ref+frag_offs+mv_offs[0],ystride);
2276
}
2277
frag_satd[mapii]=satd+abs(dc);
2278
}
2279
oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2280
frag_satd,_skip_ssd,_rd_scale[4],1);
2281
_modec->overhead=
2282
oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR)
2283
+OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+bits1)
2284
-OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
2285
oc_mode_set_cost(_modec,_enc->lambda);
2286
}
2287
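/*Analysis stage for an INTER (delta) frame.
  Returns 1 if the accumulated inter-frame cost exceeds the estimated intra
   cost, i.e., the frame should be coded as a keyframe instead.*/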
2288
int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
2289
oc_set_chroma_mvs_func set_chroma_mvs;
2290
oc_qii_state intra_luma_qs;
2291
oc_mv last_mv;
2292
oc_mv prior_mv;
2293
ogg_int64_t interbits;
2294
ogg_int64_t intrabits;
2295
ogg_int64_t activity_sum;
2296
ogg_int64_t luma_sum;
2297
unsigned activity_avg;
2298
unsigned luma_avg;
2299
const ogg_uint16_t *chroma_rd_scale;
2300
ogg_uint16_t *mcu_rd_scale;
2301
ogg_uint16_t *mcu_rd_iscale;
2302
const unsigned char *map_idxs;
2303
int nmap_idxs;
2304
unsigned *coded_mbis;
2305
unsigned *uncoded_mbis;
2306
size_t ncoded_mbis;
2307
size_t nuncoded_mbis;
2308
oc_sb_flags *sb_flags;
2309
signed char *mb_modes;
2310
const oc_sb_map *sb_maps;
2311
const oc_mb_map *mb_maps;
2312
oc_mb_enc_info *embs;
2313
oc_fragment *frags;
2314
oc_mv *frag_mvs;
2315
unsigned stripe_sby;
2316
unsigned mcu_nvsbs;
2317
int notstart;
2318
int notdone;
2319
unsigned sbi;
2320
unsigned sbi_end;
2321
int refi;
2322
int pli;
2323
int sp_level;
2324
sp_level=_enc->sp_level;
2325
set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
2326
_enc->state.frame_type=OC_INTER_FRAME;
2327
oc_mode_scheme_chooser_reset(&_enc->chooser);
2328
oc_enc_tokenize_start(_enc);
2329
oc_enc_pipeline_init(_enc,&_enc->pipe);
2330
oc_enc_mode_rd_init(_enc);
2331
if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
2332
_enc->mv_bits[0]=_enc->mv_bits[1]=0;
2333
interbits=intrabits=0;
2334
activity_sum=luma_sum=0;
2335
activity_avg=_enc->activity_avg;
2336
luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
2337
chroma_rd_scale=_enc->chroma_rd_scale[OC_INTER_FRAME][_enc->state.qis[0]];
2338
mcu_rd_scale=_enc->mcu_rd_scale;
2339
mcu_rd_iscale=_enc->mcu_rd_iscale;
2340
last_mv=prior_mv=0;
2341
/*Choose MVs and MB modes and quantize and code luma.
2342
Must be done in Hilbert order.*/
2343
map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2344
nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2345
coded_mbis=_enc->coded_mbis;
2346
uncoded_mbis=coded_mbis+_enc->state.nmbs;
2347
ncoded_mbis=0;
2348
nuncoded_mbis=0;
2349
_enc->state.ncoded_fragis[0]=0;
2350
_enc->state.ncoded_fragis[1]=0;
2351
_enc->state.ncoded_fragis[2]=0;
2352
sb_flags=_enc->state.sb_flags;
2353
mb_modes=_enc->state.mb_modes;
2354
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
2355
mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
2356
embs=_enc->mb_info;
2357
frags=_enc->state.frags;
2358
frag_mvs=_enc->state.frag_mvs;
2359
notstart=0;
2360
notdone=1;
2361
mcu_nvsbs=_enc->mcu_nvsbs;
2362
for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
2363
ptrdiff_t cfroffset;
2364
notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
2365
sbi_end=_enc->pipe.sbi_end[0];
2366
cfroffset=_enc->pipe.froffset[1];
2367
for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
2368
int quadi;
2369
/*Mode addressing is through Y plane, always 4 MB per SB.*/
2370
for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
2371
oc_mode_choice modes[8];
2372
unsigned activity[4];
2373
unsigned rd_scale[5];
2374
unsigned rd_iscale[5];
2375
unsigned skip_ssd[12];
2376
unsigned intra_satd[12];
2377
unsigned luma;
2378
int mb_mv_bits_0;
2379
int mb_gmv_bits_0;
2380
int inter_mv_pref;
2381
int mb_mode;
2382
int refi;
2383
int mv;
2384
unsigned mbi;
2385
int mapii;
2386
int mapi;
2387
int bi;
2388
ptrdiff_t fragi;
2389
mbi=sbi<<2|quadi;
2390
luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
2391
/*Activity masking.*/
2392
if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
2393
oc_mb_activity(_enc,mbi,activity);
2394
}
2395
else oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
2396
luma_sum+=luma;
2397
activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
2398
chroma_rd_scale,activity,activity_avg,luma,luma_avg);
2399
/*Motion estimation:
2400
We always do a basic 1MV search for all macroblocks, coded or not,
2401
keyframe or not.*/
2402
if(!_recode&&sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
2403
mv=0;
2404
/*Find the block choice with the lowest estimated coding cost.
2405
If a Cb or Cr block of a macro block is coded but none of its Y' blocks are,
2406
the mode MUST be OC_MODE_INTER_NOMV.
2407
This is the default state to which the mode data structure is
2408
initialised in encoder and decoder at the start of each frame.*/
2409
/*Block coding cost is estimated from correlated SATD metrics.*/
2410
/*At this point, all blocks that are in the frame are still marked coded.*/
2411
if(!_recode){
2412
embs[mbi].unref_mv[OC_FRAME_GOLD]=
2413
embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
2414
embs[mbi].unref_mv[OC_FRAME_PREV]=
2415
embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2416
embs[mbi].refined=0;
2417
}
2418
/*Estimate the cost of coding this MB in a keyframe.*/
2419
if(_allow_keyframe){
2420
oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2421
_enc->pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale);
2422
intrabits+=modes[OC_MODE_INTRA].rate;
2423
for(bi=0;bi<4;bi++){
2424
oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
2425
modes[OC_MODE_INTRA].qii[bi]);
2426
}
2427
}
2428
/*Estimate the cost in a delta frame for various modes.*/
2429
oc_skip_cost(_enc,&_enc->pipe,mbi,rd_scale,skip_ssd);
2430
if(sp_level<OC_SP_LEVEL_NOMC){
2431
oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
2432
OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2433
skip_ssd,rd_scale);
2434
oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2435
_enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
2436
mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
2437
OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
2438
_enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2439
oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
2440
OC_MODE_INTER_MV_LAST,last_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2441
skip_ssd,rd_scale);
2442
oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
2443
OC_MODE_INTER_MV_LAST2,prior_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2444
skip_ssd,rd_scale);
2445
oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
2446
OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2447
skip_ssd,rd_scale);
2448
mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
2449
OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
2450
_enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2451
/*The explicit MV modes (2,6,7) have not yet gone through halfpel
2452
refinement.
2453
We choose the explicit MV mode that's already furthest ahead on
2454
R-D cost and refine only that one.
2455
We have to be careful to remember which ones we've refined so that
2456
we don't refine it again if we re-encode this frame.*/
2457
inter_mv_pref=_enc->lambda*3;
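/*This bias favors OC_MODE_INTER_MV in the final decision below: it can win
   even when it costs up to inter_mv_pref more than the best mode so far,
   except against LAST and LAST2, which clear the preference.*/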
2458
if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
2459
oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
2460
embs[mbi].block_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2461
skip_ssd,rd_scale);
2462
}
2463
else{
2464
modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX;
2465
}
2466
if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
2467
modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
2468
if(!(embs[mbi].refined&0x80)){
2469
oc_mcenc_refine4mv(_enc,mbi);
2470
embs[mbi].refined|=0x80;
2471
}
2472
oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
2473
embs[mbi].ref_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2474
skip_ssd,rd_scale);
2475
}
2476
else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
2477
modes[OC_MODE_INTER_MV].cost){
2478
if(!(embs[mbi].refined&0x40)){
2479
oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
2480
embs[mbi].refined|=0x40;
2481
}
2482
mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
2483
OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
2484
_enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2485
}
2486
if(!(embs[mbi].refined&0x04)){
2487
oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
2488
embs[mbi].refined|=0x04;
2489
}
2490
mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
2491
OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
2492
_enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2493
/*Finally, pick the mode with the cheapest estimated R-D cost.*/
2494
mb_mode=OC_MODE_INTER_NOMV;
2495
if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
2496
mb_mode=OC_MODE_INTRA;
2497
}
2498
if(modes[OC_MODE_INTER_MV_LAST].cost<modes[mb_mode].cost){
2499
mb_mode=OC_MODE_INTER_MV_LAST;
2500
}
2501
if(modes[OC_MODE_INTER_MV_LAST2].cost<modes[mb_mode].cost){
2502
mb_mode=OC_MODE_INTER_MV_LAST2;
2503
}
2504
if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
2505
mb_mode=OC_MODE_GOLDEN_NOMV;
2506
}
2507
if(modes[OC_MODE_GOLDEN_MV].cost<modes[mb_mode].cost){
2508
mb_mode=OC_MODE_GOLDEN_MV;
2509
}
2510
if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[mb_mode].cost){
2511
mb_mode=OC_MODE_INTER_MV_FOUR;
2512
}
2513
/*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
2514
if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
2515
inter_mv_pref=0;
2516
}
2517
if(modes[OC_MODE_INTER_MV].cost<modes[mb_mode].cost+inter_mv_pref){
2518
mb_mode=OC_MODE_INTER_MV;
2519
}
2520
}
2521
else{
2522
oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
2523
OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2524
skip_ssd,rd_scale);
2525
oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2526
_enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
2527
oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
2528
OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2529
skip_ssd,rd_scale);
2530
mb_mode=OC_MODE_INTER_NOMV;
2531
if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
2532
mb_mode=OC_MODE_INTRA;
2533
}
2534
if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
2535
mb_mode=OC_MODE_GOLDEN_NOMV;
2536
}
2537
mb_mv_bits_0=mb_gmv_bits_0=0;
2538
}
2539
mb_modes[mbi]=mb_mode;
2540
/*Propagate the MVs to the luma blocks.*/
2541
if(mb_mode!=OC_MODE_INTER_MV_FOUR){
2542
switch(mb_mode){
2543
case OC_MODE_INTER_MV:{
2544
mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2545
}break;
2546
case OC_MODE_INTER_MV_LAST:mv=last_mv;break;
2547
case OC_MODE_INTER_MV_LAST2:mv=prior_mv;break;
2548
case OC_MODE_GOLDEN_MV:{
2549
mv=embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
2550
}break;
2551
}
2552
for(bi=0;bi<4;bi++){
2553
fragi=mb_maps[mbi][0][bi];
2554
frag_mvs[fragi]=mv;
2555
}
2556
}
2557
for(bi=0;bi<4;bi++){
2558
fragi=sb_maps[mbi>>2][mbi&3][bi];
2559
frags[fragi].qii=modes[mb_mode].qii[bi];
2560
}
2561
if(oc_enc_mb_transform_quantize_inter_luma(_enc,&_enc->pipe,mbi,
2562
modes[mb_mode].overhead>>OC_BIT_SCALE,rd_scale,rd_iscale)>0){
2563
int orig_mb_mode;
2564
orig_mb_mode=mb_mode;
2565
mb_mode=mb_modes[mbi];
2566
refi=OC_FRAME_FOR_MODE(mb_mode);
2567
switch(mb_mode){
2568
case OC_MODE_INTER_MV:{
2569
prior_mv=last_mv;
2570
/*If we're backing out from 4MV, find the MV we're actually
2571
using.*/
2572
if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
2573
for(bi=0;;bi++){
2574
fragi=mb_maps[mbi][0][bi];
2575
if(frags[fragi].coded){
2576
mv=last_mv=frag_mvs[fragi];
2577
break;
2578
}
2579
}
2580
mb_mv_bits_0=OC_MV_BITS[0][OC_MV_X(mv)+31]
2581
+OC_MV_BITS[0][OC_MV_Y(mv)+31];
2582
}
2583
/*Otherwise we used the original analysis MV.*/
2584
else last_mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2585
_enc->mv_bits[0]+=mb_mv_bits_0;
2586
_enc->mv_bits[1]+=12;
2587
}break;
2588
case OC_MODE_INTER_MV_LAST2:{
2589
oc_mv tmp_mv;
2590
tmp_mv=prior_mv;
2591
prior_mv=last_mv;
2592
last_mv=tmp_mv;
2593
}break;
2594
case OC_MODE_GOLDEN_MV:{
2595
_enc->mv_bits[0]+=mb_gmv_bits_0;
2596
_enc->mv_bits[1]+=12;
2597
}break;
2598
case OC_MODE_INTER_MV_FOUR:{
2599
oc_mv lbmvs[4];
2600
oc_mv cbmvs[4];
2601
prior_mv=last_mv;
2602
for(bi=0;bi<4;bi++){
2603
fragi=mb_maps[mbi][0][bi];
2604
if(frags[fragi].coded){
2605
lbmvs[bi]=last_mv=frag_mvs[fragi];
2606
_enc->mv_bits[0]+=OC_MV_BITS[0][OC_MV_X(last_mv)+31]
2607
+OC_MV_BITS[0][OC_MV_Y(last_mv)+31];
2608
_enc->mv_bits[1]+=12;
2609
}
2610
/*Replace the block MVs for not-coded blocks with (0,0).*/
2611
else lbmvs[bi]=0;
2612
}
2613
(*set_chroma_mvs)(cbmvs,lbmvs);
2614
for(mapii=4;mapii<nmap_idxs;mapii++){
2615
mapi=map_idxs[mapii];
2616
pli=mapi>>2;
2617
bi=mapi&3;
2618
fragi=mb_maps[mbi][pli][bi];
2619
frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
2620
frags[fragi].refi=refi;
2621
frags[fragi].mb_mode=mb_mode;
2622
frag_mvs[fragi]=cbmvs[bi];
2623
}
2624
}break;
2625
}
2626
coded_mbis[ncoded_mbis++]=mbi;
2627
oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
2628
interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
2629
}
2630
else{
2631
*(uncoded_mbis-++nuncoded_mbis)=mbi;
2632
mb_mode=OC_MODE_INTER_NOMV;
2633
refi=OC_FRAME_PREV;
2634
mv=0;
2635
}
2636
/*Propagate final MB mode and MVs to the chroma blocks.
2637
This has already been done for 4MV mode, since it requires individual
2638
block motion vectors.*/
2639
if(mb_mode!=OC_MODE_INTER_MV_FOUR){
2640
for(mapii=4;mapii<nmap_idxs;mapii++){
2641
mapi=map_idxs[mapii];
2642
pli=mapi>>2;
2643
bi=mapi&3;
2644
fragi=mb_maps[mbi][pli][bi];
2645
/*If we switched from 4MV mode to INTER_MV mode, then the qii
2646
values won't have been chosen with the right MV, but it's
2647
probably not worth re-estimating them.*/
2648
frags[fragi].qii=modes[mb_mode].qii[mapii];
2649
frags[fragi].refi=refi;
2650
frags[fragi].mb_mode=mb_mode;
2651
frag_mvs[fragi]=mv;
2652
}
2653
}
2654
/*Save masking scale factors for chroma blocks.*/
2655
for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
2656
mapi=map_idxs[mapii];
2657
bi=mapi&3;
2658
fragi=mb_maps[mbi][1][bi];
2659
mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
2660
mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
2661
}
2662
}
2663
oc_fr_state_flush_sb(_enc->pipe.fr+0);
2664
sb_flags[sbi].coded_fully=_enc->pipe.fr[0].sb_full;
2665
sb_flags[sbi].coded_partially=_enc->pipe.fr[0].sb_partial;
2666
}
2667
oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
2668
/*Code chroma planes.*/
2669
for(pli=1;pli<3;pli++){
2670
oc_enc_sb_transform_quantize_inter_chroma(_enc,&_enc->pipe,
2671
pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
2672
oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
2673
}
2674
notstart=1;
2675
}
2676
/*Update the average block activity and MB luma score for the frame.
2677
We could use a Bessel follower here, but fast reaction is probably almost
2678
always best.*/
2679
_enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
2680
(unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
2681
_enc->state.fplanes[0].nfrags));
2682
_enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
2683
/*Finish filling in the reference frame borders.*/
2684
refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
2685
for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
2686
/*Finish adding flagging overhead costs to inter bit counts to determine if
2687
we should have coded a key frame instead.*/
2688
if(_allow_keyframe){
2689
/*Technically the chroma plane counts are over-estimations, because they
2690
don't account for continuing runs from the luma planes, but the
2691
inaccuracy is small.
2692
We don't need to add the luma plane coding flag costs, because they are
2693
already included in the MB rate estimates.*/
2694
for(pli=1;pli<3;pli++)interbits+=_enc->pipe.fr[pli].bits<<OC_BIT_SCALE;
2695
if(interbits>intrabits)return 1;
2696
}
2697
_enc->ncoded_mbis=ncoded_mbis;
2698
/*Compact the coded fragment list.*/
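/*Each plane's coded-fragment list was accumulated at that plane's fragment
   offset; move the chroma lists down so that all coded fragment indices form
   one contiguous array.*/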
2699
{
2700
ptrdiff_t ncoded_fragis;
2701
ncoded_fragis=_enc->state.ncoded_fragis[0];
2702
for(pli=1;pli<3;pli++){
2703
memmove(_enc->state.coded_fragis+ncoded_fragis,
2704
_enc->state.coded_fragis+_enc->state.fplanes[pli].froffset,
2705
_enc->state.ncoded_fragis[pli]*sizeof(*_enc->state.coded_fragis));
2706
ncoded_fragis+=_enc->state.ncoded_fragis[pli];
2707
}
2708
_enc->state.ntotal_coded_fragis=ncoded_fragis;
2709
}
2710
return 0;
2711
}
2712
2713