/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009,2025           *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************

  function: mode selection code

 ********************************************************************/
#include <limits.h>
#include <string.h>
#include "encint.h"
#include "modedec.h"
#if defined(OC_COLLECT_METRICS)
# include "collect.c"
#endif
23
24
25
26
typedef struct oc_rd_metric oc_rd_metric;
27
typedef struct oc_mode_choice oc_mode_choice;
28
29
30
31
/*There are 8 possible schemes used to encode macro block modes.
32
Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
33
The same set of Huffman codes is used for each of these 7 schemes, but the
34
mode assigned to each codeword varies.
35
Scheme 0 writes a custom mapping from codeword to MB mode to the bitstream,
36
while schemes 1-6 have a fixed mapping.
37
Scheme 7 just encodes each mode directly in 3 bits.*/
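/*For reference: the shared Huffman alphabet is maximally skewed, so the mode
a scheme ranks at position k costs nominally k+1 bits for k<7 and 7 bits for
k==7 (the lengths looked up in OC_MODE_BITS), versus a flat 3 bits per mode
for scheme 7.
A skewed scheme therefore only pays off when its ranking closely matches the
actual distribution of selected modes.*/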
38
39
/*The mode orderings for the various mode coding schemes.
40
Scheme 0 uses a custom alphabet, which is not stored in this table.
41
This is the inverse of the equivalent table OC_MODE_ALPHABETS in the
42
decoder.*/
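/*Each row is indexed by mode and stores that mode's rank (codeword index)
under the corresponding scheme, while the header comments list the modes in
rank order.
E.g., in the first row ("L P M N I G GM 4") the last-MV mode gets rank 0 and
the shortest codeword, which is recorded by storing 0 at that mode's index.*/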
43
static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={
44
/*Last MV dominates.*/
45
/*L P M N I G GM 4*/
46
{3,4,2,0,1,5,6,7},
47
/*L P N M I G GM 4*/
48
{2,4,3,0,1,5,6,7},
49
/*L M P N I G GM 4*/
50
{3,4,1,0,2,5,6,7},
51
/*L M N P I G GM 4*/
52
{2,4,1,0,3,5,6,7},
53
/*No MV dominates.*/
54
/*N L P M I G GM 4*/
55
{0,4,3,1,2,5,6,7},
56
/*N G L P M I GM 4*/
57
{0,5,4,2,3,1,6,7},
58
/*Default ordering.*/
59
/*N I M L P G GM 4*/
60
{0,1,2,3,4,5,6,7}
61
};
62
63
64
65
/*Initialize the mode scheme chooser.
66
This need only be called once per encoder.*/
67
void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
68
int si;
69
_chooser->mode_ranks[0]=_chooser->scheme0_ranks;
70
for(si=1;si<8;si++)_chooser->mode_ranks[si]=OC_MODE_RANKS[si-1];
71
}
72
73
/*Reset the mode scheme chooser.
74
This needs to be called once for each frame, including the first.*/
75
static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
76
int si;
77
memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
78
/*Scheme 0 starts with 24 bits to store the mode list in.*/
79
_chooser->scheme_bits[0]=24;
80
memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
81
for(si=0;si<8;si++){
82
/*Scheme 7 should always start first, and scheme 0 should always start
83
last.*/
84
_chooser->scheme_list[si]=7-si;
85
_chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si;
86
}
87
}
88
89
/*Return the cost of coding _mb_mode in the specified scheme.*/
90
static int oc_mode_scheme_chooser_scheme_mb_cost(
91
const oc_mode_scheme_chooser *_chooser,int _scheme,int _mb_mode){
92
int codebook;
93
int ri;
94
codebook=_scheme+1>>3;
95
/*For any scheme except 0, we can just use the bit cost of the mode's rank
96
in that scheme.*/
97
ri=_chooser->mode_ranks[_scheme][_mb_mode];
98
if(_scheme==0){
99
int mc;
100
/*For scheme 0, incrementing the mode count could potentially change the
101
mode's rank.
102
Find the index where the mode would be moved to in the optimal list,
103
and use its bit cost instead of the one for the mode's current
104
position in the list.*/
105
/*We don't actually reorder the list; this is for computing opportunity
106
cost, not an update.*/
107
mc=_chooser->mode_counts[_mb_mode];
108
while(ri>0&&mc>=_chooser->mode_counts[_chooser->scheme0_list[ri-1]])ri--;
109
}
110
return OC_MODE_BITS[codebook][ri];
111
}
112
113
/*This is the real purpose of this data structure: not actually selecting a
114
mode scheme, but estimating the cost of coding a given mode given all the
115
modes selected so far.
116
This is done via opportunity cost: the cost is defined as the number of bits
117
required to encode all the modes selected so far including the current one
118
using the best possible scheme, minus the number of bits required to encode
119
all the modes selected so far not including the current one using the best
120
possible scheme.
121
The computational expense of doing this probably makes it overkill.
122
Just be happy we take a greedy approach instead of trying to solve the
123
global mode-selection problem (which is NP-hard).
124
_mb_mode: The mode to determine the cost of.
125
Return: The number of bits required to code this mode.*/
126
static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
127
int _mb_mode){
128
int scheme0;
129
int scheme1;
130
int best_bits;
131
int mode_bits;
132
int si;
133
int scheme0_bits;
134
int scheme1_bits;
135
scheme0=_chooser->scheme_list[0];
136
scheme1=_chooser->scheme_list[1];
137
scheme0_bits=_chooser->scheme_bits[scheme0];
138
scheme1_bits=_chooser->scheme_bits[scheme1];
139
mode_bits=oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme0,_mb_mode);
140
/*Typical case: If the difference between the best scheme and the next best
141
is greater than 6 bits, then adding just one mode cannot change which
142
scheme we use.*/
143
if(scheme1_bits-scheme0_bits>6)return mode_bits;
144
/*Otherwise, check to see if adding this mode selects a different scheme as
145
the best.*/
146
si=1;
147
best_bits=scheme0_bits+mode_bits;
148
do{
149
int cur_bits;
150
cur_bits=scheme1_bits+
151
oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme1,_mb_mode);
152
if(cur_bits<best_bits)best_bits=cur_bits;
153
if(++si>=8)break;
154
scheme1=_chooser->scheme_list[si];
155
scheme1_bits=_chooser->scheme_bits[scheme1];
156
}
157
while(scheme1_bits-scheme0_bits<=6);
158
return best_bits-scheme0_bits;
159
}
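/*In effect the value returned above is
min over schemes s of (scheme_bits[s]+bits_s(_mb_mode)) - scheme_bits[best],
and only schemes within 6 bits of the current best need to be examined, since
one mode's cost differs between schemes by at most the gap between the longest
and shortest codewords.*/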
160
161
/*Incrementally update the mode counts and per-scheme bit counts and re-order
162
the scheme lists once a mode has been selected.
163
_mb_mode: The mode that was chosen.*/
164
static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
165
int _mb_mode){
166
int ri;
167
int si;
168
_chooser->mode_counts[_mb_mode]++;
169
/*Re-order the scheme0 mode list if necessary.*/
170
for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0;ri--){
171
int pmode;
172
pmode=_chooser->scheme0_list[ri-1];
173
if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mb_mode])break;
174
/*Reorder the mode ranking.*/
175
_chooser->scheme0_ranks[pmode]++;
176
_chooser->scheme0_list[ri]=pmode;
177
}
178
_chooser->scheme0_ranks[_mb_mode]=ri;
179
_chooser->scheme0_list[ri]=_mb_mode;
180
/*Now add the bit cost for the mode to each scheme.*/
181
for(si=0;si<8;si++){
182
_chooser->scheme_bits[si]+=
183
OC_MODE_BITS[si+1>>3][_chooser->mode_ranks[si][_mb_mode]];
184
}
185
/*Finally, re-order the list of schemes.*/
186
for(si=1;si<8;si++){
187
int sj;
188
int scheme0;
189
int bits0;
190
sj=si;
191
scheme0=_chooser->scheme_list[si];
192
bits0=_chooser->scheme_bits[scheme0];
193
do{
194
int scheme1;
195
scheme1=_chooser->scheme_list[sj-1];
196
if(bits0>=_chooser->scheme_bits[scheme1])break;
197
_chooser->scheme_list[sj]=scheme1;
198
}
199
while(--sj>0);
200
_chooser->scheme_list[sj]=scheme0;
201
}
202
}
203
204
205
206
/*The number of bits required to encode a super block run.
207
_run_count: The desired run count; must be positive and less than 4130.*/
208
static int oc_sb_run_bits(int _run_count){
209
int i;
210
for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
211
return OC_SB_RUN_CODE_NBITS[i];
212
}
213
214
/*The number of bits required to encode a block run.
215
_run_count: The desired run count; must be positive and less than 30.*/
216
static int oc_block_run_bits(int _run_count){
217
return OC_BLOCK_RUN_CODE_NBITS[_run_count-1];
218
}
219
220
221
222
static void oc_fr_state_init(oc_fr_state *_fr){
223
_fr->bits=0;
224
_fr->sb_partial_count=0;
225
_fr->sb_full_count=0;
226
_fr->b_coded_count_prev=0;
227
_fr->b_coded_count=0;
228
_fr->b_count=0;
229
_fr->sb_prefer_partial=0;
230
_fr->sb_bits=0;
231
_fr->sb_partial=-1;
232
_fr->sb_full=-1;
233
_fr->b_coded_prev=-1;
234
_fr->b_coded=-1;
235
}
236
237
238
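/*Estimate the change in coded-block-flag bits from finishing the current
super block with the given partial/full flags: the cost of extending (or, if
the flag value changes or the run has hit the 4129 cap, restarting) the
partially-coded SB run, plus, for a non-partial SB, the same for the
fully-coded SB run.*/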
static int oc_fr_state_sb_cost(const oc_fr_state *_fr,
239
int _sb_partial,int _sb_full){
240
int bits;
241
int sb_partial_count;
242
int sb_full_count;
243
bits=0;
244
sb_partial_count=_fr->sb_partial_count;
245
/*Extend the sb_partial run, or start a new one.*/
246
if(_fr->sb_partial==_sb_partial){
247
if(sb_partial_count>=4129){
248
bits++;
249
sb_partial_count=0;
250
}
251
else bits-=oc_sb_run_bits(sb_partial_count);
252
}
253
else sb_partial_count=0;
254
bits+=oc_sb_run_bits(++sb_partial_count);
255
if(!_sb_partial){
256
/*Extend the sb_full run, or start a new one.*/
257
sb_full_count=_fr->sb_full_count;
258
if(_fr->sb_full==_sb_full){
259
if(sb_full_count>=4129){
260
bits++;
261
sb_full_count=0;
262
}
263
else bits-=oc_sb_run_bits(sb_full_count);
264
}
265
else sb_full_count=0;
266
bits+=oc_sb_run_bits(++sb_full_count);
267
}
268
return bits;
269
}
270
271
static void oc_fr_state_advance_sb(oc_fr_state *_fr,
272
int _sb_partial,int _sb_full){
273
int sb_partial_count;
274
int sb_full_count;
275
sb_partial_count=_fr->sb_partial_count;
276
if(_fr->sb_partial!=_sb_partial||sb_partial_count>=4129)sb_partial_count=0;
277
sb_partial_count++;
278
if(!_sb_partial){
279
sb_full_count=_fr->sb_full_count;
280
if(_fr->sb_full!=_sb_full||sb_full_count>=4129)sb_full_count=0;
281
sb_full_count++;
282
_fr->sb_full_count=sb_full_count;
283
_fr->sb_full=_sb_full;
284
/*Roll back the partial block state.*/
285
_fr->b_coded=_fr->b_coded_prev;
286
_fr->b_coded_count=_fr->b_coded_count_prev;
287
}
288
else{
289
/*Commit back the partial block state.*/
290
_fr->b_coded_prev=_fr->b_coded;
291
_fr->b_coded_count_prev=_fr->b_coded_count;
292
}
293
_fr->sb_partial_count=sb_partial_count;
294
_fr->sb_partial=_sb_partial;
295
_fr->b_count=0;
296
_fr->sb_prefer_partial=0;
297
_fr->sb_bits=0;
298
}
299
300
/*Commit the state of the current super block and advance to the next.*/
301
static void oc_fr_state_flush_sb(oc_fr_state *_fr){
302
int sb_partial;
303
int sb_full;
304
int b_coded_count;
305
int b_count;
306
b_count=_fr->b_count;
307
b_coded_count=_fr->b_coded_count;
308
sb_full=_fr->b_coded;
309
sb_partial=b_coded_count<b_count;
310
if(!sb_partial){
311
/*If the super block is fully coded/uncoded...*/
312
if(_fr->sb_prefer_partial){
313
/*So far coding this super block as partial was cheaper anyway.*/
314
if(b_coded_count>15||_fr->b_coded_prev<0){
315
int sb_bits;
316
/*If the block run is too long, this will limit how far it can be
317
extended into the next partial super block.
318
If we need to extend it farther, we don't want to have to roll all
319
the way back here (since there could be many full SBs between now
320
and then), so we disallow this.
321
Similarly, if this is the start of a stripe, we don't know the
length of the outstanding block run from the previous stripe.*/
323
sb_bits=oc_fr_state_sb_cost(_fr,sb_partial,sb_full);
324
_fr->bits+=sb_bits-_fr->sb_bits;
325
_fr->sb_bits=sb_bits;
326
}
327
else sb_partial=1;
328
}
329
}
330
oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
331
}
332
333
static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){
334
ptrdiff_t bits;
335
int sb_bits;
336
int b_coded_count;
337
int b_count;
338
int sb_prefer_partial;
339
sb_bits=_fr->sb_bits;
340
bits=_fr->bits-sb_bits;
341
b_count=_fr->b_count;
342
b_coded_count=_fr->b_coded_count;
343
sb_prefer_partial=_fr->sb_prefer_partial;
344
if(b_coded_count>=b_count){
345
int sb_partial_bits;
346
/*This super block is currently fully coded/uncoded.*/
347
if(b_count<=0){
348
/*This is the first block in this SB.*/
349
b_count=1;
350
/*Check to see whether it's cheaper to code it partially or fully.*/
351
if(_fr->b_coded==_b_coded){
352
sb_partial_bits=-oc_block_run_bits(b_coded_count);
353
sb_partial_bits+=oc_block_run_bits(++b_coded_count);
354
}
355
else{
356
b_coded_count=1;
357
sb_partial_bits=2;
358
}
359
sb_partial_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
360
sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
361
sb_prefer_partial=sb_partial_bits<sb_bits;
362
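/*Branchless select: equivalent to if(sb_prefer_partial)sb_bits=sb_partial_bits;*/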
sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
363
}
364
else if(_fr->b_coded==_b_coded){
365
b_coded_count++;
366
if(++b_count<16){
367
if(sb_prefer_partial){
368
/*Check to see if it's cheaper to code it fully.*/
369
sb_partial_bits=sb_bits;
370
sb_partial_bits+=oc_block_run_bits(b_coded_count);
371
if(b_coded_count>0){
372
sb_partial_bits-=oc_block_run_bits(b_coded_count-1);
373
}
374
sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
375
sb_prefer_partial=sb_partial_bits<sb_bits;
376
sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
377
}
378
/*There's no need to check the converse (whether it's cheaper to code
379
this SB partially if we were coding it fully), since the cost to
380
code a SB partially can only increase as we add more blocks, whereas
381
the cost to code it fully stays constant.*/
382
}
383
else{
384
/*If we get to the end and this SB is still full, then force it to be
385
coded full.
386
Otherwise we might not be able to extend the block run far enough
387
into the next partial SB.*/
388
if(sb_prefer_partial){
389
sb_prefer_partial=0;
390
sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
391
}
392
}
393
}
394
else{
395
/*This SB was full, but now must be made partial.*/
396
if(!sb_prefer_partial){
397
sb_bits=oc_block_run_bits(b_coded_count);
398
if(b_coded_count>b_count){
399
sb_bits-=oc_block_run_bits(b_coded_count-b_count);
400
}
401
sb_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
402
}
403
b_count++;
404
b_coded_count=1;
405
sb_prefer_partial=1;
406
sb_bits+=2;
407
}
408
}
409
else{
410
b_count++;
411
if(_fr->b_coded==_b_coded)sb_bits-=oc_block_run_bits(b_coded_count);
412
else b_coded_count=0;
413
sb_bits+=oc_block_run_bits(++b_coded_count);
414
}
415
_fr->bits=bits+sb_bits;
416
_fr->b_coded_count=b_coded_count;
417
_fr->b_coded=_b_coded;
418
_fr->b_count=b_count;
419
_fr->sb_prefer_partial=sb_prefer_partial;
420
_fr->sb_bits=sb_bits;
421
}
422
423
static void oc_fr_skip_block(oc_fr_state *_fr){
424
oc_fr_state_advance_block(_fr,0);
425
}
426
427
static void oc_fr_code_block(oc_fr_state *_fr){
428
oc_fr_state_advance_block(_fr,1);
429
}
430
431
static int oc_fr_cost1(const oc_fr_state *_fr){
432
oc_fr_state tmp;
433
ptrdiff_t bits;
434
*&tmp=*_fr;
435
oc_fr_skip_block(&tmp);
436
bits=tmp.bits;
437
*&tmp=*_fr;
438
oc_fr_code_block(&tmp);
439
return (int)(tmp.bits-bits);
440
}
441
442
static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
443
oc_fr_state tmp;
444
*&tmp=*_pre;
445
oc_fr_skip_block(&tmp);
446
oc_fr_skip_block(&tmp);
447
oc_fr_skip_block(&tmp);
448
oc_fr_skip_block(&tmp);
449
return (int)(_post->bits-tmp.bits);
450
}
451
452
453
454
static void oc_qii_state_init(oc_qii_state *_qs){
455
_qs->bits=0;
456
_qs->qi01_count=0;
457
_qs->qi01=-1;
458
_qs->qi12_count=0;
459
_qs->qi12=-1;
460
}
461
462
463
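/*Advance the quantizer-index coding state by one block.
This tracks the two flag runs used to code per-block qi indices: qi01
distinguishes index 0 from indices 1 and 2, and, for non-zero indices, qi12
distinguishes index 1 from index 2; each flag is coded with the same run
codes as the super block flags, with runs capped at 4129.*/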
static void oc_qii_state_advance(oc_qii_state *_qd,
464
const oc_qii_state *_qs,int _qii){
465
ptrdiff_t bits;
466
int qi01;
467
int qi01_count;
468
int qi12;
469
int qi12_count;
470
bits=_qs->bits;
471
qi01=_qii+1>>1;
472
qi01_count=_qs->qi01_count;
473
if(qi01==_qs->qi01){
474
if(qi01_count>=4129){
475
bits++;
476
qi01_count=0;
477
}
478
else bits-=oc_sb_run_bits(qi01_count);
479
}
480
else qi01_count=0;
481
qi01_count++;
482
bits+=oc_sb_run_bits(qi01_count);
483
qi12_count=_qs->qi12_count;
484
if(_qii){
485
qi12=_qii>>1;
486
if(qi12==_qs->qi12){
487
if(qi12_count>=4129){
488
bits++;
489
qi12_count=0;
490
}
491
else bits-=oc_sb_run_bits(qi12_count);
492
}
493
else qi12_count=0;
494
qi12_count++;
495
bits+=oc_sb_run_bits(qi12_count);
496
}
497
else qi12=_qs->qi12;
498
_qd->bits=bits;
499
_qd->qi01=qi01;
500
_qd->qi01_count=qi01_count;
501
_qd->qi12=qi12;
502
_qd->qi12_count=qi12_count;
503
}
504
505
506
507
static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
508
ptrdiff_t *coded_fragis;
509
unsigned mcu_nvsbs;
510
ptrdiff_t mcu_nfrags;
511
int flimit;
512
int hdec;
513
int vdec;
514
int pli;
515
int nqis;
516
int qii;
517
int qi0;
518
int qti;
519
/*Initialize the per-plane coded block flag trackers.
520
These are used for bit-estimation purposes only; the real flag bits span
521
all three planes, so we can't compute them in parallel.*/
522
for(pli=0;pli<3;pli++)oc_fr_state_init(_pipe->fr+pli);
523
for(pli=0;pli<3;pli++)oc_qii_state_init(_pipe->qs+pli);
524
/*Set up the per-plane skip SSD storage pointers.*/
525
mcu_nvsbs=_enc->mcu_nvsbs;
526
mcu_nfrags=mcu_nvsbs*_enc->state.fplanes[0].nhsbs*16;
527
hdec=!(_enc->state.info.pixel_fmt&1);
528
vdec=!(_enc->state.info.pixel_fmt&2);
529
_pipe->skip_ssd[0]=_enc->mcu_skip_ssd;
530
_pipe->skip_ssd[1]=_pipe->skip_ssd[0]+mcu_nfrags;
531
_pipe->skip_ssd[2]=_pipe->skip_ssd[1]+(mcu_nfrags>>hdec+vdec);
532
/*Set up per-plane pointers to the coded and uncoded fragment lists.
Unlike the decoder, each plane's coded and uncoded fragment list is kept
534
separate during the analysis stage; we only make the coded list for all
535
three planes contiguous right before the final packet is output
536
(destroying the uncoded lists, which are no longer needed).*/
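/*Within each plane's region, the coded list grows forward from the start of
the region while the uncoded list grows backward from its end, so the two
never collide.*/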
537
coded_fragis=_enc->state.coded_fragis;
538
for(pli=0;pli<3;pli++){
539
_pipe->coded_fragis[pli]=coded_fragis;
540
coded_fragis+=_enc->state.fplanes[pli].nfrags;
541
_pipe->uncoded_fragis[pli]=coded_fragis;
542
}
543
memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
544
memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
545
/*Set up condensed quantizer tables.*/
546
qi0=_enc->state.qis[0];
547
nqis=_enc->state.nqis;
548
for(pli=0;pli<3;pli++){
549
for(qii=0;qii<nqis;qii++){
550
int qi;
551
qi=_enc->state.qis[qii];
552
for(qti=0;qti<2;qti++){
553
/*Set the DC coefficient in the dequantization table.*/
554
_enc->state.dequant_tables[qi][pli][qti][0]=
555
_enc->dequant_dc[qi0][pli][qti];
556
_enc->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
557
/*Copy over the quantization table.*/
558
memcpy(_enc->enquant[pli][qii][qti],_enc->enquant_tables[qi][pli][qti],
559
_enc->opt_data.enquant_table_size);
560
}
561
}
562
}
563
/*Fix up the DC coefficients in the quantization tables.*/
564
oc_enc_enquant_table_fixup(_enc,_enc->enquant,nqis);
565
/*Initialize the tokenization state.*/
566
for(pli=0;pli<3;pli++){
567
_pipe->ndct_tokens1[pli]=0;
568
_pipe->eob_run1[pli]=0;
569
}
570
/*Initialize the bounding value array for the loop filter.*/
571
flimit=_enc->state.loop_filter_limits[_enc->state.qis[0]];
572
_pipe->loop_filter=flimit!=0;
573
if(flimit!=0)oc_loop_filter_init(&_enc->state,_pipe->bounding_values,flimit);
574
/*Clear the temporary DCT scratch space.*/
575
memset(_pipe->dct_data,0,sizeof(_pipe->dct_data));
576
}
577
578
/*Sets the current MCU stripe to super block row _sby.
579
Return: A non-zero value if this was the last MCU.*/
580
static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc,
581
oc_enc_pipeline_state *_pipe,int _sby){
582
const oc_fragment_plane *fplane;
583
unsigned mcu_nvsbs;
584
int sby_end;
585
int notdone;
586
int vdec;
587
int pli;
588
mcu_nvsbs=_enc->mcu_nvsbs;
589
sby_end=_enc->state.fplanes[0].nvsbs;
590
notdone=_sby+mcu_nvsbs<sby_end;
591
if(notdone)sby_end=_sby+mcu_nvsbs;
592
vdec=0;
593
for(pli=0;pli<3;pli++){
594
fplane=_enc->state.fplanes+pli;
595
_pipe->sbi0[pli]=fplane->sboffset+(_sby>>vdec)*fplane->nhsbs;
596
_pipe->fragy0[pli]=_sby<<2-vdec;
597
_pipe->froffset[pli]=fplane->froffset
598
+_pipe->fragy0[pli]*(ptrdiff_t)fplane->nhfrags;
599
if(notdone){
600
_pipe->sbi_end[pli]=fplane->sboffset+(sby_end>>vdec)*fplane->nhsbs;
601
_pipe->fragy_end[pli]=sby_end<<2-vdec;
602
}
603
else{
604
_pipe->sbi_end[pli]=fplane->sboffset+fplane->nsbs;
605
_pipe->fragy_end[pli]=fplane->nvfrags;
606
}
607
vdec=!(_enc->state.info.pixel_fmt&2);
608
}
609
return notdone;
610
}
611
612
static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
613
oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
614
/*Copy over all the uncoded fragments from this plane and advance the uncoded
615
fragment list.*/
616
if(_pipe->nuncoded_fragis[_pli]>0){
617
_pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
618
oc_frag_copy_list(&_enc->state,
619
_enc->state.ref_frame_data[OC_FRAME_SELF],
620
_enc->state.ref_frame_data[OC_FRAME_PREV],
621
_enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
622
_pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs);
623
_pipe->nuncoded_fragis[_pli]=0;
624
}
625
/*Perform DC prediction.*/
626
oc_enc_pred_dc_frag_rows(_enc,_pli,
627
_pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
628
/*Finish DC tokenization.*/
629
oc_enc_tokenize_dc_frag_list(_enc,_pli,
630
_pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli],
631
_pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]);
632
_pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1];
633
_pipe->eob_run1[_pli]=_enc->eob_run[_pli][1];
634
/*And advance the coded fragment list.*/
635
_enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
636
_pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
637
_pipe->ncoded_fragis[_pli]=0;
638
/*Apply the loop filter if necessary.*/
639
if(_pipe->loop_filter){
640
oc_state_loop_filter_frag_rows(&_enc->state,
641
_pipe->bounding_values,OC_FRAME_SELF,_pli,
642
_pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
643
}
644
else _sdelay=_edelay=0;
645
/*To fill borders, we have an additional two pixel delay, since a fragment
646
in the next row could filter its top edge, using two pixels from a
647
fragment in this row.
648
But there's no reason to delay a full fragment between the two.*/
649
oc_state_borders_fill_rows(&_enc->state,
650
_enc->state.ref_frame_idx[OC_FRAME_SELF],_pli,
651
(_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
652
(_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
653
}
654
655
656
657
/*Cost information about the coded blocks in a MB.*/
658
struct oc_rd_metric{
659
int uncoded_ac_ssd;
660
int coded_ac_ssd;
661
int ac_bits;
662
int dc_flag;
663
};
664
665
666
667
static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
668
oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
669
unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
670
oc_fr_state *_fr,oc_token_checkpoint **_stack){
671
ogg_int16_t *data;
672
ogg_int16_t *dct;
673
ogg_int16_t *idct;
674
oc_qii_state qs;
675
const ogg_uint16_t *dequant;
676
ogg_uint16_t dequant_dc;
677
ptrdiff_t frag_offs;
678
int ystride;
679
const unsigned char *src;
680
const unsigned char *ref;
681
unsigned char *dst;
682
int nonzero;
683
unsigned uncoded_ssd;
684
unsigned coded_ssd;
685
oc_token_checkpoint *checkpoint;
686
oc_fragment *frags;
687
int mb_mode;
688
int refi;
689
int mv_offs[2];
690
int nmv_offs;
691
int ac_bits;
692
int borderi;
693
int nqis;
694
int qti;
695
int qii;
696
int dc;
697
nqis=_enc->state.nqis;
698
frags=_enc->state.frags;
699
frag_offs=_enc->state.frag_buf_offs[_fragi];
700
ystride=_enc->state.ref_ystride[_pli];
701
src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
702
borderi=frags[_fragi].borderi;
703
qii=frags[_fragi].qii;
704
data=_enc->pipe.dct_data;
705
dct=data+64;
706
idct=data+128;
707
if(qii&~3){
708
#if !defined(OC_COLLECT_METRICS)
709
if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
710
/*Enable early skip detection.*/
711
frags[_fragi].coded=0;
712
frags[_fragi].refi=OC_FRAME_NONE;
713
oc_fr_skip_block(_fr);
714
return 0;
715
}
716
#endif
717
/*Try and code this block anyway.*/
718
qii&=3;
719
}
720
refi=frags[_fragi].refi;
721
mb_mode=frags[_fragi].mb_mode;
722
ref=_enc->state.ref_frame_data[refi]+frag_offs;
723
dst=_enc->state.ref_frame_data[OC_FRAME_SELF]+frag_offs;
724
/*Motion compensation:*/
725
switch(mb_mode){
726
case OC_MODE_INTRA:{
727
nmv_offs=0;
728
oc_enc_frag_sub_128(_enc,data,src,ystride);
729
}break;
730
case OC_MODE_GOLDEN_NOMV:
731
case OC_MODE_INTER_NOMV:{
732
nmv_offs=1;
733
mv_offs[0]=0;
734
oc_enc_frag_sub(_enc,data,src,ref,ystride);
735
}break;
736
default:{
737
const oc_mv *frag_mvs;
738
frag_mvs=_enc->state.frag_mvs;
739
nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,
740
_pli,frag_mvs[_fragi]);
741
if(nmv_offs>1){
742
oc_enc_frag_copy2(_enc,dst,
743
ref+mv_offs[0],ref+mv_offs[1],ystride);
744
oc_enc_frag_sub(_enc,data,src,dst,ystride);
745
}
746
else oc_enc_frag_sub(_enc,data,src,ref+mv_offs[0],ystride);
747
}break;
748
}
749
#if defined(OC_COLLECT_METRICS)
750
{
751
unsigned sad;
752
unsigned satd;
753
switch(nmv_offs){
754
case 0:{
755
sad=oc_enc_frag_intra_sad(_enc,src,ystride);
756
satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride);
757
}break;
758
case 1:{
759
sad=oc_enc_frag_sad_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
760
satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride);
761
satd+=abs(dc);
762
}break;
763
default:{
764
sad=oc_enc_frag_sad_thresh(_enc,src,dst,ystride,UINT_MAX);
765
satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride);
766
satd+=abs(dc);
767
}break;
768
}
769
_enc->frag_sad[_fragi]=sad;
770
_enc->frag_satd[_fragi]=satd;
771
}
772
#endif
773
/*Transform:*/
774
oc_enc_fdct8x8(_enc,dct,data);
775
/*Quantize:*/
776
qti=mb_mode!=OC_MODE_INTRA;
777
dequant=_enc->dequant[_pli][qii][qti];
778
nonzero=oc_enc_quantize(_enc,data,dct,dequant,_enc->enquant[_pli][qii][qti]);
779
dc=data[0];
780
/*Tokenize.*/
781
checkpoint=*_stack;
782
if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
783
ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct,
784
nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
785
}
786
else{
787
ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct,
788
nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
789
}
790
/*Reconstruct.
791
TODO: nonzero may need to be adjusted after tokenization.*/
792
dequant_dc=dequant[0];
793
if(nonzero==0){
794
ogg_int16_t p;
795
int ci;
796
int qi01;
797
int qi12;
798
/*We round this dequant product (and not any of the others) because there's
799
no iDCT rounding.*/
800
p=(ogg_int16_t)(dc*(ogg_int32_t)dequant_dc+15>>5);
801
/*LOOP VECTORIZES.*/
802
for(ci=0;ci<64;ci++)data[ci]=p;
803
/*We didn't code any AC coefficients, so don't change the quantizer.*/
804
qi01=_pipe->qs[_pli].qi01;
805
qi12=_pipe->qs[_pli].qi12;
806
if(qi01>0)qii=1+qi12;
807
else if(qi01>=0)qii=0;
808
}
809
else{
810
idct[0]=dc*dequant_dc;
811
/*Note: This clears idct[] back to zero for the next block.*/
812
oc_idct8x8(&_enc->state,data,idct,nonzero+1);
813
}
814
frags[_fragi].qii=qii;
815
if(nqis>1){
816
oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
817
ac_bits+=qs.bits-_pipe->qs[_pli].bits;
818
}
819
if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
820
else{
821
oc_enc_frag_recon_inter(_enc,dst,
822
nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
823
}
824
/*If _fr is NULL, then this is an INTRA frame, and we can't skip blocks.*/
825
#if !defined(OC_COLLECT_METRICS)
826
if(_fr!=NULL)
827
#endif
828
{
829
/*In retrospect, should we have skipped this block?*/
830
if(borderi<0){
831
coded_ssd=oc_enc_frag_ssd(_enc,src,dst,ystride);
832
}
833
else{
834
coded_ssd=oc_enc_frag_border_ssd(_enc,src,dst,ystride,
835
_enc->state.borders[borderi].mask);
836
}
837
/*Scale to match DCT domain.*/
838
coded_ssd<<=4;
839
#if defined(OC_COLLECT_METRICS)
840
_enc->frag_ssd[_fragi]=coded_ssd;
841
}
842
if(_fr!=NULL){
843
#endif
844
coded_ssd=OC_RD_SCALE(coded_ssd,_rd_scale);
845
uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]];
846
if(uncoded_ssd<UINT_MAX&&
847
/*Don't allow luma blocks to be skipped in 4MV mode when VP3 compatibility
848
is enabled.*/
849
(!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){
850
int overhead_bits;
851
overhead_bits=oc_fr_cost1(_fr);
852
/*Although the fragment coding overhead determination is accurate, it is
853
greedy, using very coarse-grained local information.
854
Allowing it to mildly discourage coding turns out to be beneficial, but
855
it's not clear that allowing it to encourage coding through negative
856
coding overhead deltas is useful.
857
For that reason, we disallow negative coding overheads.*/
858
if(overhead_bits<0)overhead_bits=0;
859
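/*Skip the block if doing so is no worse in rate-distortion terms: i.e., if
the SSD of simply repeating the reference is at most the coded SSD plus
lambda times the estimated flag and AC token bits.*/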
if(uncoded_ssd<=coded_ssd+(overhead_bits+ac_bits)*_enc->lambda){
860
/*Hm, not worth it; roll back.*/
861
oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
862
*_stack=checkpoint;
863
frags[_fragi].coded=0;
864
frags[_fragi].refi=OC_FRAME_NONE;
865
oc_fr_skip_block(_fr);
866
return 0;
867
}
868
}
869
else _mo->dc_flag=1;
870
_mo->uncoded_ac_ssd+=uncoded_ssd;
871
_mo->coded_ac_ssd+=coded_ssd;
872
_mo->ac_bits+=ac_bits;
873
oc_fr_code_block(_fr);
874
}
875
/*GCC 4.4.4 generates a warning here because it can't tell that
876
the init code in the nqis check above will run anytime this
877
line runs.*/
878
if(nqis>1)*(_pipe->qs+_pli)=*&qs;
879
frags[_fragi].dc=dc;
880
frags[_fragi].coded=1;
881
return 1;
882
}
883
884
static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc,
885
oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead,
886
const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
887
/*Worst case token stack usage for 4 fragments.*/
888
oc_token_checkpoint stack[64*4];
889
oc_token_checkpoint *stackptr;
890
const oc_sb_map *sb_maps;
891
signed char *mb_modes;
892
oc_fragment *frags;
893
ptrdiff_t *coded_fragis;
894
ptrdiff_t ncoded_fragis;
895
ptrdiff_t *uncoded_fragis;
896
ptrdiff_t nuncoded_fragis;
897
oc_rd_metric mo;
898
oc_fr_state fr_checkpoint;
899
oc_qii_state qs_checkpoint;
900
int mb_mode;
901
int refi;
902
int ncoded;
903
ptrdiff_t fragi;
904
int bi;
905
*&fr_checkpoint=*(_pipe->fr+0);
906
*&qs_checkpoint=*(_pipe->qs+0);
907
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
908
mb_modes=_enc->state.mb_modes;
909
frags=_enc->state.frags;
910
coded_fragis=_pipe->coded_fragis[0];
911
ncoded_fragis=_pipe->ncoded_fragis[0];
912
uncoded_fragis=_pipe->uncoded_fragis[0];
913
nuncoded_fragis=_pipe->nuncoded_fragis[0];
914
mb_mode=mb_modes[_mbi];
915
refi=OC_FRAME_FOR_MODE(mb_mode);
916
ncoded=0;
917
stackptr=stack;
918
memset(&mo,0,sizeof(mo));
919
for(bi=0;bi<4;bi++){
920
fragi=sb_maps[_mbi>>2][_mbi&3][bi];
921
frags[fragi].refi=refi;
922
frags[fragi].mb_mode=mb_mode;
923
if(oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
924
_rd_scale[bi],_rd_iscale[bi],&mo,_pipe->fr+0,&stackptr)){
925
coded_fragis[ncoded_fragis++]=fragi;
926
ncoded++;
927
}
928
else *(uncoded_fragis-++nuncoded_fragis)=fragi;
929
}
930
if(ncoded>0&&!mo.dc_flag){
931
int cost;
932
/*Some individual blocks were worth coding.
933
See if that's still true when accounting for mode and MV overhead.*/
934
cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
935
+oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead);
936
if(mo.uncoded_ac_ssd<=cost){
937
/*Taking macroblock overhead into account, it is not worth coding this
938
MB.*/
939
oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
940
*(_pipe->fr+0)=*&fr_checkpoint;
941
*(_pipe->qs+0)=*&qs_checkpoint;
942
for(bi=0;bi<4;bi++){
943
fragi=sb_maps[_mbi>>2][_mbi&3][bi];
944
if(frags[fragi].coded){
945
*(uncoded_fragis-++nuncoded_fragis)=fragi;
946
frags[fragi].coded=0;
947
frags[fragi].refi=OC_FRAME_NONE;
948
}
949
oc_fr_skip_block(_pipe->fr+0);
950
}
951
ncoded_fragis-=ncoded;
952
ncoded=0;
953
}
954
}
955
/*If no luma blocks coded, the mode is forced.*/
956
if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
957
/*Assume that a 1MV with a single coded block is always cheaper than a 4MV
958
with a single coded block.
959
This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
960
skipped blocks, while a 1MV does not.*/
961
else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){
962
mb_modes[_mbi]=OC_MODE_INTER_MV;
963
}
964
_pipe->ncoded_fragis[0]=ncoded_fragis;
965
_pipe->nuncoded_fragis[0]=nuncoded_fragis;
966
return ncoded;
967
}
968
969
static void oc_enc_sb_transform_quantize_inter_chroma(oc_enc_ctx *_enc,
970
oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
971
const ogg_uint16_t *mcu_rd_scale;
972
const ogg_uint16_t *mcu_rd_iscale;
973
const oc_sb_map *sb_maps;
974
oc_sb_flags *sb_flags;
975
oc_fr_state *fr;
976
ptrdiff_t *coded_fragis;
977
ptrdiff_t ncoded_fragis;
978
ptrdiff_t *uncoded_fragis;
979
ptrdiff_t nuncoded_fragis;
980
ptrdiff_t froffset;
981
int sbi;
982
fr=_pipe->fr+_pli;
983
mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
984
mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
985
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
986
sb_flags=_enc->state.sb_flags;
987
coded_fragis=_pipe->coded_fragis[_pli];
988
ncoded_fragis=_pipe->ncoded_fragis[_pli];
989
uncoded_fragis=_pipe->uncoded_fragis[_pli];
990
nuncoded_fragis=_pipe->nuncoded_fragis[_pli];
991
froffset=_pipe->froffset[_pli];
992
for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
993
/*Worst case token stack usage for 1 fragment.*/
994
oc_token_checkpoint stack[64];
995
oc_rd_metric mo;
996
int quadi;
997
int bi;
998
memset(&mo,0,sizeof(mo));
999
for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
1000
ptrdiff_t fragi;
1001
fragi=sb_maps[sbi][quadi][bi];
1002
if(fragi>=0){
1003
oc_token_checkpoint *stackptr;
1004
unsigned rd_scale;
1005
unsigned rd_iscale;
1006
rd_scale=mcu_rd_scale[fragi-froffset];
1007
rd_iscale=mcu_rd_iscale[fragi-froffset];
1008
stackptr=stack;
1009
if(oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
1010
rd_scale,rd_iscale,&mo,fr,&stackptr)){
1011
coded_fragis[ncoded_fragis++]=fragi;
1012
}
1013
else *(uncoded_fragis-++nuncoded_fragis)=fragi;
1014
}
1015
}
1016
oc_fr_state_flush_sb(fr);
1017
sb_flags[sbi].coded_fully=fr->sb_full;
1018
sb_flags[sbi].coded_partially=fr->sb_partial;
1019
}
1020
_pipe->ncoded_fragis[_pli]=ncoded_fragis;
1021
_pipe->nuncoded_fragis[_pli]=nuncoded_fragis;
1022
}
1023
1024
/*Mode decision is done by exhaustively examining all potential choices.
1025
Obviously, doing the motion compensation, fDCT, tokenization, and then
1026
counting the bits each token uses is computationally expensive.
1027
Theora's EOB runs can also split the cost of these tokens across multiple
1028
fragments, and naturally we don't know what the optimal choice of Huffman
1029
codes will be until we know all the tokens we're going to encode in all the
1030
fragments.
1031
So we use a simple approach to estimating the bit cost and distortion of each
1032
mode based upon the SATD value of the residual before coding.
1033
The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
1034
the process (modified somewhat from that of the paper) is very simple.
1035
We build a non-linear regression of the mappings from
1036
(pre-transform+quantization) SATD to (post-transform+quantization) bits and
1037
SSD for each qi.
1038
A separate set of mappings is kept for each quantization type and color
1039
plane.
1040
The mappings are constructed by partitioning the SATD values into a small
1041
number of bins (currently 24) and using a linear regression in each bin
1042
(as opposed to the 0th-order regression used by Kim).
1043
The bit counts and SSD measurements are obtained by examining actual encoded
1044
frames, with appropriate lambda values and optimal Huffman codes selected.
1045
EOB bits are assigned to the fragment that started the EOB run (as opposed to
1046
dividing them among all the blocks in the run; the latter approach seems
1047
more theoretically correct, but Monty's testing showed a small improvement
1048
with the former, though that may have been merely statistical noise).
1049
1050
@ARTICLE{Kim03,
1051
author="Hyun Mun Kim",
1052
title="Adaptive Rate Control Using Nonlinear Regression",
1053
journal="IEEE Transactions on Circuits and Systems for Video Technology",
1054
volume=13,
1055
number=5,
1056
pages="432--439",
1057
month=May,
1058
year=2003
1059
}*/
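/*Concretely, given a block's SATD, oc_dct_cost2() below finds the bin
containing it and interpolates linearly between that bin's (rate,rmse) entry
and the next one's:
rate ~ y0+dy*dx>>shift, rmse ~ z0+dz*dx>>shift,
where dx is the SATD offset within the bin and shift is the bin width in
bits; the SSD estimate is then the square of the interpolated RMSE.*/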
1060
1061
/*Computes (_ssd+_lambda*_rate)/(1<<OC_BIT_SCALE) with rounding, avoiding
1062
overflow for large lambda values.*/
1063
#define OC_MODE_RD_COST(_ssd,_rate,_lambda) \
1064
((_ssd)>>OC_BIT_SCALE)+((_rate)>>OC_BIT_SCALE)*(_lambda) \
1065
+(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \
1066
+((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE)
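/*The macro splits _ssd and _rate into their high parts (above OC_BIT_SCALE
bits) and low parts, applies _lambda to each part separately, and only shifts
the low-order contribution down with rounding, so the full product
_rate*_lambda is never formed at full precision.*/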
1067
1068
static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){
1069
#if !defined(OC_COLLECT_METRICS)
1070
const
1071
#endif
1072
oc_mode_rd (*oc_mode_rd_table)[3][2][OC_COMP_BINS]=
1073
_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_MODE_RD_SATD:OC_MODE_RD_SAD;
1074
int qii;
1075
#if defined(OC_COLLECT_METRICS)
1076
oc_enc_mode_metrics_load(_enc);
1077
#endif
1078
for(qii=0;qii<_enc->state.nqis;qii++){
1079
int qi;
1080
int pli;
1081
qi=_enc->state.qis[qii];
1082
for(pli=0;pli<3;pli++){
1083
int qti;
1084
for(qti=0;qti<2;qti++){
1085
int log_plq;
1086
int modeline;
1087
int bin;
1088
int dx;
1089
int dq;
1090
log_plq=_enc->log_plq[qi][pli][qti];
1091
/*Find the pair of rows in the mode table that bracket this quantizer.
1092
If it falls outside the range the table covers, then we just use a
1093
pair on the edge for linear extrapolation.*/
1094
for(modeline=0;modeline<OC_LOGQ_BINS-1&&
1095
OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++);
1096
/*Interpolate a row for this quantizer.*/
1097
dx=OC_MODE_LOGQ[modeline][pli][qti]-log_plq;
1098
dq=OC_MODE_LOGQ[modeline][pli][qti]-OC_MODE_LOGQ[modeline+1][pli][qti];
1099
if(dq==0)dq=1;
1100
for(bin=0;bin<OC_COMP_BINS;bin++){
1101
int y0;
1102
int z0;
1103
int dy;
1104
int dz;
1105
y0=oc_mode_rd_table[modeline][pli][qti][bin].rate;
1106
z0=oc_mode_rd_table[modeline][pli][qti][bin].rmse;
1107
dy=oc_mode_rd_table[modeline+1][pli][qti][bin].rate-y0;
1108
dz=oc_mode_rd_table[modeline+1][pli][qti][bin].rmse-z0;
1109
_enc->mode_rd[qii][pli][qti][bin].rate=
1110
(ogg_int16_t)OC_CLAMPI(-32768,y0+(dy*dx+(dq>>1))/dq,32767);
1111
_enc->mode_rd[qii][pli][qti][bin].rmse=
1112
(ogg_int16_t)OC_CLAMPI(-32768,z0+(dz*dx+(dq>>1))/dq,32767);
1113
}
1114
}
1115
}
1116
}
1117
}
1118
1119
/*Estimate the R-D cost of the DCT coefficients given the SATD of a block after
1120
prediction.*/
1121
static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd,
1122
int _qii,int _pli,int _qti,int _satd){
1123
unsigned rmse;
1124
int shift;
1125
int bin;
1126
int dx;
1127
int y0;
1128
int z0;
1129
int dy;
1130
int dz;
1131
/*SATD metrics for chroma planes vary much less than luma, so we scale them
1132
by 4 to distribute them into the mode decision bins more evenly.*/
1133
_satd<<=_pli+1&2;
1134
shift=_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_SATD_SHIFT:OC_SAD_SHIFT;
1135
bin=OC_MINI(_satd>>shift,OC_COMP_BINS-2);
1136
dx=_satd-(bin<<shift);
1137
y0=_enc->mode_rd[_qii][_pli][_qti][bin].rate;
1138
z0=_enc->mode_rd[_qii][_pli][_qti][bin].rmse;
1139
dy=_enc->mode_rd[_qii][_pli][_qti][bin+1].rate-y0;
1140
dz=_enc->mode_rd[_qii][_pli][_qti][bin+1].rmse-z0;
1141
rmse=OC_MAXI(z0+(dz*dx>>shift),0);
1142
*_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
1143
return OC_MAXI(y0+(dy*dx>>shift),0);
1144
}
1145
1146
/*activity_avg must be positive, or flat regions could get a zero weight, which
1147
confounds analysis.
1148
We set the minimum to this value so that it also avoids the need for divide
1149
by zero checks in oc_mb_masking().*/
1150
# define OC_ACTIVITY_AVG_MIN (1<<OC_RD_SCALE_BITS)
1151
1152
static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi,
1153
unsigned _activity[4]){
1154
const unsigned char *src;
1155
const ptrdiff_t *frag_buf_offs;
1156
const ptrdiff_t *sb_map;
1157
unsigned luma;
1158
int ystride;
1159
ptrdiff_t frag_offs;
1160
ptrdiff_t fragi;
1161
int bi;
1162
frag_buf_offs=_enc->state.frag_buf_offs;
1163
sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1164
src=_enc->state.ref_frame_data[OC_FRAME_IO];
1165
ystride=_enc->state.ref_ystride[0];
1166
luma=0;
1167
for(bi=0;bi<4;bi++){
1168
const unsigned char *s;
1169
unsigned x;
1170
unsigned x2;
1171
unsigned act;
1172
int i;
1173
int j;
1174
fragi=sb_map[bi];
1175
frag_offs=frag_buf_offs[fragi];
1176
/*TODO: This could be replaced with SATD^2, since we already have to
1177
compute SATD.*/
1178
x=x2=0;
1179
s=src+frag_offs;
1180
for(i=0;i<8;i++){
1181
for(j=0;j<8;j++){
1182
unsigned c;
1183
c=s[j];
1184
x+=c;
1185
x2+=c*c;
1186
}
1187
s+=ystride;
1188
}
1189
luma+=x;
1190
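/*Since x is the sum of the 64 pixel values and x2 the sum of their squares,
(x2<<6)-x*x equals 64*sum((c-mean)^2), i.e., 64^2 times the block variance.*/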
act=(x2<<6)-x*x;
1191
if(act<8<<12){
1192
/*The region is flat.*/
1193
act=OC_MINI(act,5<<12);
1194
}
1195
else{
1196
unsigned e1;
1197
unsigned e2;
1198
unsigned e3;
1199
unsigned e4;
1200
/*Test for an edge.
1201
TODO: There are probably much simpler ways to do this (e.g., it could
1202
probably be combined with the SATD calculation).
1203
Alternatively, we could split the block around the mean and compute the
1204
reduction in variance in each half.
1205
For a Gaussian source the reduction should be
1206
(1-2/pi) ~= 0.36338022763241865692446494650994.
1207
Significantly more reduction is a good indication of a bi-level image.
1208
This has the advantage of identifying, in addition to straight edges,
1209
small text regions, which would otherwise be classified as "texture".*/
1210
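/*e1..e4 accumulate the absolute responses of four Sobel-style 3x3 gradient
kernels (horizontal, vertical, and the two diagonals) over the block.*/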
e1=e2=e3=e4=0;
1211
s=src+frag_offs-1;
1212
for(i=0;i<8;i++){
1213
for(j=0;j<8;j++){
1214
e1+=abs((s[j+2]-s[j]<<1)+(s-ystride)[j+2]-(s-ystride)[j]
1215
+(s+ystride)[j+2]-(s+ystride)[j]);
1216
e2+=abs(((s+ystride)[j+1]-(s-ystride)[j+1]<<1)
1217
+(s+ystride)[j]-(s-ystride)[j]+(s+ystride)[j+2]-(s-ystride)[j+2]);
1218
e3+=abs(((s+ystride)[j+2]-(s-ystride)[j]<<1)
1219
+(s+ystride)[j+1]-s[j]+s[j+2]-(s-ystride)[j+1]);
1220
e4+=abs(((s+ystride)[j]-(s-ystride)[j+2]<<1)
1221
+(s+ystride)[j+1]-s[j+2]+s[j]-(s-ystride)[j+1]);
1222
}
1223
s+=ystride;
1224
}
1225
/*If the largest component of the edge energy is at least 40% of the
1226
total, then classify the block as an edge block.*/
1227
if(5*OC_MAXI(OC_MAXI(e1,e2),OC_MAXI(e3,e4))>2*(e1+e2+e3+e4)){
1228
/*act=act_th*(act/act_th)**0.7
1229
=exp(log(act_th)+0.7*(log(act)-log(act_th))).
1230
Here act_th=5.0 and 0x394A=oc_blog32_q10(5<<12).*/
1231
act=oc_bexp32_q10(0x394A+(7*(oc_blog32_q10(act)-0x394A+5)/10));
1232
}
1233
}
1234
_activity[bi]=act;
1235
}
1236
return luma;
1237
}
1238
1239
static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
1240
unsigned _activity[4],const unsigned _intra_satd[12]){
1241
int bi;
1242
for(bi=0;bi<4;bi++){
1243
unsigned act;
1244
act=(11*_intra_satd[bi]>>8)*_intra_satd[bi];
1245
if(act<8<<12){
1246
/*The region is flat.*/
1247
act=OC_MINI(act,5<<12);
1248
}
1249
_activity[bi]=act;
1250
}
1251
}
1252
1253
/*Compute the masking scales for the blocks in a macro block.
1254
All masking is computed from the luma blocks.
1255
We derive scaling factors for the chroma blocks from these, and use the same
1256
ones for all chroma blocks, regardless of the subsampling.
1257
It's possible for luma to be perfectly flat and yet have high chroma energy,
1258
but this is unlikely in non-artificial images, and not a case that has been
1259
addressed by any research to my knowledge.
1260
The output of the masking process is two scale factors, which are fed into
1261
the various R-D optimizations.
1262
The first, rd_scale, is applied to D in the equation
1263
D*rd_scale+lambda*R.
1264
This is the form that must be used to properly combine scores from multiple
1265
blocks, and can be interpreted as scaling distortions by their visibility.
1266
The inverse, rd_iscale, is applied to lambda in the equation
1267
D+rd_iscale*lambda*R.
1268
This is equivalent to the first form within a single block, but much faster
1269
to use when evaluating many possible distortions (e.g., during actual
1270
quantization, where separate distortions are evaluated for every
1271
coefficient).
1272
The two macros OC_RD_SCALE(rd_scale,d) and OC_RD_ISCALE(rd_iscale,lambda) are
1273
used to perform the multiplications with the proper re-scaling for the range
1274
of the scaling factors.
1275
Many researchers apply masking values directly to the quantizers used, and
1276
not to the R-D cost.
1277
Since we generally use MSE for D, rd_scale must use the square of their
1278
values to generate an equivalent effect.*/
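/*With luminance masking disabled below, the per-block scale computed here
reduces to roughly rd_scale ~ (act+4*act_avg)/(4*act+act_avg) (and rd_iscale
to its reciprocal), so distortion in blocks much busier than average is
discounted by up to a factor of 4 and distortion in unusually flat blocks is
inflated by up to the same factor.*/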
1279
static unsigned oc_mb_masking(unsigned _rd_scale[5],unsigned _rd_iscale[5],
1280
const ogg_uint16_t _chroma_rd_scale[2],const unsigned _activity[4],
1281
unsigned _activity_avg,unsigned _luma,unsigned _luma_avg){
1282
unsigned activity_sum;
1283
unsigned la;
1284
unsigned lb;
1285
unsigned d;
1286
int bi;
1287
int bi_min;
1288
int bi_min2;
1289
/*The ratio lb/la is meant to approximate
1290
((((_luma-16)/219)*(255/128))**0.649**0.4**2), which is the
1291
effective luminance masking from~\cite{LKW06} (including the self-masking
1292
deflator).
1293
The following actually turns out to be a pretty good approximation for
1294
_luma>75 or so.
1295
For smaller values luminance does not really follow Weber's Law anyway, and
1296
this approximation gives a much less aggressive bitrate boost in this
1297
region.
1298
Though some researchers claim that contrast sensitivity actually decreases
1299
for very low luminance values, in my experience excessive brightness on
1300
LCDs or buggy color conversions (e.g., treating Y' as full-range instead
1301
of the CCIR 601 range) make artifacts in such regions extremely visible.
1302
We substitute _luma_avg for 128 to allow the strength of the masking to
1303
vary with the actual average image luminance, within certain limits (the
1304
caller has clamped _luma_avg to the range [90,160], inclusive).
1305
@ARTICLE{LKW06,
1306
author="Zhen Liu and Lina J. Karam and Andrew B. Watson",
1307
title="{JPEG2000} Encoding With Perceptual Distortion Control",
1308
journal="{IEEE} Transactions on Image Processing",
1309
volume=15,
1310
number=7,
1311
pages="1763--1778",
1312
month=Jul,
1313
year=2006
1314
}*/
1315
#if 0
1316
la=_luma+4*_luma_avg;
1317
lb=4*_luma+_luma_avg;
1318
#else
1319
/*Disable luminance masking.*/
1320
la=lb=1;
1321
#endif
1322
activity_sum=0;
1323
for(bi=0;bi<4;bi++){
1324
unsigned a;
1325
unsigned b;
1326
activity_sum+=_activity[bi];
1327
/*Apply activity masking.*/
1328
a=_activity[bi]+4*_activity_avg;
1329
b=4*_activity[bi]+_activity_avg;
1330
d=OC_RD_SCALE(b,1);
1331
/*And luminance masking.*/
1332
d=(a+(d>>1))/d;
1333
_rd_scale[bi]=(d*la+(lb>>1))/lb;
1334
/*And now the inverse.*/
1335
d=OC_MAXI(OC_RD_ISCALE(a,1),1);
1336
d=(b+(d>>1))/d;
1337
_rd_iscale[bi]=(d*lb+(la>>1))/la;
1338
}
1339
/*Now compute scaling factors for chroma blocks.
1340
We start by finding the two smallest iscales from the luma blocks.*/
1341
bi_min=_rd_iscale[1]<_rd_iscale[0];
1342
bi_min2=1-bi_min;
1343
for(bi=2;bi<4;bi++){
1344
if(_rd_iscale[bi]<_rd_iscale[bi_min]){
1345
bi_min2=bi_min;
1346
bi_min=bi;
1347
}
1348
else if(_rd_iscale[bi]<_rd_iscale[bi_min2])bi_min2=bi;
1349
}
1350
/*If the minimum iscale is less than 1.0, use the second smallest instead,
1351
and force the value to at least 1.0 (inflating chroma is a waste).*/
1352
if(_rd_iscale[bi_min]<(1<<OC_RD_ISCALE_BITS))bi_min=bi_min2;
1353
d=OC_MINI(_rd_scale[bi_min],1<<OC_RD_SCALE_BITS);
1354
_rd_scale[4]=OC_RD_SCALE(d,_chroma_rd_scale[0]);
1355
d=OC_MAXI(_rd_iscale[bi_min],1<<OC_RD_ISCALE_BITS);
1356
_rd_iscale[4]=OC_RD_ISCALE(d,_chroma_rd_scale[1]);
1357
return activity_sum;
1358
}
1359
1360
static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
1361
unsigned _frag_satd[12]){
1362
const unsigned char *src;
1363
const ptrdiff_t *frag_buf_offs;
1364
const ptrdiff_t *sb_map;
1365
const oc_mb_map_plane *mb_map;
1366
const unsigned char *map_idxs;
1367
int map_nidxs;
1368
int mapii;
1369
int mapi;
1370
int ystride;
1371
int pli;
1372
int bi;
1373
ptrdiff_t fragi;
1374
ptrdiff_t frag_offs;
1375
unsigned luma;
1376
int dc;
1377
frag_buf_offs=_enc->state.frag_buf_offs;
1378
sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1379
src=_enc->state.ref_frame_data[OC_FRAME_IO];
1380
ystride=_enc->state.ref_ystride[0];
1381
luma=0;
1382
for(bi=0;bi<4;bi++){
1383
fragi=sb_map[bi];
1384
frag_offs=frag_buf_offs[fragi];
1385
_frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1386
luma+=dc;
1387
}
1388
mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
1389
map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1390
map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1391
/*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
1392
ystride=_enc->state.ref_ystride[1];
1393
for(mapii=4;mapii<map_nidxs;mapii++){
1394
mapi=map_idxs[mapii];
1395
pli=mapi>>2;
1396
bi=mapi&3;
1397
fragi=mb_map[pli][bi];
1398
frag_offs=frag_buf_offs[fragi];
1399
_frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1400
}
1401
return luma;
1402
}
1403
1404
/*Select luma block-level quantizers for a MB in an INTRA frame.*/
1405
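/*This is a small dynamic program: for each of the 4 luma blocks, in coding
order, and each candidate qii we keep the cheapest total cost of the blocks
so far given that choice, then pick the best final state and walk the prev[]
links backward to assign each block's qii.*/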
static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
1406
const oc_qii_state *_qs,unsigned _mbi,const unsigned _rd_scale[4]){
1407
const unsigned char *src;
1408
const ptrdiff_t *frag_buf_offs;
1409
const oc_sb_map *sb_maps;
1410
oc_fragment *frags;
1411
ptrdiff_t frag_offs;
1412
ptrdiff_t fragi;
1413
oc_qii_state qs[4][3];
1414
unsigned cost[4][3];
1415
unsigned ssd[4][3];
1416
unsigned rate[4][3];
1417
int prev[3][3];
1418
unsigned satd;
1419
int dc;
1420
unsigned best_cost;
1421
unsigned best_ssd;
1422
unsigned best_rate;
1423
int best_qii;
1424
int qii;
1425
int lambda;
1426
int ystride;
1427
int nqis;
1428
int bi;
1429
frag_buf_offs=_enc->state.frag_buf_offs;
1430
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1431
src=_enc->state.ref_frame_data[OC_FRAME_IO];
1432
ystride=_enc->state.ref_ystride[0];
1433
fragi=sb_maps[_mbi>>2][_mbi&3][0];
1434
frag_offs=frag_buf_offs[fragi];
1435
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1436
satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1437
}
1438
else{
1439
satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1440
}
1441
nqis=_enc->state.nqis;
1442
lambda=_enc->lambda;
1443
for(qii=0;qii<nqis;qii++){
1444
oc_qii_state_advance(qs[0]+qii,_qs,qii);
1445
rate[0][qii]=oc_dct_cost2(_enc,ssd[0]+qii,qii,0,0,satd)
1446
+(qs[0][qii].bits-_qs->bits<<OC_BIT_SCALE);
1447
ssd[0][qii]=OC_RD_SCALE(ssd[0][qii],_rd_scale[0]);
1448
cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda);
1449
}
1450
for(bi=1;bi<4;bi++){
1451
fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1452
frag_offs=frag_buf_offs[fragi];
1453
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1454
satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1455
}
1456
else{
1457
satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1458
}
1459
for(qii=0;qii<nqis;qii++){
1460
oc_qii_state qt[3];
1461
unsigned cur_ssd;
1462
unsigned cur_rate;
1463
int best_qij;
1464
int qij;
1465
oc_qii_state_advance(qt+0,qs[bi-1]+0,qii);
1466
cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,0,satd);
1467
cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
1468
best_ssd=ssd[bi-1][0]+cur_ssd;
1469
best_rate=rate[bi-1][0]+cur_rate
1470
+(qt[0].bits-qs[bi-1][0].bits<<OC_BIT_SCALE);
1471
best_cost=OC_MODE_RD_COST(best_ssd,best_rate,lambda);
1472
best_qij=0;
1473
for(qij=1;qij<nqis;qij++){
1474
unsigned chain_ssd;
1475
unsigned chain_rate;
1476
unsigned chain_cost;
1477
oc_qii_state_advance(qt+qij,qs[bi-1]+qij,qii);
1478
chain_ssd=ssd[bi-1][qij]+cur_ssd;
1479
chain_rate=rate[bi-1][qij]+cur_rate
1480
+(qt[qij].bits-qs[bi-1][qij].bits<<OC_BIT_SCALE);
1481
chain_cost=OC_MODE_RD_COST(chain_ssd,chain_rate,lambda);
1482
if(chain_cost<best_cost){
1483
best_cost=chain_cost;
1484
best_ssd=chain_ssd;
1485
best_rate=chain_rate;
1486
best_qij=qij;
1487
}
1488
}
1489
*(qs[bi]+qii)=*(qt+best_qij);
1490
cost[bi][qii]=best_cost;
1491
ssd[bi][qii]=best_ssd;
1492
rate[bi][qii]=best_rate;
1493
prev[bi-1][qii]=best_qij;
1494
}
1495
}
1496
best_qii=0;
1497
best_cost=cost[3][0];
1498
for(qii=1;qii<nqis;qii++){
1499
if(cost[3][qii]<best_cost){
1500
best_cost=cost[3][qii];
1501
best_qii=qii;
1502
}
1503
}
1504
frags=_enc->state.frags;
1505
for(bi=3;;){
1506
fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1507
frags[fragi].qii=best_qii;
1508
if(bi--<=0)break;
1509
best_qii=prev[bi][best_qii];
1510
}
1511
return best_cost;
1512
}
1513
1514
/*Select a block-level quantizer for a single chroma block in an INTRA frame.*/
1515
static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
1516
const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi,unsigned _rd_scale){
1517
const unsigned char *src;
1518
oc_fragment *frags;
1519
ptrdiff_t frag_offs;
1520
oc_qii_state qt[3];
1521
unsigned cost[3];
1522
unsigned satd;
1523
int dc;
1524
unsigned best_cost;
1525
int best_qii;
1526
int qii;
1527
int lambda;
1528
int ystride;
1529
int nqis;
1530
src=_enc->state.ref_frame_data[OC_FRAME_IO];
1531
ystride=_enc->state.ref_ystride[_pli];
1532
frag_offs=_enc->state.frag_buf_offs[_fragi];
1533
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1534
satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1535
}
1536
else{
1537
satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1538
}
1539
/*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
1540
worth spending the bits to change the AC quantizer.
1541
TODO: This may be worth revisiting when we separate out DC and AC
1542
predictions from SATD.*/
1543
#if 0
1544
nqis=_enc->state.nqis;
1545
#else
1546
nqis=1;
1547
#endif
1548
lambda=_enc->lambda;
1549
best_qii=0;
1550
for(qii=0;qii<nqis;qii++){
1551
unsigned cur_rate;
1552
unsigned cur_ssd;
1553
oc_qii_state_advance(qt+qii,_qs,qii);
1554
cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,_pli,0,satd)
1555
+(qt[qii].bits-_qs->bits<<OC_BIT_SCALE);
1556
cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
1557
cost[qii]=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda);
1558
}
1559
best_cost=cost[0];
1560
for(qii=1;qii<nqis;qii++){
1561
if(cost[qii]<best_cost){
1562
best_cost=cost[qii];
1563
best_qii=qii;
1564
}
1565
}
1566
frags=_enc->state.frags;
1567
frags[_fragi].qii=best_qii;
1568
return best_cost;
1569
}
1570
1571
static void oc_enc_mb_transform_quantize_intra_luma(oc_enc_ctx *_enc,
1572
oc_enc_pipeline_state *_pipe,unsigned _mbi,
1573
const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
1574
/*Worst case token stack usage for 4 fragments.*/
1575
oc_token_checkpoint stack[64*4];
1576
oc_token_checkpoint *stackptr;
1577
const oc_sb_map *sb_maps;
1578
oc_fragment *frags;
1579
ptrdiff_t *coded_fragis;
1580
ptrdiff_t ncoded_fragis;
1581
ptrdiff_t fragi;
1582
int bi;
1583
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1584
frags=_enc->state.frags;
1585
coded_fragis=_pipe->coded_fragis[0];
1586
ncoded_fragis=_pipe->ncoded_fragis[0];
1587
stackptr=stack;
1588
for(bi=0;bi<4;bi++){
1589
fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1590
frags[fragi].refi=OC_FRAME_SELF;
1591
frags[fragi].mb_mode=OC_MODE_INTRA;
1592
oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
1593
_rd_scale[bi],_rd_iscale[bi],NULL,NULL,&stackptr);
1594
coded_fragis[ncoded_fragis++]=fragi;
1595
}
1596
_pipe->ncoded_fragis[0]=ncoded_fragis;
1597
}
1598
1599
static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
1600
oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
1601
const ogg_uint16_t *mcu_rd_scale;
1602
const ogg_uint16_t *mcu_rd_iscale;
1603
const oc_sb_map *sb_maps;
1604
ptrdiff_t *coded_fragis;
1605
ptrdiff_t ncoded_fragis;
1606
ptrdiff_t froffset;
1607
int sbi;
1608
mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
1609
mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
1610
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1611
coded_fragis=_pipe->coded_fragis[_pli];
1612
ncoded_fragis=_pipe->ncoded_fragis[_pli];
1613
froffset=_pipe->froffset[_pli];
1614
for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
1615
/*Worst case token stack usage for 1 fragment.*/
1616
oc_token_checkpoint stack[64];
1617
int quadi;
1618
int bi;
1619
for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
1620
ptrdiff_t fragi;
1621
fragi=sb_maps[sbi][quadi][bi];
1622
if(fragi>=0){
1623
oc_token_checkpoint *stackptr;
1624
unsigned rd_scale;
1625
unsigned rd_iscale;
1626
rd_scale=mcu_rd_scale[fragi-froffset];
1627
rd_iscale=mcu_rd_iscale[fragi-froffset];
1628
oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi,rd_scale);
1629
stackptr=stack;
1630
oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
1631
rd_scale,rd_iscale,NULL,NULL,&stackptr);
1632
coded_fragis[ncoded_fragis++]=fragi;
1633
}
1634
}
1635
}
1636
_pipe->ncoded_fragis[_pli]=ncoded_fragis;
1637
}
1638
1639
/*Analysis stage for an INTRA frame.*/
1640
void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
1641
ogg_int64_t activity_sum;
1642
ogg_int64_t luma_sum;
1643
unsigned activity_avg;
1644
unsigned luma_avg;
1645
const ogg_uint16_t *chroma_rd_scale;
1646
ogg_uint16_t *mcu_rd_scale;
1647
ogg_uint16_t *mcu_rd_iscale;
1648
const unsigned char *map_idxs;
1649
int nmap_idxs;
1650
oc_sb_flags *sb_flags;
1651
signed char *mb_modes;
1652
const oc_mb_map *mb_maps;
1653
const oc_sb_map *sb_maps;
1654
oc_fragment *frags;
1655
unsigned stripe_sby;
1656
unsigned mcu_nvsbs;
1657
int notstart;
1658
int notdone;
1659
int refi;
1660
int pli;
1661
_enc->state.frame_type=OC_INTRA_FRAME;
1662
oc_enc_tokenize_start(_enc);
1663
oc_enc_pipeline_init(_enc,&_enc->pipe);
1664
oc_enc_mode_rd_init(_enc);
1665
activity_sum=luma_sum=0;
1666
activity_avg=_enc->activity_avg;
1667
luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
1668
chroma_rd_scale=_enc->chroma_rd_scale[OC_INTRA_FRAME][_enc->state.qis[0]];
1669
mcu_rd_scale=_enc->mcu_rd_scale;
1670
mcu_rd_iscale=_enc->mcu_rd_iscale;
1671
/*Choose MVs and MB modes and quantize and code luma.
1672
Must be done in Hilbert order.*/
1673
map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1674
nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1675
_enc->state.ncoded_fragis[0]=0;
1676
_enc->state.ncoded_fragis[1]=0;
1677
_enc->state.ncoded_fragis[2]=0;
1678
sb_flags=_enc->state.sb_flags;
1679
mb_modes=_enc->state.mb_modes;
1680
mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
1681
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1682
frags=_enc->state.frags;
1683
notstart=0;
1684
notdone=1;
1685
mcu_nvsbs=_enc->mcu_nvsbs;
1686
for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
1687
ptrdiff_t cfroffset;
1688
unsigned sbi;
1689
unsigned sbi_end;
1690
notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
1691
sbi_end=_enc->pipe.sbi_end[0];
1692
cfroffset=_enc->pipe.froffset[1];
1693
for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
1694
int quadi;
1695
/*Mode addressing is through Y plane, always 4 MB per SB.*/
1696
for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
1697
unsigned activity[4];
1698
unsigned rd_scale[5];
1699
unsigned rd_iscale[5];
1700
unsigned luma;
1701
unsigned mbi;
1702
int mapii;
1703
int mapi;
1704
int bi;
1705
ptrdiff_t fragi;
1706
mbi=sbi<<2|quadi;
1707
/*Activity masking.*/
1708
if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
1709
luma=oc_mb_activity(_enc,mbi,activity);
1710
}
1711
else{
1712
unsigned intra_satd[12];
1713
luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
1714
oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
1715
for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0;
1716
}
1717
activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
1718
chroma_rd_scale,activity,activity_avg,luma,luma_avg);
1719
luma_sum+=luma;
1720
/*Motion estimation:
1721
We do a basic 1MV search for all macroblocks, coded or not,
1722
keyframe or not, unless we aren't using motion estimation at all.*/
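/*The MVs found here are not used to code the keyframe itself (every block is
   coded INTRA); presumably the search keeps MV candidates fresh for the
   following delta frames, which is also why it is skipped when every frame is
   forced to be a keyframe.*/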
1723
if(!_recode&&_enc->state.curframe_num>0&&
1724
_enc->sp_level<OC_SP_LEVEL_NOMC&&_enc->keyframe_frequency_force>1){
1725
oc_mcenc_search(_enc,mbi);
1726
}
1727
if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
1728
oc_analyze_intra_mb_luma(_enc,_enc->pipe.qs+0,mbi,rd_scale);
1729
}
1730
mb_modes[mbi]=OC_MODE_INTRA;
1731
oc_enc_mb_transform_quantize_intra_luma(_enc,&_enc->pipe,
1732
mbi,rd_scale,rd_iscale);
1733
/*Propagate final MB mode and MVs to the chroma blocks.*/
1734
for(mapii=4;mapii<nmap_idxs;mapii++){
1735
mapi=map_idxs[mapii];
1736
pli=mapi>>2;
1737
bi=mapi&3;
1738
fragi=mb_maps[mbi][pli][bi];
1739
frags[fragi].refi=OC_FRAME_SELF;
1740
frags[fragi].mb_mode=OC_MODE_INTRA;
1741
}
1742
/*Save masking scale factors for chroma blocks.*/
1743
for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
1744
mapi=map_idxs[mapii];
1745
bi=mapi&3;
1746
fragi=mb_maps[mbi][1][bi];
1747
mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
1748
mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
1749
}
1750
}
1751
}
1752
oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
1753
/*Code chroma planes.*/
1754
for(pli=1;pli<3;pli++){
1755
oc_enc_sb_transform_quantize_intra_chroma(_enc,&_enc->pipe,
1756
pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
1757
oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
1758
}
1759
notstart=1;
1760
}
1761
/*Compute the average block activity and MB luma score for the frame.*/
1762
_enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
1763
(unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
1764
_enc->state.fplanes[0].nfrags));
1765
_enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
1766
/*Finish filling in the reference frame borders.*/
1767
refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
1768
for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
1769
_enc->state.ntotal_coded_fragis=_enc->state.nfrags;
1770
}
1771
1772
1773
1774
/*Cost information about a MB mode.*/
1775
struct oc_mode_choice{
1776
unsigned cost;
1777
unsigned ssd;
1778
unsigned rate;
1779
unsigned overhead;
1780
unsigned char qii[12];
1781
};
1782
1783
1784
1785
static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
1786
_modec->cost=OC_MODE_RD_COST(_modec->ssd,
1787
_modec->rate+_modec->overhead,_lambda);
1788
}
1789
1790
/*A set of skip SSD's to use to disable early skipping.*/
1791
static const unsigned OC_NOSKIP[12]={
1792
UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
1793
UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
1794
UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX
1795
};
1796
1797
/*The estimated number of bits used by a coded chroma block to specify the AC
1798
quantizer.
1799
TODO: Currently this is just 0.5*log2(3) (estimating about 50% compression);
1800
measurements suggest this is in the right ballpark, but it varies somewhat
1801
with lambda.*/
1802
#define OC_CHROMA_QII_RATE ((0xCAE00D1DU>>31-OC_BIT_SCALE)+1>>1)
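/*Note: 0xCAE00D1D is log2(3) in Q31 fixed point; shifting down to
   OC_BIT_SCALE fractional bits and then computing (x+1)>>1 gives a rounded
   0.5*log2(3), or roughly 0.79 bits.*/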
1803
1804
static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
1805
oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1806
const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
1807
const unsigned _rd_scale[4],int _qti){
1808
oc_fr_state fr;
1809
oc_qii_state qs;
1810
unsigned ssd;
1811
unsigned rate;
1812
unsigned satd;
1813
unsigned best_ssd;
1814
unsigned best_rate;
1815
int best_fri;
1816
int best_qii;
1817
int lambda;
1818
int nqis;
1819
int nskipped;
1820
int bi;
1821
lambda=_enc->lambda;
1822
nqis=_enc->state.nqis;
1823
/*We could do a trellis optimization here, but we don't make final skip
1824
decisions until after transform+quantization, so the result wouldn't be
1825
optimal anyway.
1826
Instead we just use a greedy approach; for most SATD values, the
1827
differences between the qiis are large enough to drown out the cost to
1828
code the flags, anyway.*/
1829
*&fr=*_fr;
1830
*&qs=*_qs;
1831
ssd=rate=nskipped=0;
1832
for(bi=0;bi<4;bi++){
1833
oc_fr_state ft[2];
1834
oc_qii_state qt[3];
1835
unsigned best_cost;
1836
unsigned cur_cost;
1837
unsigned cur_ssd;
1838
unsigned cur_rate;
1839
unsigned cur_overhead;
1840
int qii;
1841
satd=_frag_satd[bi];
1842
*(ft+0)=*&fr;
1843
oc_fr_code_block(ft+0);
1844
cur_overhead=ft[0].bits-fr.bits;
1845
best_rate=oc_dct_cost2(_enc,&best_ssd,0,0,_qti,satd)
1846
+(cur_overhead<<OC_BIT_SCALE);
1847
if(nqis>1){
1848
oc_qii_state_advance(qt+0,&qs,0);
1849
best_rate+=qt[0].bits-qs.bits<<OC_BIT_SCALE;
1850
}
1851
best_ssd=OC_RD_SCALE(best_ssd,_rd_scale[bi]);
1852
best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
1853
best_fri=0;
1854
best_qii=0;
1855
for(qii=1;qii<nqis;qii++){
1856
oc_qii_state_advance(qt+qii,&qs,qii);
1857
cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,_qti,satd)
1858
+(cur_overhead+qt[qii].bits-qs.bits<<OC_BIT_SCALE);
1859
cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
1860
cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
1861
if(cur_cost<best_cost){
1862
best_cost=cur_cost;
1863
best_ssd=cur_ssd;
1864
best_rate=cur_rate;
1865
best_qii=qii;
1866
}
1867
}
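/*Skipping is only considered when the precomputed skip SSD is small enough to
   scale without overflow (the UINT_MAX entries of OC_NOSKIP always fail this
   test, which is how early skipping is disabled), and nskipped<3 keeps at
   least one luma block coded, presumably because a candidate in which every
   block is skipped would correspond to an uncoded macro block.*/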
1868
if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)&&nskipped<3){
1869
*(ft+1)=*&fr;
1870
oc_fr_skip_block(ft+1);
1871
cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE;
1872
cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1873
cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda);
1874
if(cur_cost<=best_cost){
1875
best_ssd=cur_ssd;
1876
best_rate=cur_overhead;
1877
best_fri=1;
1878
best_qii+=4;
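/*Adding 4 pushes qii past any valid quantizer index; later code (e.g., the
   4MV cost function's check against nqis) uses this to detect blocks marked
   for skipping.*/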
1879
}
1880
}
1881
rate+=best_rate;
1882
ssd+=best_ssd;
1883
*&fr=*(ft+best_fri);
1884
if(best_fri==0)*&qs=*(qt+best_qii);
1885
else nskipped++;
1886
_modec->qii[bi]=best_qii;
1887
}
1888
_modec->ssd=ssd;
1889
_modec->rate=rate;
1890
}
1891
1892
static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
1893
oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1894
const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
1895
unsigned _rd_scale,int _qti){
1896
unsigned ssd;
1897
unsigned rate;
1898
unsigned satd;
1899
unsigned best_ssd;
1900
unsigned best_rate;
1901
int best_qii;
1902
unsigned cur_cost;
1903
unsigned cur_ssd;
1904
unsigned cur_rate;
1905
int lambda;
1906
int nblocks;
1907
int nqis;
1908
int pli;
1909
int bi;
1910
int qii;
1911
lambda=_enc->lambda;
1912
/*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
1913
worth spending the bits to change the AC quantizer.
1914
TODO: This may be worth revisiting when we separate out DC and AC
1915
predictions from SATD.*/
1916
#if 0
1917
nqis=_enc->state.nqis;
1918
#else
1919
nqis=1;
1920
#endif
1921
ssd=_modec->ssd;
1922
rate=_modec->rate;
1923
/*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded
1924
order, we assume a constant overhead for coded block and qii flags.*/
1925
nblocks=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1926
nblocks=(nblocks-4>>1)+4;
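/*nblocks now indexes one past the last Cb block; it is widened again after
   the Cb loop so that the second pass covers the Cr blocks.*/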
1927
bi=4;
1928
for(pli=1;pli<3;pli++){
1929
for(;bi<nblocks;bi++){
1930
unsigned best_cost;
1931
satd=_frag_satd[bi];
1932
best_rate=oc_dct_cost2(_enc,&best_ssd,0,pli,_qti,satd)
1933
+OC_CHROMA_QII_RATE;
1934
best_ssd=OC_RD_SCALE(best_ssd,_rd_scale);
1935
best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
1936
best_qii=0;
1937
for(qii=1;qii<nqis;qii++){
1938
cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,pli,_qti,satd)
1939
+OC_CHROMA_QII_RATE;
1940
cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
1941
cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
1942
if(cur_cost<best_cost){
1943
best_cost=cur_cost;
1944
best_ssd=cur_ssd;
1945
best_rate=cur_rate;
1946
best_qii=qii;
1947
}
1948
}
1949
if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)){
1950
cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1951
cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
1952
if(cur_cost<=best_cost){
1953
best_ssd=cur_ssd;
1954
best_rate=0;
1955
best_qii+=4;
1956
}
1957
}
1958
rate+=best_rate;
1959
ssd+=best_ssd;
1960
_modec->qii[bi]=best_qii;
1961
}
1962
nblocks=(nblocks-4<<1)+4;
1963
}
1964
_modec->ssd=ssd;
1965
_modec->rate=rate;
1966
}
1967
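/*Estimate the SSD cost of leaving each block of this macro block uncoded,
   i.e., of reusing the co-located block from the previous reconstructed frame,
   and store the results for the skip decisions made by the mode cost
   functions.*/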
1968
static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
1969
unsigned _mbi,const unsigned _rd_scale[4],unsigned _ssd[12]){
1970
const unsigned char *src;
1971
const unsigned char *ref;
1972
int ystride;
1973
const oc_fragment *frags;
1974
const ptrdiff_t *frag_buf_offs;
1975
const ptrdiff_t *sb_map;
1976
const oc_mb_map_plane *mb_map;
1977
const unsigned char *map_idxs;
1978
oc_mv *mvs;
1979
int map_nidxs;
1980
unsigned uncoded_ssd;
1981
int mapii;
1982
int mapi;
1983
int pli;
1984
int bi;
1985
ptrdiff_t fragi;
1986
ptrdiff_t frag_offs;
1987
int borderi;
1988
src=_enc->state.ref_frame_data[OC_FRAME_IO];
1989
ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
1990
ystride=_enc->state.ref_ystride[0];
1991
frags=_enc->state.frags;
1992
frag_buf_offs=_enc->state.frag_buf_offs;
1993
sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1994
mvs=_enc->mb_info[_mbi].block_mv;
1995
for(bi=0;bi<4;bi++){
1996
fragi=sb_map[bi];
1997
borderi=frags[fragi].borderi;
1998
frag_offs=frag_buf_offs[fragi];
1999
if(borderi<0){
2000
uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
2001
}
2002
else{
2003
uncoded_ssd=oc_enc_frag_border_ssd(_enc,
2004
src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
2005
}
2006
/*Scale to match DCT domain and RD.*/
2007
uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[bi]);
2008
/*Motion is a special case; if there is more than a full-pixel motion
2009
against the prior frame, penalize skipping.
2010
TODO: The factor of two here is a kludge, but it tested out better than a
2011
hard limit.*/
2012
if(mvs[bi]!=0)uncoded_ssd*=2;
2013
_pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=_ssd[bi]=uncoded_ssd;
2014
}
2015
mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2016
map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2017
map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2018
map_nidxs=(map_nidxs-4>>1)+4;
2019
mapii=4;
2020
mvs=_enc->mb_info[_mbi].unref_mv;
2021
for(pli=1;pli<3;pli++){
2022
ystride=_enc->state.ref_ystride[pli];
2023
for(;mapii<map_nidxs;mapii++){
2024
mapi=map_idxs[mapii];
2025
bi=mapi&3;
2026
fragi=mb_map[pli][bi];
2027
borderi=frags[fragi].borderi;
2028
frag_offs=frag_buf_offs[fragi];
2029
if(borderi<0){
2030
uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
2031
}
2032
else{
2033
uncoded_ssd=oc_enc_frag_border_ssd(_enc,
2034
src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
2035
}
2036
/*Scale to match DCT domain and RD.*/
2037
uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[4]);
2038
/*Motion is a special case; if there is more than a full-pixel motion
2039
against the prior frame, penalize skipping.
2040
TODO: The factor of two here is a kludge, but it tested out better than
2041
a hard limit.*/
2042
if(mvs[OC_FRAME_PREV]!=0)uncoded_ssd*=2;
2043
_pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=_ssd[mapii]=uncoded_ssd;
2044
}
2045
map_nidxs=(map_nidxs-4<<1)+4;
2046
}
2047
}
2048
2049
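/*Estimate the R-D cost of coding this macro block in INTRA mode, including
   the mode-signalling overhead reported by the scheme chooser.*/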
2050
static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2051
unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs,
2052
const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
2053
const unsigned _rd_scale[5]){
2054
oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,_rd_scale,0);
2055
oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2056
_frag_satd,_skip_ssd,_rd_scale[4],0);
2057
_modec->overhead=
2058
oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE;
2059
oc_mode_set_cost(_modec,_enc->lambda);
2060
}
2061
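/*Estimate the R-D cost of coding this macro block in a single-MV inter mode
   against the reference frame implied by _mb_mode.
  The SATD (or plain SAD at high speed levels) of the motion-compensated
   residual feeds the same luma/chroma R-D models used for INTRA.*/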
2062
static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2063
unsigned _mbi,int _mb_mode,oc_mv _mv,
2064
const oc_fr_state *_fr,const oc_qii_state *_qs,
2065
const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
2066
unsigned frag_satd[12];
2067
const unsigned char *src;
2068
const unsigned char *ref;
2069
int ystride;
2070
const ptrdiff_t *frag_buf_offs;
2071
const ptrdiff_t *sb_map;
2072
const oc_mb_map_plane *mb_map;
2073
const unsigned char *map_idxs;
2074
int map_nidxs;
2075
int mapii;
2076
int mapi;
2077
int mv_offs[2];
2078
int pli;
2079
int bi;
2080
ptrdiff_t fragi;
2081
ptrdiff_t frag_offs;
2082
int dc;
2083
src=_enc->state.ref_frame_data[OC_FRAME_IO];
2084
ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)];
2085
ystride=_enc->state.ref_ystride[0];
2086
frag_buf_offs=_enc->state.frag_buf_offs;
2087
sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
2088
_modec->rate=_modec->ssd=0;
2089
if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv)>1){
2090
for(bi=0;bi<4;bi++){
2091
fragi=sb_map[bi];
2092
frag_offs=frag_buf_offs[fragi];
2093
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2094
frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2095
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2096
frag_satd[bi]+=abs(dc);
2097
}
2098
else{
2099
frag_satd[bi]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
2100
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
2101
}
2102
}
2103
}
2104
else{
2105
for(bi=0;bi<4;bi++){
2106
fragi=sb_map[bi];
2107
frag_offs=frag_buf_offs[fragi];
2108
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2109
frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2110
ref+frag_offs+mv_offs[0],ystride);
2111
frag_satd[bi]+=abs(dc);
2112
}
2113
else{
2114
frag_satd[bi]=oc_enc_frag_sad(_enc,src+frag_offs,
2115
ref+frag_offs+mv_offs[0],ystride);
2116
}
2117
}
2118
}
2119
mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2120
map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2121
map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2122
/*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
2123
ystride=_enc->state.ref_ystride[1];
2124
if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,_mv)>1){
2125
for(mapii=4;mapii<map_nidxs;mapii++){
2126
mapi=map_idxs[mapii];
2127
pli=mapi>>2;
2128
bi=mapi&3;
2129
fragi=mb_map[pli][bi];
2130
frag_offs=frag_buf_offs[fragi];
2131
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2132
frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2133
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2134
frag_satd[mapii]+=abs(dc);
2135
}
2136
else{
2137
frag_satd[mapii]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
2138
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
2139
}
2140
}
2141
}
2142
else{
2143
for(mapii=4;mapii<map_nidxs;mapii++){
2144
mapi=map_idxs[mapii];
2145
pli=mapi>>2;
2146
bi=mapi&3;
2147
fragi=mb_map[pli][bi];
2148
frag_offs=frag_buf_offs[fragi];
2149
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2150
frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2151
ref+frag_offs+mv_offs[0],ystride);
2152
frag_satd[mapii]+=abs(dc);
2153
}
2154
else{
2155
frag_satd[mapii]=oc_enc_frag_sad(_enc,src+frag_offs,
2156
ref+frag_offs+mv_offs[0],ystride);
2157
}
2158
}
2159
}
2160
oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1);
2161
oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2162
frag_satd,_skip_ssd,_rd_scale[4],1);
2163
_modec->overhead=
2164
oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE;
2165
oc_mode_set_cost(_modec,_enc->lambda);
2166
}
2167
2168
static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2169
unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs,
2170
const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
2171
oc_cost_inter(_enc,_modec,_mbi,_mb_mode,0,_fr,_qs,_skip_ssd,_rd_scale);
2172
}
2173
2174
static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2175
unsigned _mbi,int _mb_mode,oc_mv _mv,
2176
const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12],
2177
const unsigned _rd_scale[5]){
2178
int bits0;
2179
oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd,_rd_scale);
2180
bits0=OC_MV_BITS[0][OC_MV_X(_mv)+31]+OC_MV_BITS[0][OC_MV_Y(_mv)+31];
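/*The overhead charged for the MV is its marginal cost: the increase in the
   running total of whichever MV coding scheme (component-wise variable-length
   codes vs. a fixed 12 bits per MV) is currently cheaper for the frame.*/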
2181
_modec->overhead+=OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+12)
2182
-OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
2183
oc_mode_set_cost(_modec,_enc->lambda);
2184
return bits0;
2185
}
2186
2187
/*A mapping from oc_mb_map (raster) ordering to oc_sb_map (Hilbert) ordering.*/
2188
static const unsigned char OC_MB_PHASE[4][4]={
2189
{0,1,3,2},{0,3,1,2},{0,3,1,2},{2,3,1,0}
2190
};
2191
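/*Estimate the R-D cost of coding this macro block with four separate block
   MVs.
  Luma SATDs are stored in coding (Hilbert) order via OC_MB_PHASE, chroma MVs
   are derived from the luma MVs, and MV bits are only charged for blocks that
   are not marked as skipped.*/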
2192
static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2193
unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs,
2194
const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
2195
unsigned frag_satd[12];
2196
oc_mv lbmvs[4];
2197
oc_mv cbmvs[4];
2198
const unsigned char *src;
2199
const unsigned char *ref;
2200
int ystride;
2201
const ptrdiff_t *frag_buf_offs;
2202
oc_mv *frag_mvs;
2203
const oc_mb_map_plane *mb_map;
2204
const unsigned char *map_idxs;
2205
int map_nidxs;
2206
int nqis;
2207
int mapii;
2208
int mapi;
2209
int mv_offs[2];
2210
int pli;
2211
int bi;
2212
ptrdiff_t fragi;
2213
ptrdiff_t frag_offs;
2214
int bits0;
2215
int bits1;
2216
unsigned satd;
2217
int dc;
2218
src=_enc->state.ref_frame_data[OC_FRAME_IO];
2219
ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
2220
ystride=_enc->state.ref_ystride[0];
2221
frag_buf_offs=_enc->state.frag_buf_offs;
2222
frag_mvs=_enc->state.frag_mvs;
2223
mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2224
_modec->rate=_modec->ssd=0;
2225
for(bi=0;bi<4;bi++){
2226
fragi=mb_map[0][bi];
2227
/*Save the block MVs as the current ones while we're here; we'll replace
2228
them if we don't ultimately choose 4MV mode.*/
2229
frag_mvs[fragi]=_mv[bi];
2230
frag_offs=frag_buf_offs[fragi];
2231
if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv[bi])>1){
2232
satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2233
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2234
}
2235
else{
2236
satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2237
ref+frag_offs+mv_offs[0],ystride);
2238
}
2239
frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+abs(dc);
2240
}
2241
oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
2242
_enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1);
2243
/*Figure out which blocks are being skipped and give them (0,0) MVs.*/
2244
bits0=0;
2245
bits1=0;
2246
nqis=_enc->state.nqis;
2247
for(bi=0;bi<4;bi++){
2248
if(_modec->qii[OC_MB_PHASE[_mbi&3][bi]]>=nqis)lbmvs[bi]=0;
2249
else{
2250
lbmvs[bi]=_mv[bi];
2251
bits0+=OC_MV_BITS[0][OC_MV_X(_mv[bi])+31]
2252
+OC_MV_BITS[0][OC_MV_Y(_mv[bi])+31];
2253
bits1+=12;
2254
}
2255
}
2256
(*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,lbmvs);
2257
map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2258
map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2259
/*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
2260
ystride=_enc->state.ref_ystride[1];
2261
for(mapii=4;mapii<map_nidxs;mapii++){
2262
mapi=map_idxs[mapii];
2263
pli=mapi>>2;
2264
bi=mapi&3;
2265
fragi=mb_map[pli][bi];
2266
frag_offs=frag_buf_offs[fragi];
2267
/*TODO: We could save half these calls by re-using the results for the Cb
2268
and Cr planes; is it worth it?*/
2269
if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,cbmvs[bi])>1){
2270
satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2271
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2272
}
2273
else{
2274
satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2275
ref+frag_offs+mv_offs[0],ystride);
2276
}
2277
frag_satd[mapii]=satd+abs(dc);
2278
}
2279
oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2280
frag_satd,_skip_ssd,_rd_scale[4],1);
2281
_modec->overhead=
2282
oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR)
2283
+OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+bits1)
2284
-OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
2285
oc_mode_set_cost(_modec,_enc->lambda);
2286
}
2287
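/*Analysis stage for an INTER (delta) frame.
  Returns 1 if the accumulated inter-frame cost exceeds the estimated intra
   cost, i.e., the frame should be coded as a keyframe instead.*/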
2288
int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
2289
oc_set_chroma_mvs_func set_chroma_mvs;
2290
oc_qii_state intra_luma_qs;
2291
oc_mv last_mv;
2292
oc_mv prior_mv;
2293
ogg_int64_t interbits;
2294
ogg_int64_t intrabits;
2295
ogg_int64_t activity_sum;
2296
ogg_int64_t luma_sum;
2297
unsigned activity_avg;
2298
unsigned luma_avg;
2299
const ogg_uint16_t *chroma_rd_scale;
2300
ogg_uint16_t *mcu_rd_scale;
2301
ogg_uint16_t *mcu_rd_iscale;
2302
const unsigned char *map_idxs;
2303
int nmap_idxs;
2304
unsigned *coded_mbis;
2305
unsigned *uncoded_mbis;
2306
size_t ncoded_mbis;
2307
size_t nuncoded_mbis;
2308
oc_sb_flags *sb_flags;
2309
signed char *mb_modes;
2310
const oc_sb_map *sb_maps;
2311
const oc_mb_map *mb_maps;
2312
oc_mb_enc_info *embs;
2313
oc_fragment *frags;
2314
oc_mv *frag_mvs;
2315
unsigned stripe_sby;
2316
unsigned mcu_nvsbs;
2317
int notstart;
2318
int notdone;
2319
unsigned sbi;
2320
unsigned sbi_end;
2321
int refi;
2322
int pli;
2323
int sp_level;
2324
sp_level=_enc->sp_level;
2325
set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
2326
_enc->state.frame_type=OC_INTER_FRAME;
2327
oc_mode_scheme_chooser_reset(&_enc->chooser);
2328
oc_enc_tokenize_start(_enc);
2329
oc_enc_pipeline_init(_enc,&_enc->pipe);
2330
oc_enc_mode_rd_init(_enc);
2331
if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
2332
_enc->mv_bits[0]=_enc->mv_bits[1]=0;
2333
interbits=intrabits=0;
2334
activity_sum=luma_sum=0;
2335
activity_avg=_enc->activity_avg;
2336
luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
2337
chroma_rd_scale=_enc->chroma_rd_scale[OC_INTER_FRAME][_enc->state.qis[0]];
2338
mcu_rd_scale=_enc->mcu_rd_scale;
2339
mcu_rd_iscale=_enc->mcu_rd_iscale;
2340
last_mv=prior_mv=0;
2341
/*Choose MVs and MB modes and quantize and code luma.
2342
Must be done in Hilbert order.*/
2343
map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2344
nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2345
coded_mbis=_enc->coded_mbis;
2346
uncoded_mbis=coded_mbis+_enc->state.nmbs;
2347
ncoded_mbis=0;
2348
nuncoded_mbis=0;
2349
_enc->state.ncoded_fragis[0]=0;
2350
_enc->state.ncoded_fragis[1]=0;
2351
_enc->state.ncoded_fragis[2]=0;
2352
sb_flags=_enc->state.sb_flags;
2353
mb_modes=_enc->state.mb_modes;
2354
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
2355
mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
2356
embs=_enc->mb_info;
2357
frags=_enc->state.frags;
2358
frag_mvs=_enc->state.frag_mvs;
2359
notstart=0;
2360
notdone=1;
2361
mcu_nvsbs=_enc->mcu_nvsbs;
2362
for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
2363
ptrdiff_t cfroffset;
2364
notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
2365
sbi_end=_enc->pipe.sbi_end[0];
2366
cfroffset=_enc->pipe.froffset[1];
2367
for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
2368
int quadi;
2369
/*Mode addressing is through Y plane, always 4 MB per SB.*/
2370
for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
2371
oc_mode_choice modes[8];
2372
unsigned activity[4];
2373
unsigned rd_scale[5];
2374
unsigned rd_iscale[5];
2375
unsigned skip_ssd[12];
2376
unsigned intra_satd[12];
2377
unsigned luma;
2378
int mb_mv_bits_0;
2379
int mb_gmv_bits_0;
2380
int inter_mv_pref;
2381
int mb_mode;
2382
int refi;
2383
int mv;
2384
unsigned mbi;
2385
int mapii;
2386
int mapi;
2387
int bi;
2388
ptrdiff_t fragi;
2389
mbi=sbi<<2|quadi;
2390
luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
2391
/*Activity masking.*/
2392
if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
2393
oc_mb_activity(_enc,mbi,activity);
2394
}
2395
else oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
2396
luma_sum+=luma;
2397
activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
2398
chroma_rd_scale,activity,activity_avg,luma,luma_avg);
2399
/*Motion estimation:
2400
We always do a basic 1MV search for all macroblocks, coded or not,
2401
keyframe or not.*/
2402
if(!_recode&&sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
2403
mv=0;
2404
/*Find the block choice with the lowest estimated coding cost.
2405
If a Cb or Cr block of a macro block is coded but none of its Y' blocks are,
2406
the mode MUST be OC_MODE_INTER_NOMV.
2407
This is the default state to which the mode data structure is
2408
initialised in encoder and decoder at the start of each frame.*/
2409
/*Block coding cost is estimated from correlated SATD metrics.*/
2410
/*At this point, all blocks that are in the frame are still marked coded.*/
2411
if(!_recode){
2412
embs[mbi].unref_mv[OC_FRAME_GOLD]=
2413
embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
2414
embs[mbi].unref_mv[OC_FRAME_PREV]=
2415
embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2416
embs[mbi].refined=0;
2417
}
2418
/*Estimate the cost of coding this MB in a keyframe.*/
2419
if(_allow_keyframe){
2420
oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2421
_enc->pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale);
2422
intrabits+=modes[OC_MODE_INTRA].rate;
2423
for(bi=0;bi<4;bi++){
2424
oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
2425
modes[OC_MODE_INTRA].qii[bi]);
2426
}
2427
}
2428
/*Estimate the cost in a delta frame for various modes.*/
2429
oc_skip_cost(_enc,&_enc->pipe,mbi,rd_scale,skip_ssd);
2430
if(sp_level<OC_SP_LEVEL_NOMC){
2431
oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
2432
OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2433
skip_ssd,rd_scale);
2434
oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2435
_enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
2436
mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
2437
OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
2438
_enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2439
oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
2440
OC_MODE_INTER_MV_LAST,last_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2441
skip_ssd,rd_scale);
2442
oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
2443
OC_MODE_INTER_MV_LAST2,prior_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2444
skip_ssd,rd_scale);
2445
oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
2446
OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2447
skip_ssd,rd_scale);
2448
mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
2449
OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
2450
_enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2451
/*The explicit MV modes (2,6,7) have not yet gone through halfpel
2452
refinement.
2453
We choose the explicit MV mode that's already furthest ahead on
2454
R-D cost and refine only that one.
2455
We have to be careful to remember which ones we've refined so that
2456
we don't refine it again if we re-encode this frame.*/
2457
inter_mv_pref=_enc->lambda*3;
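/*This bias favors OC_MODE_INTER_MV in the final decision below: it can win
   even when it costs up to inter_mv_pref more than the best mode so far,
   except against LAST and LAST2, which clear the preference.*/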
2458
if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
2459
oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
2460
embs[mbi].block_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2461
skip_ssd,rd_scale);
2462
}
2463
else{
2464
modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX;
2465
}
2466
if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
2467
modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
2468
if(!(embs[mbi].refined&0x80)){
2469
oc_mcenc_refine4mv(_enc,mbi);
2470
embs[mbi].refined|=0x80;
2471
}
2472
oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
2473
embs[mbi].ref_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2474
skip_ssd,rd_scale);
2475
}
2476
else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
2477
modes[OC_MODE_INTER_MV].cost){
2478
if(!(embs[mbi].refined&0x40)){
2479
oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
2480
embs[mbi].refined|=0x40;
2481
}
2482
mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
2483
OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
2484
_enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2485
}
2486
if(!(embs[mbi].refined&0x04)){
2487
oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
2488
embs[mbi].refined|=0x04;
2489
}
2490
mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
2491
OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
2492
_enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2493
/*Finally, pick the mode with the cheapest estimated R-D cost.*/
2494
mb_mode=OC_MODE_INTER_NOMV;
2495
if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
2496
mb_mode=OC_MODE_INTRA;
2497
}
2498
if(modes[OC_MODE_INTER_MV_LAST].cost<modes[mb_mode].cost){
2499
mb_mode=OC_MODE_INTER_MV_LAST;
2500
}
2501
if(modes[OC_MODE_INTER_MV_LAST2].cost<modes[mb_mode].cost){
2502
mb_mode=OC_MODE_INTER_MV_LAST2;
2503
}
2504
if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
2505
mb_mode=OC_MODE_GOLDEN_NOMV;
2506
}
2507
if(modes[OC_MODE_GOLDEN_MV].cost<modes[mb_mode].cost){
2508
mb_mode=OC_MODE_GOLDEN_MV;
2509
}
2510
if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[mb_mode].cost){
2511
mb_mode=OC_MODE_INTER_MV_FOUR;
2512
}
2513
/*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
2514
if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
2515
inter_mv_pref=0;
2516
}
2517
if(modes[OC_MODE_INTER_MV].cost<modes[mb_mode].cost+inter_mv_pref){
2518
mb_mode=OC_MODE_INTER_MV;
2519
}
2520
}
2521
else{
2522
oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
2523
OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2524
skip_ssd,rd_scale);
2525
oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2526
_enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
2527
oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
2528
OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2529
skip_ssd,rd_scale);
2530
mb_mode=OC_MODE_INTER_NOMV;
2531
if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
2532
mb_mode=OC_MODE_INTRA;
2533
}
2534
if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
2535
mb_mode=OC_MODE_GOLDEN_NOMV;
2536
}
2537
mb_mv_bits_0=mb_gmv_bits_0=0;
2538
}
2539
mb_modes[mbi]=mb_mode;
2540
/*Propagate the MVs to the luma blocks.*/
2541
if(mb_mode!=OC_MODE_INTER_MV_FOUR){
2542
switch(mb_mode){
2543
case OC_MODE_INTER_MV:{
2544
mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2545
}break;
2546
case OC_MODE_INTER_MV_LAST:mv=last_mv;break;
2547
case OC_MODE_INTER_MV_LAST2:mv=prior_mv;break;
2548
case OC_MODE_GOLDEN_MV:{
2549
mv=embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
2550
}break;
2551
}
2552
for(bi=0;bi<4;bi++){
2553
fragi=mb_maps[mbi][0][bi];
2554
frag_mvs[fragi]=mv;
2555
}
2556
}
2557
for(bi=0;bi<4;bi++){
2558
fragi=sb_maps[mbi>>2][mbi&3][bi];
2559
frags[fragi].qii=modes[mb_mode].qii[bi];
2560
}
2561
if(oc_enc_mb_transform_quantize_inter_luma(_enc,&_enc->pipe,mbi,
2562
modes[mb_mode].overhead>>OC_BIT_SCALE,rd_scale,rd_iscale)>0){
2563
int orig_mb_mode;
2564
orig_mb_mode=mb_mode;
2565
mb_mode=mb_modes[mbi];
2566
refi=OC_FRAME_FOR_MODE(mb_mode);
2567
switch(mb_mode){
2568
case OC_MODE_INTER_MV:{
2569
prior_mv=last_mv;
2570
/*If we're backing out from 4MV, find the MV we're actually
2571
using.*/
2572
if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
2573
for(bi=0;;bi++){
2574
fragi=mb_maps[mbi][0][bi];
2575
if(frags[fragi].coded){
2576
mv=last_mv=frag_mvs[fragi];
2577
break;
2578
}
2579
}
2580
mb_mv_bits_0=OC_MV_BITS[0][OC_MV_X(mv)+31]
2581
+OC_MV_BITS[0][OC_MV_Y(mv)+31];
2582
}
2583
/*Otherwise we used the original analysis MV.*/
2584
else last_mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2585
_enc->mv_bits[0]+=mb_mv_bits_0;
2586
_enc->mv_bits[1]+=12;
2587
}break;
2588
case OC_MODE_INTER_MV_LAST2:{
2589
oc_mv tmp_mv;
2590
tmp_mv=prior_mv;
2591
prior_mv=last_mv;
2592
last_mv=tmp_mv;
2593
}break;
2594
case OC_MODE_GOLDEN_MV:{
2595
_enc->mv_bits[0]+=mb_gmv_bits_0;
2596
_enc->mv_bits[1]+=12;
2597
}break;
2598
case OC_MODE_INTER_MV_FOUR:{
2599
oc_mv lbmvs[4];
2600
oc_mv cbmvs[4];
2601
prior_mv=last_mv;
2602
for(bi=0;bi<4;bi++){
2603
fragi=mb_maps[mbi][0][bi];
2604
if(frags[fragi].coded){
2605
lbmvs[bi]=last_mv=frag_mvs[fragi];
2606
_enc->mv_bits[0]+=OC_MV_BITS[0][OC_MV_X(last_mv)+31]
2607
+OC_MV_BITS[0][OC_MV_Y(last_mv)+31];
2608
_enc->mv_bits[1]+=12;
2609
}
2610
/*Replace the block MVs for not-coded blocks with (0,0).*/
2611
else lbmvs[bi]=0;
2612
}
2613
(*set_chroma_mvs)(cbmvs,lbmvs);
2614
for(mapii=4;mapii<nmap_idxs;mapii++){
2615
mapi=map_idxs[mapii];
2616
pli=mapi>>2;
2617
bi=mapi&3;
2618
fragi=mb_maps[mbi][pli][bi];
2619
frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
2620
frags[fragi].refi=refi;
2621
frags[fragi].mb_mode=mb_mode;
2622
frag_mvs[fragi]=cbmvs[bi];
2623
}
2624
}break;
2625
}
2626
coded_mbis[ncoded_mbis++]=mbi;
2627
oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
2628
interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
2629
}
2630
else{
2631
*(uncoded_mbis-++nuncoded_mbis)=mbi;
2632
mb_mode=OC_MODE_INTER_NOMV;
2633
refi=OC_FRAME_PREV;
2634
mv=0;
2635
}
2636
/*Propagate final MB mode and MVs to the chroma blocks.
2637
This has already been done for 4MV mode, since it requires individual
2638
block motion vectors.*/
2639
if(mb_mode!=OC_MODE_INTER_MV_FOUR){
2640
for(mapii=4;mapii<nmap_idxs;mapii++){
2641
mapi=map_idxs[mapii];
2642
pli=mapi>>2;
2643
bi=mapi&3;
2644
fragi=mb_maps[mbi][pli][bi];
2645
/*If we switched from 4MV mode to INTER_MV mode, then the qii
2646
values won't have been chosen with the right MV, but it's
2647
probably not worth re-estimating them.*/
2648
frags[fragi].qii=modes[mb_mode].qii[mapii];
2649
frags[fragi].refi=refi;
2650
frags[fragi].mb_mode=mb_mode;
2651
frag_mvs[fragi]=mv;
2652
}
2653
}
2654
/*Save masking scale factors for chroma blocks.*/
2655
for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
2656
mapi=map_idxs[mapii];
2657
bi=mapi&3;
2658
fragi=mb_maps[mbi][1][bi];
2659
mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
2660
mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
2661
}
2662
}
2663
oc_fr_state_flush_sb(_enc->pipe.fr+0);
2664
sb_flags[sbi].coded_fully=_enc->pipe.fr[0].sb_full;
2665
sb_flags[sbi].coded_partially=_enc->pipe.fr[0].sb_partial;
2666
}
2667
oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
2668
/*Code chroma planes.*/
2669
for(pli=1;pli<3;pli++){
2670
oc_enc_sb_transform_quantize_inter_chroma(_enc,&_enc->pipe,
2671
pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
2672
oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
2673
}
2674
notstart=1;
2675
}
2676
/*Update the average block activity and MB luma score for the frame.
2677
We could use a Bessel follower here, but fast reaction is probably almost
2678
always best.*/
2679
_enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
2680
(unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
2681
_enc->state.fplanes[0].nfrags));
2682
_enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
2683
/*Finish filling in the reference frame borders.*/
2684
refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
2685
for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
2686
/*Finish adding flagging overhead costs to inter bit counts to determine if
2687
we should have coded a key frame instead.*/
2688
if(_allow_keyframe){
2689
/*Technically the chroma plane counts are over-estimations, because they
2690
don't account for continuing runs from the luma planes, but the
2691
inaccuracy is small.
2692
We don't need to add the luma plane coding flag costs, because they are
2693
already included in the MB rate estimates.*/
2694
for(pli=1;pli<3;pli++)interbits+=_enc->pipe.fr[pli].bits<<OC_BIT_SCALE;
2695
if(interbits>intrabits)return 1;
2696
}
2697
_enc->ncoded_mbis=ncoded_mbis;
2698
/*Compact the coded fragment list.*/
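/*Each plane's coded-fragment list was accumulated at that plane's fragment
   offset; move the chroma lists down so that all coded fragment indices form
   one contiguous array.*/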
2699
{
2700
ptrdiff_t ncoded_fragis;
2701
ncoded_fragis=_enc->state.ncoded_fragis[0];
2702
for(pli=1;pli<3;pli++){
2703
memmove(_enc->state.coded_fragis+ncoded_fragis,
2704
_enc->state.coded_fragis+_enc->state.fplanes[pli].froffset,
2705
_enc->state.ncoded_fragis[pli]*sizeof(*_enc->state.coded_fragis));
2706
ncoded_fragis+=_enc->state.ncoded_fragis[pli];
2707
}
2708
_enc->state.ntotal_coded_fragis=ncoded_fragis;
2709
}
2710
return 0;
2711
}
2712
2713