[Ffmpeg-devel] [RFC] VC3/DNxHD decoder

Sun Mar 18 20:32:43 CET 2007

Hi

Michael Niedermayer wrote:
> Hi
> 
> On Sat, Mar 17, 2007 at 03:40:19PM +0100, Baptiste Coudurier wrote:
>> Hi
>>
>> $subject.
>>
>> I need to include "mpegvideo.h" only for Scantable and
>> ff_init_scantable, can we separate them from mpegvideo.h ? like
>> scantable.h ?
>>
>> It decodes 1080p atm, but you should only need to add ac/dc/run vlc
>> tables to decode other progressive modes. Interlaced mode is almost
>> finished, bitstream is one complete frame but 2 sequentials fields
>> bitstreams.
>>
>> Also if someone could explain me how CODEC_CAP_DR1 and draw_horiz_band
>> and low_res works, I can implement them.
>>
>> Optimization ideas very welcome, same for dsp functions use.
> 
> [...]
> 
>> +    int cid; //<< compression id
> 
> hmm shouldnt it be ///<

Fixed.

> [...]
>> +    DCTELEM blocks[8][64];
> 
> this should be DECLARE_ALIGNED_16

Done.

>> +    DECLARE_ALIGNED_8(ScanTable, scantable);
>> +    const CIDEntry *cid_table;
>> +} DNXHDContext;
>> +
>> +static const CIDEntry cid_table[] = {
>> +    { 1238, 1920, 1080, 0, 917504, 4, 8,
>> +      dnxhd_1238_luma_weigth, dnxhd_1238_chroma_weigth,
>> +      dnxhd_1238_dc_codes, dnxhd_1238_dc_bits,
>> +      dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_level,
>> +      dnxhd_1238_ac_run_flag, dnxhd_1238_ac_index_flag,
>> +      dnxhd_1238_run_codes, dnxhd_1238_run_bits, dnxhd_1238_run_level },
>> +/*     { 1243, 1920, 1080, 1, 917504, 4, 8, */
>> +/*       dnxhd_1243_luma_weigth, dnxhd_1243_chroma_weigth, */
>> +/*       dnxhd_1238_dc_codes, dnxhd_1238_dc_bits, */
>> +/*       dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_level, */
>> +/*       dnxhd_1238_ac_run_flag, dnxhd_1238_ac_index_flag, */
>> +/*       dnxhd_1238_run_codes, dnxhd_1238_run_bits, dnxhd_1238_run_level }, */
>> +};
> 
> id suggest that all values which are always the same no matter which cid
> should be hardcoded as that should be faster

What do you mean by hardcoded ?

> [...]
>> +        init_vlc(&ctx->dc_vlc, DNXHD_VLC_BITS, 12,
>> +                 cid_table->dc_bits, 1, 1,
>> +                 cid_table->dc_codes, 1, 1, 0);
> 
> the longest dc bits is just 6 so DNXHD_VLC_BITS, which is 9 would waste
> memory and cache

added DNXHD_DC_VLC_BITS defined to 6 and changed init_vlc.

> [...]
>> +        for(i = 0; i < 64; i++)
>> +            ctx->luma_weigth[dnxhd_zigzag[i]] = cid_table->luma_weigth[i];
>> +        for(i = 0; i < 64; i++)
>> +            ctx->chroma_weigth[dnxhd_zigzag[i]] = cid_table->chroma_weigth[i];
> 
> you could store these tables already permutated in dnxhd_zigzag order in the
> header

Yes, done.

> [...]
>> +    av_log(ctx->avctx, AV_LOG_DEBUG, "mb width %d, mb height %d\n", ctx->mb_width, ctx->mb_height);
>> +    for (i = 0; i < ctx->mb_height; i++) {
>> +        ctx->mb_scan_index[i] = AV_RB32(buf + 0x170 + (i<<2));
>> +        av_log(ctx->avctx, AV_LOG_DEBUG, "mb scan index %d\n", ctx->mb_scan_index[i]);
> 
> i think you should check that mb_scan_index is within the buffer

Done.

> [...]
> 
>> +static int dnxhd_decode_dc(DNXHDContext *ctx)
>> +{
>> +    int len;
>> +    unsigned int p = 0;
>> +    int e = 0;
>> +
>> +    len = get_vlc2(&ctx->gb, ctx->dc_vlc.table, DNXHD_VLC_BITS, 1);
>> +    if (len) {
>> +        p = get_bits(&ctx->gb, len);
>> +        //av_log(ctx->avctx, AV_LOG_DEBUG, "p %d, len %d\n", p, len);
>> +        e = p >= (1<<(len-1)) ? p : p + 1 - (1<<len);
> 
> hmm try get_xbits() maybe it does the same

Right, changed.

>> +    }
>> +    return e;
>> +}
>> +
>> +static int dnxhd_decode_dct_block(DNXHDContext *ctx, DCTELEM *block, int n, int qscale)
>> +{
>> +    int dc, i, j, index, index2;
>> +    int16_t level;
>> +    int component, sign;
>> +    const uint8_t *quant_matrix;
>> +    int diff;
>> +
>> +    if (n == 2 || n == 6) {
>> +        component = 1;
>> +        quant_matrix = ctx->chroma_weigth;
>> +    } else if (n == 3 || n == 7) {
>> +        component = 2;
>> +        quant_matrix = ctx->chroma_weigth;
>> +    } else {
>> +        component = 0;
>> +        quant_matrix = ctx->luma_weigth;
>> +    }
> 
> if(n&2){
>     component= 1 + (n&1);
>     quant_matrix = ctx->chroma_weigth;
> }else{
>     component = 0;
>     quant_matrix = ctx->luma_weigth;
> }

Done.

>> +    diff = dnxhd_decode_dc(ctx);
>> +    dc = ctx->last_dc[component];
>> +    dc += diff;
>> +    ctx->last_dc[component] = dc;
>> +    block[0] = dc;
> 
> ctx->last_dc[component] += dnxhd_decode_dc(ctx);
> block[0]= ctx->last_dc[component];

Done.

>> +    //av_log(ctx->avctx, AV_LOG_DEBUG, "dc %d, diff %d\n", dc, diff);
>> +    for (i = 1; i < 65; i++) {
>> +        index = get_vlc2(&ctx->gb, ctx->ac_vlc.table, DNXHD_VLC_BITS, 2);
>> +        //av_log(ctx->avctx, AV_LOG_DEBUG, "index %d\n", index);
>> +        if (index == 4) { /* EOB */
>> +            //av_log(ctx->avctx, AV_LOG_DEBUG, "EOB\n");
>> +            break;
>> +        }
>> +
>> +        if (i > 63) {
>> +            av_log(ctx->avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", n, i);
>> +            return -1;
>> +        }
>> +
>> +        sign = get_bits1(&ctx->gb) ? -1 : 1;
>> +
>> +        level = ctx->ac_level[index];
>> +        if (ctx->ac_index_flag[index]) {
>> +            level += get_bits(&ctx->gb, ctx->index_bits)<<6;
>> +        }
>> +
>> +        if (ctx->ac_run_flag[index]) {
> 
> index= get_vlc()
> ctx->ac_level[index]
> ctx->ac_index_flag[index]
> ctx->ac_run_flag[index]
> 
> is quite inefficint as it needs 3 reads for the pointers to the tables
> 3 more reads with base+index to get the actual 3 values
> 
> by chaning the tables this could be done for example like
> level= get_vlc()
> level & 0x400
> level & 0x800
> level &=0x3FF
> 
> you avoid all thouse reads

Ok, I got it, I need to change values in ac_level table right ?
I'll do it after svn inclusion is it ok ?

>> +            index2 = get_vlc2(&ctx->gb, ctx->run_vlc.table, DNXHD_VLC_BITS, 2);
>> +            i += ctx->run_level[index2];
> 
> i think run_level is not a good name, i see nothing related to the level
> here ;)

Changed.

>> +        }
>> +
>> +        j = ctx->scantable.permutated[i];
>> +        //av_log(ctx->avctx, AV_LOG_DEBUG, "j %d\n", j);
>> +        if (level) {
> 
> the if(level) is useless it must be always true at least with the current
> tables and i think tables for which its false make no sense ...

Seems so, removed, also check for EOB is done for level == 0 in fact.

>> +            //av_log(ctx->avctx, AV_LOG_DEBUG, "level %d, quant %d\n", level, quant_matrix[i]);
>> +            level = level * qscale * quant_matrix[i] + ((qscale * quant_matrix[i])>>1);
>> +            if (quant_matrix[i] != 32) // FIXME 10bit
>> +                level += 16;
>> +            //av_log(ctx->avctx, AV_LOG_DEBUG, "temp %d ", level);
>> +            level = sign * (level>>5); // FIXME 10bit
> 
> 
> sign= get_bits1(&ctx->gb);
> 
> and
> 
> level= (2*level+1) * qscale * quant_matrix[i];
> if (quant_matrix[i] != 32) // FIXME 10bit
>     level += 32;
> level >>= 6;
> if(sign)
>     level= -level
> 
> you could also try to simply use something like
> level += quant_bias[i];
> maybe its faster, maybe not ...
> 
> or maybe the following is faster ...
> 
> sign= get_sbits(&ctx->gb, 1);
> 
> level= (level^sign)-sign;

That one seems much faster here.

> you could also try to merge the (2*level+1) into the level table though it
> would complicat the ctx->ac_index_flag[index]==1 case a little but it still
> might be faster as i think ctx->ac_index_flag[index]==1 is rare

Humm, I'll try to optimize it further after svn inclusion, is it ok ?

>> +        }
>> +
>> +        //av_log(NULL, AV_LOG_DEBUG, "i %d, j %d, end level %d\n", i, j, level);
>> +        block[j] = level;
>> +    }
> 
> the end checking of the whole loop can also be improved
> 
> instead of
> 
> for (i = 1; i < 65; i++) {
>     if(EOB) break;
>     if(i>63) return -1;
>     if(flag)
>         i+= get_vlc()
>     write data based on i
> }
> 
> id suggest
> 
> for (i = 1; ; i++) {
>     if(EOB) break;
>     if(flag)
>         i+= get_vlc()
>     if(i>63) return -1;
>     write data based on i
> }

Done.

> [...]
>> +static int dnxhd_decode_macroblock(DNXHDContext *ctx, int x, int y)
>> +{
>> +    int dct_linesize_luma   = ctx->picture.linesize[0];
>> +    int dct_linesize_chroma = ctx->picture.linesize[1];
>> +    uint8_t *dest_y, *dest_u, *dest_v;
>> +    int dct_offset;
>> +    int qscale, i, j, k;
>> +
>> +    //ctx->dsp.clear_blocks(ctx->blocks[0]);
>> +
>> +    memset(ctx->blocks, 0, sizeof(ctx->blocks)); //FIXME dsp clear blocks should take number of blocks
> 
> doing the clearing after put_pixels_clamped() may (or may not) be faster due
> to cache effects

Well, no real difference here, however clear_blocks 2 times is really
faster than memset it seems.

> [...]
>> +    for (i = 0; i < 8; i++) {
>> +        dnxhd_decode_dct_block(ctx, ctx->blocks[i], i, qscale);
>> +        ctx->dsp.idct(ctx->blocks[i]);
> 
>> +        for (j = 0; j < 8; j++)
>> +            for (k = 0; k < 8; k++)
>> +                ctx->blocks[i][j + (k<<3)] += 128; // adjust levels
> 
> setting last_dc to 1<<something instead of 0 at the begin of each row should
> have the same effect and be quite a bit faster
> 
> and ctx->dsp.idct_put() could be used

So true, changed.

How is it ?

-- 
Baptiste COUDURIER                              GnuPG Key Id: 0x5C1ABAAA
SMARTJOG S.A.                                    http://www.smartjog.com
Key fingerprint                 8D77134D20CC9220201FC5DB0AC9325C5C1ABAAA
Phone: +33 1 49966312
-------------- next part --------------
A non-text attachment was scrubbed...
Name: dnxhd.patch
Type: text/x-diff
Size: 23344 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20070318/c85d1d4d/attachment.patch>