[FFmpeg-devel] [PATCH] Electronic Arts TGQ decoder

Sat Sep 27 12:51:29 CEST 2008

On Sat, Sep 27, 2008 at 08:07:05PM +1000, Peter Ross wrote:
> On Sat, Sep 27, 2008 at 10:12:34AM +1000, Peter Ross wrote:
> > Patches enclosed.
> > 
> > Info: http://wiki.multimedia.cx/index.php?title=Electronic_Arts_TGQ
> > Samples: http://samples.mplayerhq.hu/game-formats/ea-tgq-uv/
> 
> Thanks for the prompt feedback. Round two enclosed.
> 
> -- Peter
> (A907 E02F A6E5 0CD2 34CD 20D2 6760 79C5 AC40 DD6B)

> Index: libavcodec/mpegvideo_enc.c
> ===================================================================
> --- libavcodec/mpegvideo_enc.c	(revision 15434)
> +++ libavcodec/mpegvideo_enc.c	(working copy)
> @@ -61,7 +61,7 @@
>      4520 ,  6270,  5906,  5315,  4520,  3552,  2446,  1247
>  };
>  
> -static const uint16_t inv_aanscales[64] = {
> +const uint16_t ff_inv_aanscales[64] = {
>    4096,  2953,  3135,  3483,  4096,  5213,  7568, 14846,
>    2953,  2129,  2260,  2511,  2953,  3759,  5457, 10703,
>    3135,  2260,  2399,  2666,  3135,  3990,  5793, 11363,
> @@ -3089,7 +3089,7 @@
>              || s->dsp.fdct == ff_faandct
>  #endif
>             )
> -            dct_coeff= (dct_coeff*inv_aanscales[ scantable[i] ]) >> 12;
> +            dct_coeff= (dct_coeff*ff_inv_aanscales[ scantable[i] ]) >> 12;
>          zero_distortion= dct_coeff*dct_coeff;
>  
>          for(level_index=0; level_index < coeff_count[i]; level_index++){

ok

[...]

> Index: libavcodec/dsputil.c
> ===================================================================
> --- libavcodec/dsputil.c	(revision 15434)
> +++ libavcodec/dsputil.c	(working copy)
> @@ -4137,6 +4137,72 @@
>      dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
>  }
>  
> +/* Electronic Arts TGQ/TQI/MAD IDCT algorithm */
> +#define A4 1.3065630f
> +#define A2 0.5411961f
> +#define A5 0.3826834f
> +#define SQRT2 1.41421356237309514547
> +#define IDCT_TRANSFORM(dest,d0,d1,d2,d3,d4,d5,d6,d7,munge,src) {\
> +    const int src7add1 = (src)[7] + (src)[1]; \
> +    const int src3add5 = (src)[3] + (src)[5]; \

> +    const int value1 = (src7add1 - src3add5)/SQRT2; \
> +    const int value2 = ((src)[2] - (src)[6])/SQRT2; \

These could be done by *(1/SQRT2); multiplies are faster than divides.
Also, they might be faster with lrintf().

> +    const int src1sub7 = (src)[1] - (src)[7]; \
> +    const int src5sub3 = (src)[5] - (src)[3]; \

> +    const int result0 = floor( src5sub3*A2 + (src5sub3+src1sub7)*A5 ); \
> +    const int result2 = floor( src1sub7*A4 - (src5sub3+src1sub7)*A5 ); \

I would s/floor/lrintf/ as that should be faster on x86.

> +    const int b0 = result2 + src3add5 + src7add1; \
> +    const int b1 = result2 + value1; \
> +    const int b2 = result0 + value1; \
> +    const int b3 = result0; \
> +    const int src0add4 = (src)[0] + (src)[4]; \
> +    const int src0sub4 = (src)[0] - (src)[4]; \
> +    const int src26value2 = (src)[2] + (src)[6] + value2; \
> +    const int a0 = src0add4 + src26value2; \
> +    const int a1 = src0sub4 + value2; \
> +    const int a2 = src0sub4 - value2; \
> +    const int a3 = src0add4 - src26value2; \
> +    (dest)[d0] = munge(a0 + b0); \
> +    (dest)[d1] = munge(a1 + b1); \
> +    (dest)[d2] = munge(a2 + b2); \
> +    (dest)[d3] = munge(a3 + b3); \
> +    (dest)[d4] = munge(a3 - b3); \
> +    (dest)[d5] = munge(a2 - b2); \
> +    (dest)[d6] = munge(a1 - b1); \
> +    (dest)[d7] = munge(a0 - b0); \

Adding 8 to src[0][0] at the start should improve the correctness
of the >>4 that munge applies in the second pass.

> +}
> +/* end IDCT_TRANSFORM macro */
> +
> +#define MUNGE_NONE(x) (x)
> +#define IDCT_ROW(dest,src)  IDCT_TRANSFORM(dest,0,8,16,24,32,40,48,56,MUNGE_NONE,src)
> +
> +#define MUNGE_8BIT(x) av_clip_uint8((x)>>4)
> +#define IDCT_COL(dest,src) IDCT_TRANSFORM(dest,0,1, 2, 3, 4, 5, 6, 7,MUNGE_8BIT,src)

It seems this does a transpose during IDCT_ROW; if that wasn't done,
ff_zigzag_direct_transposed would be unneeded and the normal zigzag
should be usable.

[...]
> Index: libavcodec/dsputil.h
> ===================================================================
> --- libavcodec/dsputil.h	(revision 15434)
> +++ libavcodec/dsputil.h	(working copy)
> @@ -86,6 +86,9 @@
>  void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
>  void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
>  
> +/* EA DSP function */
> +void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

it seems this is unneeded and the function could be static

[...]
> +static void tgq_decode_block(TgqContext *s, DCTELEM block[64], GetBitContext *gb){
> +    int i,j,value;
> +    block[0] = get_sbits(gb,8) * s->qtable[0];
> +    for(i=1; i<64; ) {
> +        switch(show_bits(gb,3)) {
> +        case 4:
> +            block[ff_zigzag_direct_transposed[i++]] = 0;
> +        case 0:
> +            block[ff_zigzag_direct_transposed[i++]] = 0;
> +            skip_bits(gb,3);
> +            break;
> +        case 5:
> +            block[ff_zigzag_direct_transposed[i++]] = 0;
> +        case 1:
> +            skip_bits(gb,3);
> +            value = 2*get_bits(gb,5);
> +            for(j=0; j<value; j++)
> +                block[ff_zigzag_direct_transposed[i++]] = 0;
> +            break;
> +        case 6:
> +            skip_bits(gb,3);
> +            block[ff_zigzag_direct_transposed[i]] = -(s->qtable[ff_zigzag_direct_transposed[i]]);
> +            i++;
> +            break;
> +        case 2:
> +            skip_bits(gb,3);
> +            block[ff_zigzag_direct_transposed[i]] = s->qtable[ff_zigzag_direct_transposed[i]];
> +            i++;
> +            break;
> +        case 7: // 111b
> +        case 3: // 011b
> +            skip_bits(gb,2);
> +            if (show_bits(gb,6)==0x3F) {
> +                skip_bits(gb, 6);
> +                block[ff_zigzag_direct_transposed[i]] = get_sbits(gb,8) * s->qtable[ff_zigzag_direct_transposed[i]];
> +            }else{
> +                block[ff_zigzag_direct_transposed[i]] = get_sbits(gb,6) * s->qtable[ff_zigzag_direct_transposed[i]];

If you want the codec to work with other idcts, then the permutated scantable
has to be used instead of the hardcoded ff_zigzag_direct_transposed

> +            }
> +            i++;
> +            break;
> +        }
> +    }
> +    block[0] += 128<<4;
> +}
> +
> +static void tgq_idct_put_mb(TgqContext *s, DCTELEM (*block)[64], int mb_x, int mb_y){
> +    int linesize= s->frame.linesize[0];
> +    uint8_t *dest_y  = s->frame.data[0] + (mb_y * 16* linesize            ) + mb_x * 16;
> +    uint8_t *dest_cb = s->frame.data[1] + (mb_y * 8 * s->frame.linesize[1]) + mb_x * 8;
> +    uint8_t *dest_cr = s->frame.data[2] + (mb_y * 8 * s->frame.linesize[2]) + mb_x * 8;
> +
> +    s->dsp.idct_put(dest_y                 , linesize, block[0]);
> +    s->dsp.idct_put(dest_y              + 8, linesize, block[1]);
> +    s->dsp.idct_put(dest_y + 8*linesize    , linesize, block[2]);
> +    s->dsp.idct_put(dest_y + 8*linesize + 8, linesize, block[3]);
> +    if(!(s->avctx->flags&CODEC_FLAG_GRAY)){
> +         s->dsp.idct_put(dest_cb, s->frame.linesize[1], block[4]);
> +         s->dsp.idct_put(dest_cr, s->frame.linesize[2], block[5]);
> +    }
> +}
> +
> +static inline void tgq_dconly(TgqContext *s, unsigned char *dst, int dst_stride, int dc){
> +    int j;
> +    for(j=0;j<8;j++)
> +        memset(dst+j*dst_stride, dc, 8);
> +}
> +
> +static inline void tgq_dconly_block(TgqContext *s, int mb_x, int mb_y, int i, int dc_level){
> +    int linesize= s->frame.linesize[0];
> +    uint8_t *dest_y  = s->frame.data[0] + (mb_y * 16* linesize            ) + mb_x * 16;
> +    uint8_t *dest_cb = s->frame.data[1] + (mb_y * 8 * s->frame.linesize[1]) + mb_x * 8;
> +    uint8_t *dest_cr = s->frame.data[2] + (mb_y * 8 * s->frame.linesize[2]) + mb_x * 8;
> +    int dc = av_clip_uint8(128 + ((dc_level*s->qtable[0])>>4));
> +
> +    switch(i) {
> +    case 0: tgq_dconly(s,dest_y                 , linesize, dc); break;
> +    case 1: tgq_dconly(s,dest_y              + 8, linesize, dc); break;
> +    case 2: tgq_dconly(s,dest_y + 8*linesize    , linesize, dc); break;
> +    case 3: tgq_dconly(s,dest_y + 8*linesize + 8, linesize, dc); break;
> +    case 4: if(!(s->avctx->flags&CODEC_FLAG_GRAY))
> +                tgq_dconly(s,dest_cb, s->frame.linesize[1], dc);
> +            break;
> +    case 5: if(!(s->avctx->flags&CODEC_FLAG_GRAY))
> +                tgq_dconly(s,dest_cr, s->frame.linesize[2], dc);
> +            break;
> +    }
> +}
> +
> +static void tgq_decode_mb(TgqContext *s, int mb_y, int mb_x, const int8_t **bs, const int8_t *buf_end){
> +    int mode;
> +    int i; // block counter
> +    int8_t dc[6];
> +    DCTELEM block[6][64];
> +
> +    mode = bytestream_get_byte((const uint8_t**)bs);
> +    if (mode>buf_end-*bs) {
> +        av_log(s->avctx, AV_LOG_ERROR, "truncated macroblock\n");
> +        return;
> +    }
> +

> +    if (mode==3||mode==6||mode==12) {
> +        if (mode==3) {
> +            memset(dc, (*bs)[0], 4);
> +            dc[4] = (*bs)[1];
> +            dc[5] = (*bs)[2];
> +        }else if (mode==6) {
> +            memcpy(dc, *bs, 6);
> +        }else if (mode==12) {
> +            for(i=0; i<6; i++)
> +                dc[i] = (*bs)[i*2];
> +        }
> +        for(i=0; i<6;i++)
> +            tgq_dconly_block(s, mb_x, mb_y, i, dc[i]);
> +    }else if (mode>12) {
> +        GetBitContext gb;
> +        init_get_bits(&gb, *bs, mode*8);
> +        for(i=0; i<6; i++)
> +            tgq_decode_block(s, block[i], &gb);
> +        tgq_idct_put_mb(s, block, mb_x, mb_y);
> +    }else {
> +        av_log(s->avctx, AV_LOG_ERROR, "unsupported mb mode %i\n", mode);
> +    }

if(mode>12) {
    ...
    tgq_idct_put_mb(s, block, mb_x, mb_y);
}else{
    if (mode==3) {
        memset(dc, (*bs)[0], 4);
        dc[4] = (*bs)[1];
        dc[5] = (*bs)[2];
    }else if (mode==6) {
        memcpy(dc, *bs, 6);
    }else if (mode==12) {
        for(i=0; i<6; i++)
            dc[i] = (*bs)[i*2];
    }else{
        av_log(s->avctx, AV_LOG_ERROR, "unsupported mb mode %i\n", mode);
    }
    tgq_idct_put_mb_dconly()
}

tgq_idct_put_mb_dconly(){
    tgq_dconly(s,dest_y                 , linesize, dc[0]);
    tgq_dconly(s,dest_y              + 8, linesize, dc[1]);
    tgq_dconly(s,dest_y + 8*linesize    , linesize, dc[2]);
    tgq_dconly(s,dest_y + 8*linesize + 8, linesize, dc[3]);
    if(!(s->avctx->flags&CODEC_FLAG_GRAY)){
        tgq_dconly(s,dest_cb, s->frame.linesize[1], dc[4]);
        tgq_dconly(s,dest_cr, s->frame.linesize[2], dc[5]);
    }
}

> +    *bs += mode;
> +}
> +
> +static void tgq_calculate_qtable(TgqContext *s, int quant){
> +    int i,j;
> +    const int a = (14*(100-quant))/100 + 1;
> +    const int b = (11*(100-quant))/100 + 4;
> +    for(j=0;j<8;j++)
> +    for(i=0;i<8;i++)
> +        s->qtable[j*8+i] =((a*(j+i)/(7+7) + b)*ff_inv_aanscales[j*8+i])>>(14-4);
> +}

similarly, if you want the codec to work with the other idcts
ff_inv_aanscales would have to be conditional, and only used for the AAN
idct (=only the new EA one)

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

No human being will ever know the Truth, for even if they happen to say it
by chance, they would not even know they had done so. -- Xenophanes
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080927/5da532e6/attachment.pgp>