# [FFmpeg-devel] [PATCH] mmx implementation of vc-1 inverse transformations

Victor Pollex victor.pollex
Thu Jul 17 23:29:59 CEST 2008

Michael Niedermayer schrieb:
> On Mon, Jul 07, 2008 at 09:02:28PM +0200, Victor Pollex wrote:
>
>> Michael Niedermayer schrieb:
>>
>>> On Thu, Jul 03, 2008 at 02:51:18PM +0200, Victor Pollex wrote:
>>>
> [...]
>
>>>> +/*
>>>> +    precodition:
>>>> +        for all values v in r0, r1, r2, r3: -3971 <= v <= 3971
>>>> +
>>>> +    postcondition:
>>>> +        r3 = ((17 * (r0 + r2) + (22 * r1 + 10 * r3) + c) >> 3)
>>>> +        r4 = ((17 * (r0 - r2) - (10 * r1 - 22 * r3) + c) >> 3)
>>>> +        r1 = ((17 * (r0 - r2) + (10 * r1 - 22 * r3) + c) >> 3)
>>>> +        r2 = ((17 * (r0 + r2) - (22 * r1 + 10 * r3) + c) >> 3)
>>>> +        r0 undefined
>>>> +        r5 undefined
>>>> +        r6 undefined
>>>> +        r7 undefined
>>>> +*/
>>>> +#define TRANSFORM_4X4_ROW(r0,r1,r2,r3,r4,r5,r6,r7,c)\
>>>> +    TRANSPOSE4(r0,r1,r2,r3,r4)\
>>>> +    TRANSFORM_4X4_COMMON(r0,r3,r4,r2,r1,r5,r6,r7,c)\
>>>> +    "paddw "#r4", "#r4"\n\t" /* 2 * (r0 + r2) */\
>>>> +    SUMSUB_BA(r3,r4)\
>>>> +    "paddw "#r0", "#r0"\n\t" /* 2 * (r0 - r2) */\
>>>> +    SUMSUB_BA(r2,r0)\
>>>> +    TRANSPOSE4(r3,r0,r2,r4,r1)
>>>>
>>>>
>>> It should be possible to merge one transpose into the scantble (the
>>> mpeg1/2/4
>>> decoder does that too)
>>>
>>>
>>>
>> I'm not sure if this should be done as I found the following lines in
>>    if (!v->res_fasttx)
>>    {
>>        v->s.dsp.vc1_inv_trans_8x8 = ff_simple_idct;
>>    }
>>
>
> The used permutation should of course depend on the used idct
>

ok, changed it. I could also try to do the same with the scantables for
the 8x4 and 4x8 transformations if desired.

>
>
>>> [...]
>>>
>>>
>>>> +/*
>>>> +    postcondition:
>>>> +        r0 = [15:0](2 * r0);
>>>> +        r1 = [15:0](3 * r0);
>>>> +*/
>>>> +#define G3X(r0,r1)\
>>>> +    "movq  "#r0", "#r1"\n\t" /* r0 */\
>>>> +    "paddw "#r0", "#r0"\n\t" /* 2 * r0 */\
>>>> +    "paddw "#r0", "#r1"\n\t" /* 3 * r0 */
>>>>
>>>>
>>> 4 uses, saving 8 lines, macro with docs is 9 lines
>>>
>>>
>> removed docs
>>
>
> I would prefer if you would remove the macro, it would make the code easier
> to understand and more flexible.
>

removed it.

>
> [...]
>
>> Index: libavcodec/i386/vc1dsp_mmx.c
>> ===================================================================
>> --- libavcodec/i386/vc1dsp_mmx.c	(Revision 14101)
>> +++ libavcodec/i386/vc1dsp_mmx.c	(Arbeitskopie)
>> @@ -1,6 +1,7 @@
>>  /*
>>   * VC-1 and WMV3 - DSP functions MMX-optimized
>>   * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
>> + * Copyright (c) 2008 Victor Pollex
>>   *
>>   * Permission is hereby granted, free of charge, to any person
>>   * obtaining a copy of this software and associated documentation
>> @@ -467,7 +468,466 @@
>>  DECLARE_FUNCTION(3, 2)
>>  DECLARE_FUNCTION(3, 3)
>>
>> +#define OPC_SS_AB(opc, src, dst0, dst1)\
>> +    #opc" "#src","#dst0"\n\t"\
>> +    #opc" "#src","#dst1"\n\t"
>> +
>> +#define OPC_SSSS_ABCD(opc, src, dst0, dst1, dst2, dst3)\
>> +    OPC_SS_AB(opc, src, dst0, dst1)\
>> +    OPC_SS_AB(opc, src, dst2, dst3)
>> +
>>
>
>
>> +    "psubw "#src", "#dst1"\n\t"
>>
>
> using
>
> "paddw src, dst0\n\t    psubw src, dst1\n\t"
>
> instead of yet another macro
> will be 4 lines shorter and more readable
>
>

removed it. I used two separate lines and commented them instead of
doing it like you suggested.

>
>
>> +
>> +/*
>> +    precodition:
>> +        for all values v in r0, r1, r2, r3: -2^15 <= 5 * v + c / 2 <= 2^15 - 1
>> +
>> +    postcondition:
>> +        r0 =     r0 - r2
>> +        r1 = 2 * r1 + r3
>> +        r2 =     r0 + r2
>> +        r3 = 2 * r3 - r1
>> +        r4 = (((r0 + r2 + c) >> 1) + (3 * r1 + r3)) >> 2
>> +        r5 = (((r0 - r2 + c) >> 1) - (3 * r3 - r1)) >> 2
>> +        r6 = (((r0 - r2 + c) >> 1) + (3 * r3 - r1)) >> 2
>> +        r7 = (((r0 + r2 + c) >> 1) - (3 * r1 + r3)) >> 2
>> +*/
>> +#define TRANSFORM_4X4_COMMON(r0,r1,r2,r3,r4,r5,r6,r7,c)\
>>
>
>
>> +    SUMSUB_BA(r2,r0)\
>> +    "movq     "#r0", "#r5"\n\t" /* r0 - r2 */\
>> +    "movq     "#r2", "#r7"\n\t" /* r0 + r2 */\
>> +    "movq      "#c", "#r4"\n\t" /* c */\
>> +    "paddw    "#r4", "#r5"\n\t" /* r0 - r2 + c */\
>> +    "paddw    "#r4", "#r7"\n\t" /* r0 + r2 + c */\
>>
>
> r5= c       c
> r0-=r2      r0-r2
> r2+=r2      2r2
> r6=r2       2r2
> r2+=r0      r2+r2
> r5+=r0      r0-r2+c
> r6+=r5      r0+r2+c
>
> one instruction less
>
>

did it a bit differently, also with 7 instructions.

>
>> +    OPC_SS_AB(psraw,\$1,r5,r7)\
>>
>
>
>> +    "movq     "#r1", "#r4"\n\t" /* r1 */\
>> +    "movq     "#r3", "#r6"\n\t" /* r3 */\
>> +    "paddw    "#r1", "#r1"\n\t" /* 2 * r1 */\
>> +    "paddw    "#r6", "#r1"\n\t" /* 2 * r1 + r3 */\
>> +    "paddw    "#r3", "#r3"\n\t" /* 2 * r3 */\
>> +    "psubw    "#r4", "#r3"\n\t" /* 2 * r3 - r1 */\
>> +    "paddw    "#r1", "#r4"\n\t" /* 3 * r1 + r3 */\
>> +    "paddw    "#r3", "#r6"\n\t" /* 3 * r3 - r1 */\
>>
>
> r4=r3       r3
> r3+=r1      r3+r1
> r1+=r1      2r1
> r3+=r1      r3+3r1
> r4-=r1      r3-2r1
> r1+=r1      2r3-4r1
> r1+=r3      3r3-r1
>
> one instruction less
>
>
>

unfortunately the 2r1+r3 and 2r3-r1 are needed later.

> [...]
>
>> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
>> +{
>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>> +    asm volatile(
>> +    TRANSFORM_8X4_ROW(0x00(%0),0x00%1)
>> +    TRANSFORM_8X4_ROW(0x40(%0),0x40%1)
>> +
>> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>> +    STORE4(0x10,0x00%1,%%mm0,%%mm3,%%mm4,%%mm2)
>> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>> +    STORE4(0x10,0x08%1,%%mm0,%%mm3,%%mm4,%%mm2)
>> +
>> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>> +    STORE4(0x10,0x40%1,%%mm0,%%mm3,%%mm4,%%mm2)
>> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>> +    STORE4(0x10,0x48%1,%%mm0,%%mm3,%%mm4,%%mm2)
>> +
>> +    TRANSFORM_4X8_COL(0x00%1,0x00(%0),%2)
>> +    TRANSFORM_4X8_COL(0x08%1,0x08(%0),%2)
>> +    :
>> +    : "r"(block), "m"(temp[0]), "m"(constants[4])
>> +    : "memory"
>> +    );
>> +}
>> +
>> +static void vc1_inv_trans_8x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>> +{
>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>> +    asm volatile(
>> +    TRANSFORM_8X4_ROW(0x00(%1),0x00%2)
>> +
>> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>> +    TRANSFORM_4X4_COL(%%mm0,%%mm3,%%mm4,%%mm2,%%mm1,%%mm5,%%mm6,%%mm7,%5)
>> +    "pxor %%mm7, %%mm7\n\t"
>> +
>> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>> +    TRANSFORM_4X4_COL(%%mm0,%%mm3,%%mm4,%%mm2,%%mm1,%%mm5,%%mm6,%%mm7,%5)
>> +    "pxor %%mm7, %%mm7\n\t"
>> +    : "+r"(dest)
>> +    : "r"(block), "m"(temp[0]), "r"(dest+4), "r"(linesize), "m"(constants[4])
>> +    : "memory"
>> +    );
>> +}
>> +
>> +static void vc1_inv_trans_4x8_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>> +{
>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>> +    asm volatile(
>> +    TRANSFORM_4X4_ROW(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4,%%mm5,%%mm6,%%mm7,%4)
>> +    STORE4(0x10,0x00%2,%%mm3,%%mm4,%%mm1,%%mm2)
>> +    TRANSFORM_4X4_ROW(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4,%%mm5,%%mm6,%%mm7,%4)
>> +    STORE4(0x10,0x40%2,%%mm3,%%mm4,%%mm1,%%mm2)
>> +
>> +    TRANSFORM_4X8_COL(0x00%2,0x00(%1),0x08+%4)
>> +
>> +    "pxor %%mm7, %%mm7\n\t"
>> +    : "+r"(dest)
>> +    : "r"(block), "m"(temp[0]), "r"(linesize), "m"(constants[0])
>> +    : "memory"
>> +    );
>> +}
>>
>
> some of the load&store are avoidable
>
>

The only thing I can think of is in the 4x8 transformation, where the
column transformation is done in two halfs. Instead of writing the lines
to "block" after each half, I could add them to "dest". Is this what you