[FFmpeg-devel] [PATCH] mmx implementation of vc-1 inverse transformations

Michael Niedermayer michaelni
Sun Jun 22 17:22:27 CEST 2008


On Sun, Jun 22, 2008 at 03:21:53PM +0200, Victor Pollex wrote:
> Michael Niedermayer schrieb:
>> On Sat, Jun 21, 2008 at 03:37:44PM +0200, Victor Pollex wrote:
>>   
>>> Hi,
>>> as in subject.
>>>
>>> Victor Pollex
>>>     
>>
>>   
>>> Index: libavcodec/i386/vc1dsp_mmx.c
>>> ===================================================================
>>> --- libavcodec/i386/vc1dsp_mmx.c	(Revision 13854)
>>> +++ libavcodec/i386/vc1dsp_mmx.c	(Arbeitskopie)
>>> @@ -1,6 +1,7 @@
>>>  /*
>>>   * VC-1 and WMV3 - DSP functions MMX-optimized
>>>   * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
>>> + * Copyright (c) 2008 Victor Pollex
>>>   *
>>>   * Permission is hereby granted, free of charge, to any person
>>>   * obtaining a copy of this software and associated documentation
>>> @@ -467,7 +468,609 @@
>>>  DECLARE_FUNCTION(3, 2)
>>>  DECLARE_FUNCTION(3, 3)
>>>  +#define LOAD_4X4(stride,base,in)\
>>> +    "movq 0*"#stride"+"#base#in", %%mm0\n\t"\
>>> +    "movq 1*"#stride"+"#base#in", %%mm1\n\t"\
>>> +    "movq 2*"#stride"+"#base#in", %%mm2\n\t"\
>>> +    "movq 3*"#stride"+"#base#in", %%mm3\n\t"
>>>     
>>
>> duplicate of LOAD4
>>   
> the only LOAD4 I found is in dsputilenc_mmx.c and has a fixed stride of 8,
> but I also need a stride of 16. If I missed something, give me a hint.

LOAD4 loads 4 and your LOAD_4X4 loads 4; they just differ slightly in what is
hardcoded and what is settable through arguments. That does not make them any
less duplicates of each other. Simply add a stride argument to the existing
LOAD4 (in a separate patch!) and use it.
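To illustrate, a stride-aware LOAD4 could look roughly like this (untested
sketch; the argument order of the existing LOAD4 may well differ, so treat
the exact signature as an assumption):

#define LOAD4(stride, offset, in, a, b, c, d)\
    "movq 0*"#stride"+"#offset#in", "#a"\n\t" /* row 0 */\
    "movq 1*"#stride"+"#offset#in", "#b"\n\t" /* row 1 */\
    "movq 2*"#stride"+"#offset#in", "#c"\n\t" /* row 2 */\
    "movq 3*"#stride"+"#offset#in", "#d"\n\t" /* row 3 */

The existing callers in dsputilenc_mmx.c would then pass 8 as stride and your
code 16.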


[...]
>>> +/*
>>> +    precondition:
>>> +        r0 = row0/col0
>>> +        r1 = row1/col1
>>> +        r2 = row2/col2
>>> +        r3 = row3/col3
>>> +
>>> +    postcondition:
>>> +        r0 = col0/row0
>>> +        r1 = col1/row1
>>> +        r2 = col2/row2
>>> +        r3 = col3/row3
>>> +        t0 = undefined
>>> +*/
>>> +#define TRANSPOSE_4X4(r0,r1,r2,r3,t0)\
>>> +    "movq      "#r2", "#t0"\n\t"\
>>> +    "punpcklwd "#r3", "#r2"\n\t"\
>>> +    "punpckhwd "#r3", "#t0"\n\t"\
>>> +    \
>>> +    "movq      "#r0", "#r3"\n\t"\
>>> +    "punpcklwd "#r1", "#r0"\n\t"\
>>> +    "punpckhwd "#r1", "#r3"\n\t"\
>>> +    \
>>> +    "movq      "#r0", "#r1"\n\t"\
>>> +    "punpckldq "#r2", "#r0"\n\t"\
>>> +    "punpckhdq "#r2", "#r1"\n\t"\
>>> +    \
>>> +    "movq      "#r3", "#r2"\n\t"\
>>> +    "punpckldq "#t0", "#r2"\n\t"\
>>> +    "punpckhdq "#t0", "#r3"\n\t"
>>>     
>>
>> duplicate of TRANSPOSE4
>>   
> basically yes, but here the output registers are the same as, and in the
> same order as, the input registers, whereas with TRANSPOSE4 the input
> registers are a, b, c, d and the output registers are a, d, t, c according
> to the comments. If I feel like it I could try to rewrite part of the code
> to use TRANSPOSE4, but I would rather try to rewrite it so that I don't
> need to transpose in the first place.

It is entirely your decision how to solve it, as long as the solution is
optimal speed-, size- and readability-wise. But I will not approve a
patch that adds duplicated code (like TRANSPOSE4 / TRANSPOSE_4X4) when it
is avoidable.
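As an example (sketch only; it relies on the a, b, c, d -> a, d, t, c output
order you quote above and assumes the store macro can be given explicit
register arguments, which may not match your patch as posted):

    TRANSPOSE4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
    /* transposed rows are now in mm1, mm2, mm4, mm3 (a, d, t, c) */
    /* store from those four registers instead of hardwiring mm0..mm3 */

that is, the permutation is handled at the call site rather than by a second
transpose macro.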


[...]
>>> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
>>> +{
>>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>>> +    asm volatile(
>>> +    TRANSFORM_8X4_ROW(0x00, (%0), %1)
>>> +    TRANSFORM_8X4_ROW(0x40, (%0), %1)
>>> +
>>> +
>>> +    LOAD_4X4(0x10, 0x00, %1)
>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>> +    STORE_4X4(0x10, 0x00, %1)
>>> +    LOAD_4X4(0x10, 0x40, %1)
>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>> +    STORE_4X4(0x10, 0x40, %1)
>>> +    TRANSFORM_4X8_COL(0x00, %1, (%0))
>>> +
>>> +    LOAD_4X4(0x10, 0x08, %1)
>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>> +    STORE_4X4(0x10, 0x08, %1)
>>> +    LOAD_4X4(0x10, 0x48, %1)
>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>> +    STORE_4X4(0x10, 0x48, %1)
>>> +    TRANSFORM_4X8_COL(0x08, %1, (%0))
>>> +    : "+r"(block), "+m"(temp)
>>> +    :
>>> +    : "memory"
>>> +    );
>>> +}
>>> +
>>> +static void vc1_inv_trans_8x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>>> +{
>>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>>> +    asm volatile(
>>> +    TRANSFORM_8X4_ROW(0x00, (%0), %1)
>>> +
>>> +    LOAD_4X4(0x10, 0x00, %1)
>>> +    TRANSFORM_4X4_COL
>>> +    STORE_4X4(0x10, 0x00, (%0))
>>> +    LOAD_4X4(0x10, 0x08, %1)
>>> +    TRANSFORM_4X4_COL
>>> +    STORE_4X4(0x10, 0x08, (%0))
>>> +
>>> +    "pxor        %%mm7,     %%mm7\n\t"
>>> +    LOAD_4X4(0x08, 0x00, (%0))
>>> +    LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>>> +    "add %3, %2\n\t"
>>> +    LOAD_4X4(0x08, 0x20, (%0))
>>> +    LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>>> +    : "+r"(block), "+m"(temp), "+r"(dest)
>>> +    : "r"(linesize)
>>> +    : "memory"
>>> +    );
>>> +}
>>> +
>>> +static void vc1_inv_trans_4x8_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>>> +{
>>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>>> +    asm volatile(
>>> +    LOAD_4X4(0x10, 0x00, (%0))
>>> +    TRANSFORM_4X4_ROW
>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>> +    STORE_4X4(0x10, 0x00, %1)
>>> +    LOAD_4X4(0x10, 0x40, (%0))
>>> +    TRANSFORM_4X4_ROW
>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>> +    STORE_4X4(0x10, 0x40, %1)
>>> +
>>> +    TRANSFORM_4X8_COL(0x00, %1, (%0))
>>> +
>>> +    "pxor     %%mm7,   %%mm7\n\t"
>>> +    LOAD_4X4(0x10, 0x00, (%0))
>>> +    LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>>> +    "add %3, %2\n\t"
>>> +    LOAD_4X4(0x10, 0x40, (%0))
>>> +    LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>>> +    : "+r"(block), "+m"(temp), "+r"(dest)
>>> +    : "r"(linesize)
>>> +    : "memory"
>>> +    );
>>> +}
>>> +
>>> +static void vc1_inv_trans_4x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>>> +{
>>> +    asm volatile(
>>> +    LOAD_4X4(0x10, 0x00, (%1))
>>> +    TRANSFORM_4X4_ROW
>>> +    TRANSFORM_4X4_COL
>>> +    "pxor     %%mm7,   %%mm7\n\t"
>>> +    LOAD_ADD_CLAMP_STORE_4X4(%0, %2)
>>> +    : "+r"(dest)
>>> +    : "r"(block), "r"(linesize)
>>> +    : "memory"
>>> +    );
>>> +}
>>>     
>>
>> I do not think that brute-force duplicating and unrolling of all variants
>> is optimal. Also, benchmarks are needed for C vs. your MMX vs. MMX
>> code with no duplicated transforms.
>>
>> [...]
>>   
> what do you mean by duplicated transforms? Do you mean that, for example,
> the 4x4 row transformation should be a method instead of a macro?

Yes (non-inlined methods/functions, or just plain asm that uses loops/calls,
whatever); I meant that this should be tried: code cache is a limited
resource, and your code after macro expansion is likely large.
I do not know which way would be faster, of course, but it should be tested.
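As a rough sketch of what I mean (the helper name and the way the operands
are passed here are assumptions, not a drop-in replacement for your patch):

static void vc1_inv_trans_rows_mmx(const DCTELEM *src, int16_t *dst)
{
    /* one 8x4 row pass, kept out of line (possibly with a noinline
       attribute) so the macro body is emitted only once */
    asm volatile(
        TRANSFORM_8X4_ROW(0x00, (%0), (%1))
        :: "r"(src), "r"(dst)
        : "memory"
    );
}

static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
{
    DECLARE_ALIGNED_16(int16_t, temp[64]);
    int i;
    for (i = 0; i < 2; i++)                 /* two 8x4 row halves */
        vc1_inv_trans_rows_mmx(block + 32*i, temp + 32*i);
    /* ... the column passes would be restructured the same way ... */
}

Whether this ends up faster than the fully unrolled version is exactly what
the benchmarks have to show.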


>
> As for benchmarks, I used START_TIMER and STOP_TIMER from libavutil with
> mingw 4.3.0-tdm3.
> I don't know if it is an appropriate method, but I did it somewhat like this:
>
> for(i = 0; i < (1 << 19); ++i) {
>    memcpy(block1, block, 64 * sizeof(short));
>    START_TIMER
>    vc1_inv_trans_4x4_c(dest, 8, block1);
>    STOP_TIMER("vc1_inv_trans_4x4_c")
> }

The transforms should be tested in the real code in vc1.c;
just put START/STOP_TIMER around the call to the transform.
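That is, roughly (sketch; the exact call site and the name of the block
variable in vc1.c may differ):

{
    START_TIMER
    s->dsp.vc1_inv_trans_8x8(block);
    STOP_TIMER("vc1_inv_trans_8x8")
}

That way the timing covers the block contents the decoder actually produces
instead of a synthetic input.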


[...]

> 7377 dezicycles in vc1_inv_trans_8x8_c, 65506 runs, 30 skips
> 7377 dezicycles in vc1_inv_trans_8x8_c, 131009 runs, 63 skips
> 7377 dezicycles in vc1_inv_trans_8x8_c, 262014 runs, 130 skips
> 7377 dezicycles in vc1_inv_trans_8x8_c, 524041 runs, 247 skips
>
> 7360 dezicycles in vc1_inv_trans_8x8_mmx, 65503 runs, 33 skips
> 7376 dezicycles in vc1_inv_trans_8x8_mmx, 131006 runs, 66 skips
> 7382 dezicycles in vc1_inv_trans_8x8_mmx, 262016 runs, 128 skips
> 7387 dezicycles in vc1_inv_trans_8x8_mmx, 524047 runs, 241 skips

This one does not look faster

[...]

> +#define SRA2(r0,r1,c0)\
> +    "psraw $"#c0", "#r0"\n\t" /* r0 >> c0 */\
> +    "psraw $"#c0", "#r1"\n\t" /* r1 >> c0 */
> +
> +/*
> +    postcondition:
> +        r0 = r0 >> c0;
> +        r1 = r1 >> c0;
> +        r2 = r2 >> c0;
> +        r3 = r3 >> c0;
> +*/
> +#define SRA4(r0,r1,r2,r3,c0)\
> +    SRA2(r0,r1,c0)\
> +    SRA2(r2,r3,c0)

I think a generic macro like:

#define OPC_SS_AB(opc, dst0, dst1, src) \
    opc" "src", "dst0"\n\t" \
    opc" "src", "dst1"\n\t"

#define OPC_SSSS_ABCD(opc, dst0, dst1, dst2, dst3, src) \
    OPC_SS_AB(opc, dst0, dst1, src) \
    OPC_SS_AB(opc, dst2, dst3, src)

would be simpler
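SRA4 above then reduces to something like (sketch, assuming the generic
macros take already stringified operands as written above):

#define SRA4(r0,r1,r2,r3,c0)\
    OPC_SSSS_ABCD("psraw", #r0, #r1, #r2, #r3, "$"#c0)

or the call sites use OPC_SSSS_ABCD directly and SRA4 goes away entirely.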


[...]
> +DECLARE_ALIGNED_16(static const int16_t, constants[]) = {
> +  X4( 4),
> +  X4(64)
> +};

Also please reduce the amount of macros in your code; it hurts readability a
lot. That means removing at least every macro that is bigger than the space
it saves.
4,4,4,4
64,64,64,64

vs.

X4( 4),
X4(64)
#define X4(x) x, x, x, x

being an example

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Frequently ignored answer #1: FFmpeg bugs should be sent to our bugtracker. User
questions about the command line tools should be sent to the ffmpeg-user ML.
And questions about how to use libav* should be sent to the libav-user ML.