[FFmpeg-devel] [PATCH] Remove mmx.h from dsputil_mmx.c

Wed Mar 11 20:44:20 CET 2009

On Wed, Mar 11, 2009 at 03:19:03AM -0400, Alex Converse wrote:
> On Tue, Mar 10, 2009 at 10:04 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
> > On Tue, Mar 10, 2009 at 09:19:56PM -0400, Alex Converse wrote:
> >> This is my first mmx patch so please bear with me.
> >
> >> diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
> >> index 2f47f5f..3771c5f 100644
> >> --- a/libavcodec/x86/dsputil_mmx.c
> >> +++ b/libavcodec/x86/dsputil_mmx.c
> >> @@ -28,7 +28,6 @@
> >> ?#include "libavcodec/mpegvideo.h"
> >> ?#include "libavcodec/simple_idct.h"
> >> ?#include "dsputil_mmx.h"
> >> -#include "mmx.h"
> >> ?#include "vp3dsp_mmx.h"
> >> ?#include "vp3dsp_sse2.h"
> >> ?#include "vp6dsp_mmx.h"
> >> @@ -278,16 +277,33 @@ static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
> >>
> >> ?void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
> >> ?{
> >> + ? ?const DCTELEM *p = block;
> >> + ? ?uint8_t *pix = pixels;
> >> ? ? ?int i;
> >>
> >> - ? ?movq_m2r(*vector128, mm1);
> >> - ? ?for (i = 0; i < 8; i++) {
> >> - ? ? ? ?movq_m2r(*(block), mm0);
> >> - ? ? ? ?packsswb_m2r(*(block + 4), mm0);
> >> - ? ? ? ?block += 8;
> >> - ? ? ? ?paddb_r2r(mm1, mm0);
> >> - ? ? ? ?movq_r2m(mm0, *pixels);
> >> - ? ? ? ?pixels += line_size;
> >> + ? ?__asm__ volatile ("movq %0, %%mm0" : : "m" (*vector128));
> >> + ? ?for (i = 0; i < 8; i+=4) {
> >> + ? ? ? ?__asm__ volatile (
> >> + ? ? ? ? ? ? ? ?"movq ? (%1), %%mm1 ? ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"movq 16(%1), %%mm2 ? ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"movq 32(%1), %%mm3 ? ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"movq 48(%1), %%mm4 ? ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"packsswb ?8(%1), %%mm1 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"packsswb 24(%1), %%mm2 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"packsswb 40(%1), %%mm3 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"packsswb 56(%1), %%mm4 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"paddb %%mm0, %%mm1 ? ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"paddb %%mm0, %%mm2 ? ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"paddb %%mm0, %%mm3 ? ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"paddb %%mm0, %%mm4 ? ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"movq %%mm1, (%0) ? ? ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"movq %%mm2, (%0, %2) ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?"movq %%mm3, (%0, %2, 2) ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ?"movq %%mm4, (%0, %3) ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ?::"r" (pix), "r" (p), "r"((x86_reg)line_size), "r"((x86_reg)3*line_size)
> >> + ? ? ? ? ? ? ? ?:"memory");
> >> + ? ? ? ?p += 32;
> >> + ? ? ? ?pix += 4*line_size;
> >
> > 1. the for() should not be there, it should be in the asm()
> 
> The for loop has been eliminated
> 
> > 2. benchmark is always needed when messing with optimized code.
> > ? ?START/STOP_TIMER around the point that calls the changed
> > ? ?function is likely the easiest way to do it
> 
> The benchmarks seem inconclusive. The unpatched version seems to start
> out faster and end slower. I don't know what of make of this.

[...]

> diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
> index 2f47f5f..b9679ad 100644
> --- a/libavcodec/x86/dsputil_mmx.c
> +++ b/libavcodec/x86/dsputil_mmx.c
> @@ -28,7 +28,6 @@
>  #include "libavcodec/mpegvideo.h"
>  #include "libavcodec/simple_idct.h"
>  #include "dsputil_mmx.h"
> -#include "mmx.h"
>  #include "vp3dsp_mmx.h"
>  #include "vp3dsp_sse2.h"
>  #include "vp6dsp_mmx.h"
> @@ -276,19 +275,39 @@ void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size
>  static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
>    { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
>  
> +#define put_signed_pixels_clamped_mmx_half \
> +            "movq   (%1), %%mm1                 \n\t"\
> +            "movq 16(%1), %%mm2                 \n\t"\
> +            "movq 32(%1), %%mm3                 \n\t"\
> +            "movq 48(%1), %%mm4                 \n\t"\
> +            "packsswb  8(%1), %%mm1             \n\t"\
> +            "packsswb 24(%1), %%mm2             \n\t"\
> +            "packsswb 40(%1), %%mm3             \n\t"\
> +            "packsswb 56(%1), %%mm4             \n\t"\
> +            "paddb %%mm0, %%mm1                 \n\t"\
> +            "paddb %%mm0, %%mm2                 \n\t"\
> +            "paddb %%mm0, %%mm3                 \n\t"\
> +            "paddb %%mm0, %%mm4                 \n\t"\
> +            "movq %%mm1, (%0)                   \n\t"\
> +            "movq %%mm2, (%0, %2)               \n\t"\
> +            "movq %%mm3, (%0, %2, 2)            \n\t"\
> +            "movq %%mm4, (%0, %3)               \n\t"
> +
>  void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
>  {
> -    int i;
> -
> -    movq_m2r(*vector128, mm1);
> -    for (i = 0; i < 8; i++) {
> -        movq_m2r(*(block), mm0);
> -        packsswb_m2r(*(block + 4), mm0);
> -        block += 8;
> -        paddb_r2r(mm1, mm0);
> -        movq_r2m(mm0, *pixels);
> -        pixels += line_size;
> -    }
> +    const DCTELEM *p = block;
> +    uint8_t *pix = pixels;
> +    x86_reg line_skip = line_size;
> +
> +    __asm__ volatile (
> +            "movq (%3), %%mm0                   \n\t"
> +            "lea (%2, %2, 2), %3                \n\t"
> +            put_signed_pixels_clamped_mmx_half
> +            "add $64, %1                        \n\t"
> +            "lea (%0, %2, 4), %0                \n\t"
> +            put_signed_pixels_clamped_mmx_half
> +            ::"r" (pix), "r" (p), "r"(line_skip), "r" (vector128)
> +            :"memory");

you dont need the add $64, this can be done by changing the offsets
also %0,%1 and %3 are marked as input, while you change them

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Its not that you shouldnt use gotos but rather that you should write
readable code and code with gotos often but not always is less readable
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090311/4fe48797/attachment.pgp>