[FFmpeg-devel] [PATCH] Remove mmx.h from dsputil_mmx.c

Alex Converse alex.converse
Sat Mar 14 01:38:09 CET 2009


On Wed, Mar 11, 2009 at 3:44 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
> On Wed, Mar 11, 2009 at 03:19:03AM -0400, Alex Converse wrote:
>> On Tue, Mar 10, 2009 at 10:04 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
>> > On Tue, Mar 10, 2009 at 09:19:56PM -0400, Alex Converse wrote:
>> >> This is my first mmx patch so please bear with me.
>> >
>> >> diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
>> >> index 2f47f5f..3771c5f 100644
>> >> --- a/libavcodec/x86/dsputil_mmx.c
>> >> +++ b/libavcodec/x86/dsputil_mmx.c
>> >> @@ -28,7 +28,6 @@
>> >> ?#include "libavcodec/mpegvideo.h"
>> >> ?#include "libavcodec/simple_idct.h"
>> >> ?#include "dsputil_mmx.h"
>> >> -#include "mmx.h"
>> >> ?#include "vp3dsp_mmx.h"
>> >> ?#include "vp3dsp_sse2.h"
>> >> ?#include "vp6dsp_mmx.h"
>> >> @@ -278,16 +277,33 @@ static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
>> >>
>> >> ?void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
>> >> ?{
>> >> + ? ?const DCTELEM *p = block;
>> >> + ? ?uint8_t *pix = pixels;
>> >> ? ? ?int i;
>> >>
>> >> - ? ?movq_m2r(*vector128, mm1);
>> >> - ? ?for (i = 0; i < 8; i++) {
>> >> - ? ? ? ?movq_m2r(*(block), mm0);
>> >> - ? ? ? ?packsswb_m2r(*(block + 4), mm0);
>> >> - ? ? ? ?block += 8;
>> >> - ? ? ? ?paddb_r2r(mm1, mm0);
>> >> - ? ? ? ?movq_r2m(mm0, *pixels);
>> >> - ? ? ? ?pixels += line_size;
>> >> + ? ?__asm__ volatile ("movq %0, %%mm0" : : "m" (*vector128));
>> >> + ? ?for (i = 0; i < 8; i+=4) {
>> >> + ? ? ? ?__asm__ volatile (
>> >> + ? ? ? ? ? ? ? ?"movq ? (%1), %%mm1 ? ? ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"movq 16(%1), %%mm2 ? ? ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"movq 32(%1), %%mm3 ? ? ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"movq 48(%1), %%mm4 ? ? ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"packsswb ?8(%1), %%mm1 ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"packsswb 24(%1), %%mm2 ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"packsswb 40(%1), %%mm3 ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"packsswb 56(%1), %%mm4 ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"paddb %%mm0, %%mm1 ? ? ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"paddb %%mm0, %%mm2 ? ? ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"paddb %%mm0, %%mm3 ? ? ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"paddb %%mm0, %%mm4 ? ? ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"movq %%mm1, (%0) ? ? ? ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"movq %%mm2, (%0, %2) ? ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?"movq %%mm3, (%0, %2, 2) ? ? ? ?\n\t"
>> >> + ? ? ? ? ? ? ? ?"movq %%mm4, (%0, %3) ? ? ? ? ? \n\t"
>> >> + ? ? ? ? ? ? ? ?::"r" (pix), "r" (p), "r"((x86_reg)line_size), "r"((x86_reg)3*line_size)
>> >> + ? ? ? ? ? ? ? ?:"memory");
>> >> + ? ? ? ?p += 32;
>> >> + ? ? ? ?pix += 4*line_size;
>> >
>> > 1. the for() should not be there, it should be in the asm()
>>
>> The for loop has been eliminated
>>
>> > 2. benchmark is always needed when messing with optimized code.
>> > ? ?START/STOP_TIMER around the point that calls the changed
>> > ? ?function is likely the easiest way to do it
>>
>> The benchmarks seem inconclusive. The unpatched version seems to start
>> out faster and end slower. I don't know what of make of this.
>
> [...]
>
>> diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
>> index 2f47f5f..b9679ad 100644
>> --- a/libavcodec/x86/dsputil_mmx.c
>> +++ b/libavcodec/x86/dsputil_mmx.c
>> @@ -28,7 +28,6 @@
>> ?#include "libavcodec/mpegvideo.h"
>> ?#include "libavcodec/simple_idct.h"
>> ?#include "dsputil_mmx.h"
>> -#include "mmx.h"
>> ?#include "vp3dsp_mmx.h"
>> ?#include "vp3dsp_sse2.h"
>> ?#include "vp6dsp_mmx.h"
>> @@ -276,19 +275,39 @@ void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size
>> ?static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
>> ? ?{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
>>
>> +#define put_signed_pixels_clamped_mmx_half \
>> + ? ? ? ? ? ?"movq ? (%1), %%mm1 ? ? ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"movq 16(%1), %%mm2 ? ? ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"movq 32(%1), %%mm3 ? ? ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"movq 48(%1), %%mm4 ? ? ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"packsswb ?8(%1), %%mm1 ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"packsswb 24(%1), %%mm2 ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"packsswb 40(%1), %%mm3 ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"packsswb 56(%1), %%mm4 ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"paddb %%mm0, %%mm1 ? ? ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"paddb %%mm0, %%mm2 ? ? ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"paddb %%mm0, %%mm3 ? ? ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"paddb %%mm0, %%mm4 ? ? ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"movq %%mm1, (%0) ? ? ? ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"movq %%mm2, (%0, %2) ? ? ? ? ? ? ? \n\t"\
>> + ? ? ? ? ? ?"movq %%mm3, (%0, %2, 2) ? ? ? ? ? ?\n\t"\
>> + ? ? ? ? ? ?"movq %%mm4, (%0, %3) ? ? ? ? ? ? ? \n\t"
>> +
>> ?void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
>> ?{
>> - ? ?int i;
>> -
>> - ? ?movq_m2r(*vector128, mm1);
>> - ? ?for (i = 0; i < 8; i++) {
>> - ? ? ? ?movq_m2r(*(block), mm0);
>> - ? ? ? ?packsswb_m2r(*(block + 4), mm0);
>> - ? ? ? ?block += 8;
>> - ? ? ? ?paddb_r2r(mm1, mm0);
>> - ? ? ? ?movq_r2m(mm0, *pixels);
>> - ? ? ? ?pixels += line_size;
>> - ? ?}
>> + ? ?const DCTELEM *p = block;
>> + ? ?uint8_t *pix = pixels;
>> + ? ?x86_reg line_skip = line_size;
>> +
>> + ? ?__asm__ volatile (
>> + ? ? ? ? ? ?"movq (%3), %%mm0 ? ? ? ? ? ? ? ? ? \n\t"
>> + ? ? ? ? ? ?"lea (%2, %2, 2), %3 ? ? ? ? ? ? ? ?\n\t"
>> + ? ? ? ? ? ?put_signed_pixels_clamped_mmx_half
>> + ? ? ? ? ? ?"add $64, %1 ? ? ? ? ? ? ? ? ? ? ? ?\n\t"
>> + ? ? ? ? ? ?"lea (%0, %2, 4), %0 ? ? ? ? ? ? ? ?\n\t"
>> + ? ? ? ? ? ?put_signed_pixels_clamped_mmx_half
>> + ? ? ? ? ? ?::"r" (pix), "r" (p), "r"(line_skip), "r" (vector128)
>> + ? ? ? ? ? ?:"memory");
>
> you dont need the add $64, this can be done by changing the offsets
> also %0,%1 and %3 are marked as input, while you change them
>

Fixed

Do the x87 stack or the mm* registers need to be marked as clobbered?

Here is cycle estimation from Valgrind. I find it easier to read:
        Ir         Dr         Dw     I1mr    D1mr     D1mw  I2mr  D2mr    D2mw
function put_signed_pixels_clamped_mmx:
before:
16,088,784  6,033,294  2,681,464  122,988  34,505  259,448   991   292  27,624
  Cycle Estimation = Ir + 10*L1m + 100*L2m = 23,184,894
after:
12,736,954  6,033,294  2,681,464   64,595  33,784  258,686   617   294  27,622
  Cycle Estimation = Ir + 10*L1m + 100*L2m = 19,160,904

--Alex
-------------- next part --------------
A non-text attachment was scrubbed...
Name: dsputil_mmx-no-mmx.h.diff
Type: text/x-diff
Size: 2540 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090313/e78d813e/attachment.diff>



More information about the ffmpeg-devel mailing list