[Ffmpeg-devel] [PATCH] Snow mmx+sse2 asm optimizations

Robert Edele yartrebo
Tue Mar 7 23:12:38 CET 2006


On Mon, 2006-03-06 at 02:06 +0100, Michael Niedermayer wrote:
> Hi
> 
> On Sun, Mar 05, 2006 at 06:09:09PM -0500, Robert Edele wrote:
> [...]
> > With the help of ods15, we have done the following:
> >  - the asm code now resides entirely in dsputil_mmx.c.
> >  - snow_mmx_sse2.h is now gone
> >  - code previously in snow.c and all of snow_mmx_sse2.h is now in
> > dsputil_mmx.c, dsputil.c, and dsputil.h.
> >  - snow calls the asm via dsputil function pointers.
> > 
> > If you have any further issues with this code, please let me know.
> 
> it looks much better than before, but
> please move the stuff from dsputil_mmx.c to snowdsp_mmx.c
> this should be just a copy&paste + Makefile update
> 
Fixed as suggested.
> 
> [...]
> > -static void vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
> > +void ff_snow_vertical_compose97i(void *vb0, void *vb1, void *vb2, void *vb3, void *vb4, void *vb5, int width){
> > +    DWTELEM *b0 = vb0;
> > +    DWTELEM *b1 = vb1;
> > +    DWTELEM *b2 = vb2;
> > +    DWTELEM *b3 = vb3;
> > +    DWTELEM *b4 = vb4;
> > +    DWTELEM *b5 = vb5;
> 
> move DWTELEM to dsputil.h or anything else but please not that mess
> 
moved to snow.h
> 
> [...]
> > @@ -2545,6 +2620,41 @@
> >      }
> >  }
> >  
> > +void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> > +                              int src_x, int src_y, int src_stride, void * vsb, int add, uint8_t * dst8){
> > +    slice_buffer * sb = vsb;
> 
> uhm...
> 
> put
> typedef struct slice_buffer_s slice_buffer; 
> in dsputil.c or wherever its needed, and
> struct slice_buffer_s { ... }; in snow.c
> 
> 
slice_buffer struct definition put into snow.h.
> 
> 
> [...]
> > Index: i386/dsputil_mmx.c
> > ===================================================================
> > RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/dsputil_mmx.c,v
> > retrieving revision 1.111
> > diff -u -r1.111 dsputil_mmx.c
> > --- i386/dsputil_mmx.c	10 Feb 2006 06:55:25 -0000	1.111
> > +++ i386/dsputil_mmx.c	5 Mar 2006 17:31:12 -0000
> > @@ -2564,6 +2564,1518 @@
> >  }
> >  #endif
> >  
> > +/* snow wavelet */
> > +#define DWTELEM int
> > +#define W_AM 3
> > +#define W_AO 0
> > +#define W_AS 1
> > +
> > +#define W_BM 1
> > +#define W_BO 8
> > +#define W_BS 4
> > +
> > +#define W_CM 1
> > +#define W_CO 0
> > +#define W_CS 0
> > +
> > +#define W_DM 3
> > +#define W_DO 4
> > +#define W_DS 3
> > +
> > +#ifdef ARCH_X86_64
> > +#define PTR_SIZE "8"
> > +#else
> > +#define PTR_SIZE "4"
> > +#endif
> > +
> > +/** Used to minimize the amount of memory used in order to optimize cache performance. **/
> > +typedef struct {
> > +    DWTELEM * * line; ///< For use by idwt and predict_slices.
> > +    DWTELEM * * data_stack; ///< Used for internal purposes.
> > +    int data_stack_top;
> > +    int line_count;
> > +    int line_width;
> > +    int data_count;
> > +    DWTELEM * base_buffer; ///< Buffer that this structure is caching.
> > +} slice_buffer;
> 
> duplicating #defines and structs is not acceptable, these should be in a
> common header
> 
Offending lines removed, snow.h is now our common header.
> 
> > +
> > +#define snow_interleave_line_header(low,b,width)\
> > +    int i = (width) - 2;\
> > +    \
> > +    if ((width) & 1)\
> > +    {\
> > +        (b)[i+1] = (low)[(i+1)>>1];\
> > +        i--;\
> > +    }
> > +
> > +#define snow_interleave_line_footer(low,high,b)\
> > +    for (; i>=0; i-=2){\
> > +        (b)[i+1] = (high)[i>>1];\
> > +        (b)[i] = (low)[i>>1];\
> > +    }
> 
> these should be inline functions
> 
Fixed as suggested.
> 
> > +
> > +static void horizontal_compose97i_sse2(void *vb, int width){
> > +    DWTELEM *b = vb;
> > +    const int w2= (width+1)>>1;
> > +    // SSE2 code runs faster with pointers aligned on a 32-byte boundary.
> > +    DWTELEM temp_buf[width>>1];
> > +    DWTELEM * const temp = temp_buf + 4 - (((int)temp_buf & 0xF) / 4);
> 
> replace /4 by >>2 or make type unsigned divides by 4 and signed is slow
> 
Fixed using >>2.
> 
> [...]
> > +        for(; i<w_l; i++){
> > +            b[i] = b[i] - ((W_DM * (ref[i] + ref[i + 1]) + W_DO) >> W_DS);
> > +        }
> > +
> > +        if(width&1){
> > +            b[w_l] = b[w_l] - ((W_DM * 2 * ref[w_l] + W_DO) >> W_DS);
> > +        }
> [...]
> > +        for(; i<w_r; i++){
> > +            dst[i] = dst[i] - (b[i] + b[i + 1]);
> > +        }
> > +
> > +        if(!(width&1)){
> > +            dst[w_r] = dst[w_r] - (2 * b[w_r]);
> > +        }
> [...]
> > +        for(; i<w_l; i++){
> > +            b[i] = b[i] - (((-(ref[i] + ref[(i+1)])+W_BO) - 4*b[i])>>W_BS);
> > +        }
> > +
> > +        if(width&1){
> > +            b[w_l] = b[w_l] - (((-2 * ref[w_l] + W_BO) - 4 * b[w_l]) >> W_BS);
> > +        }
> ...
> 
> replace this with a function, see the lift() function in snow.c on how if its
> not obvious
> 
> same applies to the other such cases
> 
2 cases fixed. If there are any more that I missed, please inform me.
> 
> [...]
> > +static void vertical_compose97i_sse2(void *vb0, void *vb1, void *vb2, void *vb3, void *vb4, void *vb5, int width){
> > +    DWTELEM *b0 = vb0;
> > +    DWTELEM *b1 = vb1;
> > +    DWTELEM *b2 = vb2;
> > +    DWTELEM *b3 = vb3;
> > +    DWTELEM *b4 = vb4;
> > +    DWTELEM *b5 = vb5;
> > +    int i;
> > +    int end_w2 = width >> 4; /* Needed because GCC does something totally brain dead and mis-loads end_w into the asm code if I use end_w directly.*/
> > +
> > +    asm volatile (
> > +        "sal $4, %%"REG_d"                           \n\t"
> > +        "jmp 2f                                      \n\t"
> > +        "1:                                           \n\t"
> > +
> > +        "mov %5, %%"REG_a"                           \n\t"
> > +        "mov %3, %%"REG_b"                           \n\t"
> > +
> > +        "movdqa (%%"REG_b",%%"REG_d",4), %%xmm0      \n\t"
> > +        "movdqa 16(%%"REG_b",%%"REG_d",4), %%xmm2    \n\t"
> > +        "movdqa 32(%%"REG_b",%%"REG_d",4), %%xmm4    \n\t"
> > +        "movdqa 48(%%"REG_b",%%"REG_d",4), %%xmm6    \n\t"
> > +
> > +        "paddd (%%"REG_a",%%"REG_d",4), %%xmm0       \n\t"
> > +        "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2     \n\t"
> > +        "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4     \n\t"
> > +        "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6     \n\t"
> > +
> > +        "movdqa %%xmm0, %%xmm1                       \n\t"
> > +        "movdqa %%xmm2, %%xmm3                       \n\t"
> > +        "movdqa %%xmm4, %%xmm5                       \n\t"
> > +        "movdqa %%xmm6, %%xmm7                       \n\t"
> > +
> > +        "pslld $1, %%xmm0                            \n\t"
> > +        "pslld $1, %%xmm2                            \n\t"
> > +        "pslld $1, %%xmm4                            \n\t"
> > +        "pslld $1, %%xmm6                            \n\t"
> > +
> > +        "paddd %%xmm1, %%xmm0                        \n\t"
> > +        "paddd %%xmm3, %%xmm2                        \n\t"
> > +        "paddd %%xmm5, %%xmm4                        \n\t"
> > +        "paddd %%xmm7, %%xmm6                        \n\t"
> > +
> > +        "pcmpeqd %%xmm1, %%xmm1                      \n\t"
> > +        "pslld $31, %%xmm1                           \n\t"
> > +        "psrld $29, %%xmm1                           \n\t"
> > +        "mov %4, %%"REG_a"                           \n\t"
> > +
> > +        "paddd %%xmm1, %%xmm0                        \n\t"
> > +        "paddd %%xmm1, %%xmm2                        \n\t"
> > +        "paddd %%xmm1, %%xmm4                        \n\t"
> > +        "paddd %%xmm1, %%xmm6                        \n\t"
> > +
> > +        "psrad $3, %%xmm0                            \n\t"
> > +        "psrad $3, %%xmm2                            \n\t"
> > +        "psrad $3, %%xmm4                            \n\t"
> > +        "psrad $3, %%xmm6                            \n\t"
> > +
> > +        "movdqa (%%"REG_a",%%"REG_d",4), %%xmm1      \n\t"
> > +        "movdqa 16(%%"REG_a",%%"REG_d",4), %%xmm3    \n\t"
> > +        "movdqa 32(%%"REG_a",%%"REG_d",4), %%xmm5    \n\t"
> > +        "movdqa 48(%%"REG_a",%%"REG_d",4), %%xmm7    \n\t"
> > +
> > +        "psubd %%xmm0, %%xmm1                        \n\t"
> > +        "psubd %%xmm2, %%xmm3                        \n\t"
> > +        "psubd %%xmm4, %%xmm5                        \n\t"
> > +        "psubd %%xmm6, %%xmm7                        \n\t"
> > +
> > +        "movdqa %%xmm1, (%%"REG_a",%%"REG_d",4)      \n\t"
> > +        "movdqa %%xmm3, 16(%%"REG_a",%%"REG_d",4)    \n\t"
> > +        "movdqa %%xmm5, 32(%%"REG_a",%%"REG_d",4)    \n\t"
> > +        "movdqa %%xmm7, 48(%%"REG_a",%%"REG_d",4)    \n\t"
> > +
> > +        "mov %2, %%"REG_c"                           \n\t"
> > +
> > +        "paddd (%%"REG_c",%%"REG_d",4), %%xmm1       \n\t"
> > +        "paddd 16(%%"REG_c",%%"REG_d",4), %%xmm3     \n\t"
> > +        "paddd 32(%%"REG_c",%%"REG_d",4), %%xmm5     \n\t"
> > +        "paddd 48(%%"REG_c",%%"REG_d",4), %%xmm7     \n\t"
> > +
> > +        "movdqa (%%"REG_b",%%"REG_d",4), %%xmm0      \n\t"
> > +        "movdqa 16(%%"REG_b",%%"REG_d",4), %%xmm2    \n\t"
> > +        "movdqa 32(%%"REG_b",%%"REG_d",4), %%xmm4    \n\t"
> > +        "movdqa 48(%%"REG_b",%%"REG_d",4), %%xmm6    \n\t"
> > +
> > +        "psubd %%xmm1, %%xmm0                        \n\t"
> > +        "psubd %%xmm3, %%xmm2                        \n\t"
> > +        "psubd %%xmm5, %%xmm4                        \n\t"
> > +        "psubd %%xmm7, %%xmm6                        \n\t"
> > +
> > +        "movdqa %%xmm0, (%%"REG_b",%%"REG_d",4)      \n\t"
> > +        "movdqa %%xmm2, 16(%%"REG_b",%%"REG_d",4)    \n\t"
> > +        "movdqa %%xmm4, 32(%%"REG_b",%%"REG_d",4)    \n\t"
> > +        "movdqa %%xmm6, 48(%%"REG_b",%%"REG_d",4)    \n\t"
> > +
> > +        "mov %1, %%"REG_a"                           \n\t"
> > +
> > +        "paddd (%%"REG_a",%%"REG_d",4), %%xmm0       \n\t"
> > +        "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2     \n\t"
> > +        "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4     \n\t"
> > +        "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6     \n\t"
> > +
> > +        "movdqa (%%"REG_c",%%"REG_d",4), %%xmm1      \n\t"
> > +        "movdqa 16(%%"REG_c",%%"REG_d",4), %%xmm3    \n\t"
> > +        "movdqa 32(%%"REG_c",%%"REG_d",4), %%xmm5    \n\t"
> > +        "movdqa 48(%%"REG_c",%%"REG_d",4), %%xmm7    \n\t"
> > +
> > +        "pslld $2, %%xmm1                            \n\t"
> > +        "pslld $2, %%xmm3                            \n\t"
> > +        "pslld $2, %%xmm5                            \n\t"
> > +        "pslld $2, %%xmm7                            \n\t"
> > +
> > +        "paddd %%xmm1, %%xmm0                        \n\t"
> > +        "paddd %%xmm3, %%xmm2                        \n\t"
> > +        "paddd %%xmm5, %%xmm4                        \n\t"
> > +        "paddd %%xmm7, %%xmm6                        \n\t"
> > +
> > +        "pcmpeqd %%xmm1, %%xmm1                      \n\t"
> > +        "pslld $31, %%xmm1                           \n\t"
> > +        "psrld $28, %%xmm1                           \n\t"
> > +        "mov %0, %%"REG_b"                           \n\t"
> > +
> > +        "paddd %%xmm1, %%xmm0                        \n\t"
> > +        "paddd %%xmm1, %%xmm2                        \n\t"
> > +        "paddd %%xmm1, %%xmm4                        \n\t"
> > +        "paddd %%xmm1, %%xmm6                        \n\t"
> > +
> > +        "psrad $4, %%xmm0                            \n\t"
> > +        "psrad $4, %%xmm2                            \n\t"
> > +        "psrad $4, %%xmm4                            \n\t"
> > +        "psrad $4, %%xmm6                            \n\t"
> > +
> > +        "paddd (%%"REG_c",%%"REG_d",4), %%xmm0       \n\t"
> > +        "paddd 16(%%"REG_c",%%"REG_d",4), %%xmm2     \n\t"
> > +        "paddd 32(%%"REG_c",%%"REG_d",4), %%xmm4     \n\t"
> > +        "paddd 48(%%"REG_c",%%"REG_d",4), %%xmm6     \n\t"
> > +
> > +        "movdqa %%xmm0, (%%"REG_c",%%"REG_d",4)      \n\t"
> > +        "movdqa %%xmm2, 16(%%"REG_c",%%"REG_d",4)    \n\t"
> > +        "movdqa %%xmm4, 32(%%"REG_c",%%"REG_d",4)    \n\t"
> > +        "movdqa %%xmm6, 48(%%"REG_c",%%"REG_d",4)    \n\t"
> > +
> > +        "paddd (%%"REG_b",%%"REG_d",4), %%xmm0       \n\t"
> > +        "paddd 16(%%"REG_b",%%"REG_d",4), %%xmm2     \n\t"
> > +        "paddd 32(%%"REG_b",%%"REG_d",4), %%xmm4     \n\t"
> > +        "paddd 48(%%"REG_b",%%"REG_d",4), %%xmm6     \n\t"
> > +
> > +        "movdqa %%xmm0, %%xmm1                       \n\t"
> > +        "movdqa %%xmm2, %%xmm3                       \n\t"
> > +        "movdqa %%xmm4, %%xmm5                       \n\t"
> > +        "movdqa %%xmm6, %%xmm7                       \n\t"
> > +
> > +        "pslld $1, %%xmm0                            \n\t"
> > +        "pslld $1, %%xmm2                            \n\t"
> > +        "pslld $1, %%xmm4                            \n\t"
> > +        "pslld $1, %%xmm6                            \n\t"
> > +
> > +        "paddd %%xmm1, %%xmm0                        \n\t"
> > +        "paddd %%xmm3, %%xmm2                        \n\t"
> > +        "paddd %%xmm5, %%xmm4                        \n\t"
> > +        "paddd %%xmm7, %%xmm6                        \n\t"
> > +
> > +        "psrad $1, %%xmm0                            \n\t"
> > +        "psrad $1, %%xmm2                            \n\t"
> > +        "psrad $1, %%xmm4                            \n\t"
> > +        "psrad $1, %%xmm6                            \n\t"
> > +
> > +        "paddd (%%"REG_a",%%"REG_d",4), %%xmm0       \n\t"
> > +        "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2     \n\t"
> > +        "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4     \n\t"
> > +        "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6     \n\t"
> > +
> > +        "movdqa %%xmm0, (%%"REG_a",%%"REG_d",4)      \n\t"
> > +        "movdqa %%xmm2, 16(%%"REG_a",%%"REG_d",4)    \n\t"
> > +        "movdqa %%xmm4, 32(%%"REG_a",%%"REG_d",4)    \n\t"
> > +        "movdqa %%xmm6, 48(%%"REG_a",%%"REG_d",4)    \n\t"
> > +
> > +        "2:                                           \n\t"
> > +        "sub $16, %%"REG_d"                          \n\t"
> > +        "jge 1b                                      \n\t"
> > +        ::
> > +        "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5),"d"(end_w2):
> > +        "%"REG_a"","%"REG_b"","%"REG_c"");
> 
> this code is not valid, REG_d is changed but neither output nor on the clobber list
> 
  REG_d is on the input list, so GCC recognizes it as clobbered? GCC
also refuses to let me put REG_d into the clobber list. I believe the
code is good as is?
> [...]
> 
> > +static inline void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> > +                      int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
> > +    int y, x;
> > +    DWTELEM * dst;
> > +    DWTELEM * * dst_array = sb->line + src_y;
> > +
> > +    asm volatile(
> > +             "mov  %5, %%ebx                 \n\t"
> > +             "mov  %3, %%"REG_S"             \n\t"
> > +             "pcmpeqd %%xmm4, %%xmm4         \n\t"
> > +             "pslld $31, %%xmm4              \n\t"
> > +             "pxor %%xmm7, %%xmm7            \n\t" /* 0 */
> > +             "psrld $24, %%xmm4              \n\t" /* FRAC_BITS >> 1 */
> > +
> > +             "1:                              \n\t"
> > +             "movq (%%"REG_S"), %%xmm0       \n\t"
> > +             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> > +             "punpcklbw %%xmm7, %%xmm0       \n\t"
> > +             "movq 8(%%"REG_S"), %%xmm1      \n\t"
> > +             "punpcklbw %%xmm7, %%xmm1       \n\t"
> > +             "movq (%%"REG_d"), %%xmm5       \n\t"
> > +             "mov %1, %%"REG_D"              \n\t"
> > +             "punpcklbw %%xmm7, %%xmm5       \n\t"
> > +             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> > +             "movq (%%"REG_d"), %%xmm6       \n\t"
> > +             "pmullw %%xmm0, %%xmm5          \n\t"
> > +             "punpcklbw %%xmm7, %%xmm6       \n\t"
> > +             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> > +             "mov (%%"REG_D"), %%"REG_D"     \n\t"
> > +
> > +             "movq 128(%%"REG_S"), %%xmm0    \n\t"
> > +             "pmullw %%xmm1, %%xmm6          \n\t"
> > +             "punpcklbw %%xmm7, %%xmm0       \n\t"
> > +             "movq 136(%%"REG_S"), %%xmm1    \n\t"
> > +             "add %2, %%"REG_D"              \n\t"
> > +             "punpcklbw %%xmm7, %%xmm1       \n\t"
> > +             "movq (%%"REG_d"), %%xmm2       \n\t"
> > +             "punpcklbw %%xmm7, %%xmm2       \n\t"
> > +             "mov (%%"REG_a"), %%"REG_d"     \n\t"
> > +             "paddusw %%xmm5, %%xmm6         \n\t"
> > +             "pmullw %%xmm0, %%xmm2          \n\t"
> > +             "movq (%%"REG_d"), %%xmm3       \n\t"
> > +             "mov %0, %%"REG_d"              \n\t"
> > +             "punpcklbw %%xmm7, %%xmm3       \n\t"
> > +             "paddusw %%xmm2, %%xmm6         \n\t"
> > +             "pmullw %%xmm1, %%xmm3          \n\t"
> > +             "paddusw %%xmm3, %%xmm6         \n\t"
> > +
> > +             "movdqa (%%"REG_D"), %%xmm3     \n\t"
> > +             "movdqa %%xmm6, %%xmm0          \n\t"
> > +             "movdqa 16(%%"REG_D"), %%xmm5   \n\t"
> > +             "punpckhwd %%xmm7, %%xmm6       \n\t"
> > +             "movq 24(%%"REG_S"), %%xmm1     \n\t"
> > +             "punpcklwd %%xmm7, %%xmm0       \n\t"
> > +             "paddd %%xmm0, %%xmm3           \n\t"
> > +             "paddd %%xmm6, %%xmm5           \n\t"
> > +             "punpcklbw %%xmm7, %%xmm1       \n\t"
> > +             "paddd %%xmm4, %%xmm3           \n\t"
> > +             "paddd %%xmm4, %%xmm5           \n\t"
> > +             "movq 16(%%"REG_S"), %%xmm0     \n\t"
> > +             "psrad $8, %%xmm3               \n\t" /* FRAC_BITS. */
> > +             "psrad $8, %%xmm5               \n\t" /* FRAC_BITS. */
> > +
> > +             "packssdw %%xmm5, %%xmm3        \n\t"
> > +             "mov %1, %%"REG_D"              \n\t"
> > +             "packuswb %%xmm7, %%xmm3        \n\t"
> > +
> > +             "movq %%xmm3, (%%"REG_d")       \n\t"
> > +
> > +
> > +             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> > +             "punpcklbw %%xmm7, %%xmm0       \n\t"
> > +             "movq (%%"REG_d",%%"REG_c"), %%xmm5; \n\t"
> > +             "punpcklbw %%xmm7, %%xmm5       \n\t"
> > +             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> > +             "movq (%%"REG_d",%%"REG_c"), %%xmm6; \n\t"
> > +             "pmullw %%xmm0, %%xmm5          \n\t"
> > +             "punpcklbw %%xmm7, %%xmm6       \n\t"
> > +
> > +             "movq 144(%%"REG_S"), %%xmm0    \n\t"
> > +             "pmullw %%xmm1, %%xmm6          \n\t"
> > +             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> > +             "punpcklbw %%xmm7, %%xmm0       \n\t"
> > +             "movq 152(%%"REG_S"), %%xmm1    \n\t"
> > +             "punpcklbw %%xmm7, %%xmm1       \n\t"
> > +             "movq (%%"REG_d",%%"REG_c"), %%xmm2;\n\t"
> > +             "punpcklbw %%xmm7, %%xmm2       \n\t"
> > +             "mov (%%"REG_a"), %%"REG_d"     \n\t"
> > +             "paddusw %%xmm5, %%xmm6         \n\t"
> > +             "pmullw %%xmm0, %%xmm2          \n\t"
> > +             "movq (%%"REG_d",%%"REG_c"), %%xmm3;\n\t"
> > +             "punpcklbw %%xmm7, %%xmm3       \n\t"
> > +             "paddusw %%xmm2, %%xmm6         \n\t"
> > +             "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
> > +             "pmullw %%xmm1, %%xmm3          \n\t"
> > +             "sal $1, %%"REG_c"              \n\t"
> > +             "add %2, %%"REG_D"              \n\t"
> > +             "paddusw %%xmm3, %%xmm6         \n\t"
> > +             "mov %0, %%"REG_d"              \n\t"
> > +
> > +             "movdqa (%%"REG_D"), %%xmm3     \n\t"
> > +             "movdqa %%xmm6, %%xmm0          \n\t"
> > +             "movdqa 16(%%"REG_D"), %%xmm5   \n\t"
> > +             "punpckhwd %%xmm7, %%xmm6       \n\t"
> > +             "punpcklwd %%xmm7, %%xmm0       \n\t"
> > +             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
> > +             "paddd %%xmm0, %%xmm3           \n\t"
> > +             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
> > +             "paddd %%xmm6, %%xmm5           \n\t"
> > +             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
> > +             "paddd %%xmm4, %%xmm3           \n\t"
> > +             "add %%"REG_c", (%%"REG_a")     \n\t"
> > +             "paddd %%xmm4, %%xmm5           \n\t"
> > +             "psrad $8, %%xmm3               \n\t" /* FRAC_BITS. */
> > +             "add $"PTR_SIZE"*2, %1          \n\t"
> > +             "psrad $8, %%xmm5               \n\t" /* FRAC_BITS. */
> > +             "add $32, %%"REG_S"             \n\t"
> > +
> > +             "packssdw %%xmm5, %%xmm3        \n\t"
> > +             "add %%"REG_c", %0              \n\t"
> > +             "packuswb %%xmm7, %%xmm3        \n\t"
> > +
> > +             "sar $1, %%"REG_c"              \n\t"
> > +             "movq %%xmm3, (%%"REG_d",%%"REG_c");\n\t"
> > +
> > +             "sub $2, %%"REG_b"              \n\t"
> > +             "jnz 1b                         \n\t"
> > +             :
> > +             :
> > +             "m"(dst8),"m"(dst_array),"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"(b_h),"c"(src_stride):
> > +             "%"REG_b"","%"REG_S"","%"REG_D"","%"REG_d"");
> 
> a minor issue, dont use ebx please, it causes PIC fanboys to flame us
> and a major one REG_c is changed and not an output or clobber-listed
> 
ebx changed to REG_b. b_h (input to the function) has been changed from
type int to type long so that this fix will work.

REG_c is an input. GCC refuses to allow it on the clobber list.
> 
> [...]
> > +
> > +static inline void inner_add_yblock_bw_16_obmc_32_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> > +                      int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
> > +    int y, x;
> > +    DWTELEM * dst;
> > +    DWTELEM * * dst_array = sb->line + src_y;
> > +
> > +    asm volatile(
> > +             "mov  %5, %%ebx                \n\t"
> > +             "mov  %3, %%"REG_S"            \n\t"
> > +             "pcmpeqd %%mm4, %%mm4          \n\t"
> > +             "pslld $31, %%mm4              \n\t"
> > +             "pxor %%mm7, %%mm7             \n\t" /* 0 */
> > +             "psrld $24, %%mm4              \n\t" /* FRAC_BITS >> 1 */
> > +
> > +             "1:                              \n\t"
> > +             "movd (%%"REG_S"), %%mm0        \n\t"
> > +             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> > +             "punpcklbw %%mm7, %%mm0        \n\t"
> > +             "movd 16(%%"REG_S"), %%mm1     \n\t"
> > +             "punpcklbw %%mm7, %%mm1        \n\t"
> > +             "movd (%%"REG_d"), %%mm5       \n\t"
> > +             "mov %1, %%"REG_D"             \n\t"
> > +             "punpcklbw %%mm7, %%mm5        \n\t"
> > +             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> > +             "movd (%%"REG_d"), %%mm6       \n\t"
> > +             "pmullw %%mm0, %%mm5           \n\t"
> > +             "punpcklbw %%mm7, %%mm6        \n\t"
> > +             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> > +
> > +             "movd 512(%%"REG_S"), %%mm0    \n\t"
> > +             "pmullw %%mm1, %%mm6           \n\t"
> > +             "punpcklbw %%mm7, %%mm0        \n\t"
> > +             "movd 528(%%"REG_S"), %%mm1    \n\t"
> > +             "punpcklbw %%mm7, %%mm1        \n\t"
> > +             "movd (%%"REG_d"), %%mm2       \n\t"
> > +             "punpcklbw %%mm7, %%mm2        \n\t"
> > +             "mov (%%"REG_a"), %%"REG_d"    \n\t"
> > +             "paddusw %%mm5, %%mm6          \n\t"
> > +             "mov (%%"REG_D"), %%"REG_D"    \n\t"
> > +             "pmullw %%mm0, %%mm2           \n\t"
> > +             "movd (%%"REG_d"), %%mm3       \n\t"
> > +             "mov %0, %%"REG_d"             \n\t"
> > +             "punpcklbw %%mm7, %%mm3        \n\t"
> > +             "add %2, %%"REG_D"             \n\t"
> > +             "paddusw %%mm2, %%mm6          \n\t"
> > +             "pmullw %%mm1, %%mm3           \n\t"
> > +             "paddusw %%mm3, %%mm6          \n\t"
> > +
> > +             "movq (%%"REG_D"), %%mm3       \n\t"
> > +             "movq %%mm6, %%mm0             \n\t"
> > +             "movq 8(%%"REG_D"), %%mm5      \n\t"
> > +             "punpckhwd %%mm7, %%mm6        \n\t"
> > +             "movd 20(%%"REG_S"), %%mm1     \n\t"
> > +             "punpcklwd %%mm7, %%mm0        \n\t"
> > +             "paddd %%mm0, %%mm3            \n\t"
> > +             "paddd %%mm6, %%mm5            \n\t"
> > +             "punpcklbw %%mm7, %%mm1        \n\t"
> > +             "paddd %%mm4, %%mm3            \n\t"
> > +             "paddd %%mm4, %%mm5            \n\t"
> > +             "movd 4(%%"REG_S"), %%mm0      \n\t"
> > +             "psrad $8, %%mm3               \n\t" /* FRAC_BITS. */
> > +             "psrad $8, %%mm5               \n\t" /* FRAC_BITS. */
> > +
> > +             "packssdw %%mm5, %%mm3         \n\t"
> > +             "packuswb %%mm7, %%mm3         \n\t"
> > +
> > +             "movd %%mm3, (%%"REG_d")       \n\t"
> > +
> > +
> > +             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> > +             "punpcklbw %%mm7, %%mm0        \n\t"
> > +             "movd 4(%%"REG_d"), %%mm5      \n\t"
> > +             "punpcklbw %%mm7, %%mm5        \n\t"
> > +             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> > +             "movd 4(%%"REG_d"), %%mm6      \n\t"
> > +             "pmullw %%mm0, %%mm5           \n\t"
> > +             "punpcklbw %%mm7, %%mm6        \n\t"
> > +
> > +             "movd 516(%%"REG_S"), %%mm0    \n\t"
> > +             "pmullw %%mm1, %%mm6           \n\t"
> > +             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> > +             "punpcklbw %%mm7, %%mm0        \n\t"
> > +             "movd 532(%%"REG_S"), %%mm1    \n\t"
> > +             "punpcklbw %%mm7, %%mm1        \n\t"
> > +             "movd 4(%%"REG_d"), %%mm2      \n\t"
> > +             "punpcklbw %%mm7, %%mm2        \n\t"
> > +             "mov (%%"REG_a"), %%"REG_d"     \n\t"
> > +             "paddusw %%mm5, %%mm6          \n\t"
> > +             "pmullw %%mm0, %%mm2           \n\t"
> > +             "movd 4(%%"REG_d"), %%mm3      \n\t"
> > +             "punpcklbw %%mm7, %%mm3        \n\t"
> > +             "paddusw %%mm2, %%mm6          \n\t"
> > +             "pmullw %%mm1, %%mm3           \n\t"
> > +             "paddusw %%mm3, %%mm6          \n\t"
> > +             "mov %0, %%"REG_d"             \n\t"
> > +
> > +             "movq 16(%%"REG_D"), %%mm3     \n\t"
> > +             "movq %%mm6, %%mm0             \n\t"
> > +             "movq 24(%%"REG_D"), %%mm5     \n\t"
> > +             "punpckhwd %%mm7, %%mm6        \n\t"
> > +             "punpcklwd %%mm7, %%mm0        \n\t"
> > +             "paddd %%mm0, %%mm3            \n\t"
> > +             "paddd %%mm6, %%mm5            \n\t"
> > +             "paddd %%mm4, %%mm3            \n\t"
> > +             "paddd %%mm4, %%mm5            \n\t"
> > +             "psrad $8, %%mm3               \n\t" /* FRAC_BITS. */
> > +             "psrad $8, %%mm5               \n\t" /* FRAC_BITS. */
> > +
> > +             "packssdw %%mm5, %%mm3         \n\t"
> > +             "packuswb %%mm7, %%mm3         \n\t"
> > +
> > +             "movd %%mm3, 4(%%"REG_d")      \n\t"
> > +
> > +
> > +
> > +             "movd 8(%%"REG_S"), %%mm0      \n\t"
> > +             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> > +             "punpcklbw %%mm7, %%mm0        \n\t"
> > +             "movd 24(%%"REG_S"), %%mm1     \n\t"
> > +             "punpcklbw %%mm7, %%mm1        \n\t"
> > +             "movd 8(%%"REG_d"), %%mm5      \n\t"
> > +             "punpcklbw %%mm7, %%mm5        \n\t"
> > +             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> > +             "movd 8(%%"REG_d"), %%mm6      \n\t"
> > +             "pmullw %%mm0, %%mm5           \n\t"
> > +             "punpcklbw %%mm7, %%mm6        \n\t"
> > +             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> > +
> > +             "movd 520(%%"REG_S"), %%mm0    \n\t"
> > +             "pmullw %%mm1, %%mm6           \n\t"
> > +             "punpcklbw %%mm7, %%mm0        \n\t"
> > +             "movd 536(%%"REG_S"), %%mm1    \n\t"
> > +             "punpcklbw %%mm7, %%mm1        \n\t"
> > +             "movd 8(%%"REG_d"), %%mm2      \n\t"
> > +             "punpcklbw %%mm7, %%mm2        \n\t"
> > +             "mov (%%"REG_a"), %%"REG_d"    \n\t"
> > +             "paddusw %%mm5, %%mm6          \n\t"
> > +             "pmullw %%mm0, %%mm2           \n\t"
> > +             "movd 8(%%"REG_d"), %%mm3      \n\t"
> > +             "mov %0, %%"REG_d"             \n\t"
> > +             "punpcklbw %%mm7, %%mm3        \n\t"
> > +             "paddusw %%mm2, %%mm6          \n\t"
> > +             "pmullw %%mm1, %%mm3           \n\t"
> > +             "paddusw %%mm3, %%mm6          \n\t"
> > +
> > +             "movq 32(%%"REG_D"), %%mm3     \n\t"
> > +             "movq %%mm6, %%mm0             \n\t"
> > +             "movq 40(%%"REG_D"), %%mm5     \n\t"
> > +             "punpckhwd %%mm7, %%mm6        \n\t"
> > +             "movd 28(%%"REG_S"), %%mm1     \n\t"
> > +             "punpcklwd %%mm7, %%mm0        \n\t"
> > +             "paddd %%mm0, %%mm3            \n\t"
> > +             "paddd %%mm6, %%mm5            \n\t"
> > +             "punpcklbw %%mm7, %%mm1        \n\t"
> > +             "paddd %%mm4, %%mm3            \n\t"
> > +             "paddd %%mm4, %%mm5            \n\t"
> > +             "movd 12(%%"REG_S"), %%mm0     \n\t"
> > +             "psrad $8, %%mm3               \n\t" /* FRAC_BITS. */
> > +             "psrad $8, %%mm5               \n\t" /* FRAC_BITS. */
> > +
> > +             "packssdw %%mm5, %%mm3         \n\t"
> > +             "packuswb %%mm7, %%mm3         \n\t"
> > +
> > +             "movd %%mm3, 8(%%"REG_d")      \n\t"
> > +
> > +
> > +             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> > +             "punpcklbw %%mm7, %%mm0        \n\t"
> > +             "movd 12(%%"REG_d"), %%mm5     \n\t"
> > +             "punpcklbw %%mm7, %%mm5        \n\t"
> > +             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> > +             "movd 12(%%"REG_d"), %%mm6     \n\t"
> > +             "pmullw %%mm0, %%mm5           \n\t"
> > +             "punpcklbw %%mm7, %%mm6        \n\t"
> > +
> > +             "movd 524(%%"REG_S"), %%mm0    \n\t"
> > +             "pmullw %%mm1, %%mm6           \n\t"
> > +             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> > +             "punpcklbw %%mm7, %%mm0        \n\t"
> > +             "movd 540(%%"REG_S"), %%mm1    \n\t"
> > +             "punpcklbw %%mm7, %%mm1        \n\t"
> > +             "movd 12(%%"REG_d"), %%mm2     \n\t"
> > +             "punpcklbw %%mm7, %%mm2        \n\t"
> > +             "mov (%%"REG_a"), %%"REG_d"    \n\t"
> > +             "paddusw %%mm5, %%mm6          \n\t"
> > +             "pmullw %%mm0, %%mm2           \n\t"
> > +             "movd 12(%%"REG_d"), %%mm3     \n\t"
> > +             "punpcklbw %%mm7, %%mm3        \n\t"
> > +             "paddusw %%mm2, %%mm6          \n\t"
> > +             "pmullw %%mm1, %%mm3           \n\t"
> > +             "paddusw %%mm3, %%mm6          \n\t"
> > +             "mov %0, %%"REG_d"             \n\t"
> > +
> > +             "movq 48(%%"REG_D"), %%mm3     \n\t"
> > +             "movq %%mm6, %%mm0             \n\t"
> > +             "movq 56(%%"REG_D"), %%mm5     \n\t"
> > +             "punpckhwd %%mm7, %%mm6        \n\t"
> > +             "punpcklwd %%mm7, %%mm0        \n\t"
> > +             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
> > +             "paddd %%mm0, %%mm3            \n\t"
> > +             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
> > +             "paddd %%mm6, %%mm5            \n\t"
> > +             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
> > +             "paddd %%mm4, %%mm3            \n\t"
> > +             "add %%"REG_c", (%%"REG_a")    \n\t"
> > +             "paddd %%mm4, %%mm5            \n\t"
> > +             "psrad $8, %%mm3               \n\t" /* FRAC_BITS. */
> > +             "add $"PTR_SIZE"*1, %1         \n\t"
> > +             "psrad $8, %%mm5               \n\t" /* FRAC_BITS. */
> > +             "add $32, %%"REG_S"            \n\t"
> > +
> > +             "packssdw %%mm5, %%mm3         \n\t"
> > +             "add %%"REG_c", %0             \n\t"
> > +             "packuswb %%mm7, %%mm3         \n\t"
> > +
> > +             "movd %%mm3, 12(%%"REG_d")     \n\t"
> > +
> > +             "dec %%"REG_b"                 \n\t"
> > +             "jnz 1b                        \n\t"
> > +             "emms                          \t\t"
> 
> is the emms here really needed?
> 
Fixed. Experimentally, they are not needed, and have been removed.
> [...]
> 
Thanks for your many useful comments Michael. Would you mind having a
look at the updated patch?

Sincerely,
Robert Edele
-------------- next part --------------
A non-text attachment was scrubbed...
Name: snow_mmx.patch
Type: text/x-patch
Size: 93269 bytes
Desc: 
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20060307/7ce5a015/attachment.bin>



More information about the ffmpeg-devel mailing list