[Ffmpeg-devel] [PATCH] Snow mmx+sse2 asm optimizations

Michael Niedermayer michaelni
Mon Mar 6 02:06:01 CET 2006


Hi

On Sun, Mar 05, 2006 at 06:09:09PM -0500, Robert Edele wrote:
[...]
> With the help of ods15, we have done the following:
>  - the asm code now resides entirely in dsputil_mmx.c.
>  - snow_mmx_sse2.h is now gone
>  - code previously in snow.c and all of snow_mmx_sse2.h is now in
> dsputil_mxx.c, dsputil.c, and dsputil.h.
>  - snow calls the asm via dsputil function pointers.
> 
> If you have any further issues with this code, please let me know.

it looks much better than before, but
please move the stuff from dsputil_mmx.c to snowdsp_mmx.c
this should be just a copy&paste + Makefile update


[...]
> -static void vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
> +void ff_snow_vertical_compose97i(void *vb0, void *vb1, void *vb2, void *vb3, void *vb4, void *vb5, int width){
> +    DWTELEM *b0 = vb0;
> +    DWTELEM *b1 = vb1;
> +    DWTELEM *b2 = vb2;
> +    DWTELEM *b3 = vb3;
> +    DWTELEM *b4 = vb4;
> +    DWTELEM *b5 = vb5;

move DWTELEM to dsputil.h or some other shared header, but please not that void * mess
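
something like this maybe (a rough sketch, assuming the prototypes end up in
dsputil.h):

typedef int DWTELEM;

void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
                                 DWTELEM *b3, DWTELEM *b4, DWTELEM *b5,
                                 int width);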


[...]
> @@ -2545,6 +2620,41 @@
>      }
>  }
>  
> +void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> +                              int src_x, int src_y, int src_stride, void * vsb, int add, uint8_t * dst8){
> +    slice_buffer * sb = vsb;

uhm...

put
typedef struct slice_buffer_s slice_buffer;
in dsputil.c or wherever it's needed, and
struct slice_buffer_s { ... }; in snow.c
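
roughly (fields as in the slice_buffer struct from the patch):

/* shared header: only the opaque name is visible here */
typedef struct slice_buffer_s slice_buffer;

/* snow.c: the full definition */
struct slice_buffer_s {
    DWTELEM **line;        ///< For use by idwt and predict_slices.
    DWTELEM **data_stack;  ///< Used for internal purposes.
    int data_stack_top;
    int line_count;
    int line_width;
    int data_count;
    DWTELEM *base_buffer;  ///< Buffer that this structure is caching.
};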




[...]
> Index: i386/dsputil_mmx.c
> ===================================================================
> RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/dsputil_mmx.c,v
> retrieving revision 1.111
> diff -u -r1.111 dsputil_mmx.c
> --- i386/dsputil_mmx.c	10 Feb 2006 06:55:25 -0000	1.111
> +++ i386/dsputil_mmx.c	5 Mar 2006 17:31:12 -0000
> @@ -2564,6 +2564,1518 @@
>  }
>  #endif
>  
> +/* snow wavelet */
> +#define DWTELEM int
> +#define W_AM 3
> +#define W_AO 0
> +#define W_AS 1
> +
> +#define W_BM 1
> +#define W_BO 8
> +#define W_BS 4
> +
> +#define W_CM 1
> +#define W_CO 0
> +#define W_CS 0
> +
> +#define W_DM 3
> +#define W_DO 4
> +#define W_DS 3
> +
> +#ifdef ARCH_X86_64
> +#define PTR_SIZE "8"
> +#else
> +#define PTR_SIZE "4"
> +#endif
> +
> +/** Used to minimize the amount of memory used in order to optimize cache performance. **/
> +typedef struct {
> +    DWTELEM * * line; ///< For use by idwt and predict_slices.
> +    DWTELEM * * data_stack; ///< Used for internal purposes.
> +    int data_stack_top;
> +    int line_count;
> +    int line_width;
> +    int data_count;
> +    DWTELEM * base_buffer; ///< Buffer that this structure is caching.
> +} slice_buffer;

duplicating #defines and structs is not acceptable; these should be in a
common header
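
i.e. one shared header along the lines of (a sketch, values as in snow.c;
whether it ends up in dsputil.h or a new snow header does not matter much):

#define W_AM 3
#define W_AO 0
#define W_AS 1

#define W_BM 1
#define W_BO 8
#define W_BS 4

#define W_CM 1
#define W_CO 0
#define W_CS 0

#define W_DM 3
#define W_DO 4
#define W_DS 3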


> +
> +#define snow_interleave_line_header(low,b,width)\
> +    int i = (width) - 2;\
> +    \
> +    if ((width) & 1)\
> +    {\
> +        (b)[i+1] = (low)[(i+1)>>1];\
> +        i--;\
> +    }
> +
> +#define snow_interleave_line_footer(low,high,b)\
> +    for (; i>=0; i-=2){\
> +        (b)[i+1] = (high)[i>>1];\
> +        (b)[i] = (low)[i>>1];\
> +    }

these should be inline functions
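
e.g. something along these lines (a sketch; the loop index is then passed
explicitly instead of being declared by the macro):

static inline void snow_interleave_line_header(int *i, DWTELEM *low, DWTELEM *b, int width)
{
    *i = width - 2;
    if (width & 1) {
        b[*i + 1] = low[(*i + 1) >> 1];
        (*i)--;
    }
}

static inline void snow_interleave_line_footer(int i, DWTELEM *low, DWTELEM *high, DWTELEM *b)
{
    for (; i >= 0; i -= 2) {
        b[i + 1] = high[i >> 1];
        b[i]     = low[i >> 1];
    }
}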


> +
> +static void horizontal_compose97i_sse2(void *vb, int width){
> +    DWTELEM *b = vb;
> +    const int w2= (width+1)>>1;
> +    // SSE2 code runs faster with pointers aligned on a 32-byte boundary.
> +    DWTELEM temp_buf[width>>1];
> +    DWTELEM * const temp = temp_buf + 4 - (((int)temp_buf & 0xF) / 4);

replace the /4 with >>2, or make the type unsigned; a signed divide by 4 is slow
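
i.e. (the & 0xF already makes the value non negative, so the shift is
equivalent):

DWTELEM * const temp = temp_buf + 4 - (((int)temp_buf & 0xF) >> 2);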


[...]
> +        for(; i<w_l; i++){
> +            b[i] = b[i] - ((W_DM * (ref[i] + ref[i + 1]) + W_DO) >> W_DS);
> +        }
> +
> +        if(width&1){
> +            b[w_l] = b[w_l] - ((W_DM * 2 * ref[w_l] + W_DO) >> W_DS);
> +        }
[...]
> +        for(; i<w_r; i++){
> +            dst[i] = dst[i] - (b[i] + b[i + 1]);
> +        }
> +
> +        if(!(width&1)){
> +            dst[w_r] = dst[w_r] - (2 * b[w_r]);
> +        }
[...]
> +        for(; i<w_l; i++){
> +            b[i] = b[i] - (((-(ref[i] + ref[(i+1)])+W_BO) - 4*b[i])>>W_BS);
> +        }
> +
> +        if(width&1){
> +            b[w_l] = b[w_l] - (((-2 * ref[w_l] + W_BO) - 4 * b[w_l]) >> W_BS);
> +        }
...

replace this with a function; see the lift() function in snow.c for how, if it's
not obvious

the same applies to the other such cases
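
roughly like this (just a sketch of the idea, not lift()'s exact parameter
list; the W_B case with its extra -4*b[i] term and the odd width last sample
need matching variants):

static inline void snow_horizontal_lift_tail(DWTELEM *dst, const DWTELEM *ref,
                                             int i, int w, int mul, int add, int shift)
{
    for (; i < w; i++)
        dst[i] -= (mul * (ref[i] + ref[i + 1]) + add) >> shift;
}

so the first loop above becomes
snow_horizontal_lift_tail(b, ref, i, w_l, W_DM, W_DO, W_DS);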


[...]
> +static void vertical_compose97i_sse2(void *vb0, void *vb1, void *vb2, void *vb3, void *vb4, void *vb5, int width){
> +    DWTELEM *b0 = vb0;
> +    DWTELEM *b1 = vb1;
> +    DWTELEM *b2 = vb2;
> +    DWTELEM *b3 = vb3;
> +    DWTELEM *b4 = vb4;
> +    DWTELEM *b5 = vb5;
> +    int i;
> +    int end_w2 = width >> 4; /* Needed because GCC does something totally brain dead and mis-loads end_w into the asm code if I use end_w directly.*/
> +
> +    asm volatile (
> +        "sal $4, %%"REG_d"                           \n\t"
> +        "jmp 2f                                      \n\t"
> +        "1:                                           \n\t"
> +
> +        "mov %5, %%"REG_a"                           \n\t"
> +        "mov %3, %%"REG_b"                           \n\t"
> +
> +        "movdqa (%%"REG_b",%%"REG_d",4), %%xmm0      \n\t"
> +        "movdqa 16(%%"REG_b",%%"REG_d",4), %%xmm2    \n\t"
> +        "movdqa 32(%%"REG_b",%%"REG_d",4), %%xmm4    \n\t"
> +        "movdqa 48(%%"REG_b",%%"REG_d",4), %%xmm6    \n\t"
> +
> +        "paddd (%%"REG_a",%%"REG_d",4), %%xmm0       \n\t"
> +        "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2     \n\t"
> +        "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4     \n\t"
> +        "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6     \n\t"
> +
> +        "movdqa %%xmm0, %%xmm1                       \n\t"
> +        "movdqa %%xmm2, %%xmm3                       \n\t"
> +        "movdqa %%xmm4, %%xmm5                       \n\t"
> +        "movdqa %%xmm6, %%xmm7                       \n\t"
> +
> +        "pslld $1, %%xmm0                            \n\t"
> +        "pslld $1, %%xmm2                            \n\t"
> +        "pslld $1, %%xmm4                            \n\t"
> +        "pslld $1, %%xmm6                            \n\t"
> +
> +        "paddd %%xmm1, %%xmm0                        \n\t"
> +        "paddd %%xmm3, %%xmm2                        \n\t"
> +        "paddd %%xmm5, %%xmm4                        \n\t"
> +        "paddd %%xmm7, %%xmm6                        \n\t"
> +
> +        "pcmpeqd %%xmm1, %%xmm1                      \n\t"
> +        "pslld $31, %%xmm1                           \n\t"
> +        "psrld $29, %%xmm1                           \n\t"
> +        "mov %4, %%"REG_a"                           \n\t"
> +
> +        "paddd %%xmm1, %%xmm0                        \n\t"
> +        "paddd %%xmm1, %%xmm2                        \n\t"
> +        "paddd %%xmm1, %%xmm4                        \n\t"
> +        "paddd %%xmm1, %%xmm6                        \n\t"
> +
> +        "psrad $3, %%xmm0                            \n\t"
> +        "psrad $3, %%xmm2                            \n\t"
> +        "psrad $3, %%xmm4                            \n\t"
> +        "psrad $3, %%xmm6                            \n\t"
> +
> +        "movdqa (%%"REG_a",%%"REG_d",4), %%xmm1      \n\t"
> +        "movdqa 16(%%"REG_a",%%"REG_d",4), %%xmm3    \n\t"
> +        "movdqa 32(%%"REG_a",%%"REG_d",4), %%xmm5    \n\t"
> +        "movdqa 48(%%"REG_a",%%"REG_d",4), %%xmm7    \n\t"
> +
> +        "psubd %%xmm0, %%xmm1                        \n\t"
> +        "psubd %%xmm2, %%xmm3                        \n\t"
> +        "psubd %%xmm4, %%xmm5                        \n\t"
> +        "psubd %%xmm6, %%xmm7                        \n\t"
> +
> +        "movdqa %%xmm1, (%%"REG_a",%%"REG_d",4)      \n\t"
> +        "movdqa %%xmm3, 16(%%"REG_a",%%"REG_d",4)    \n\t"
> +        "movdqa %%xmm5, 32(%%"REG_a",%%"REG_d",4)    \n\t"
> +        "movdqa %%xmm7, 48(%%"REG_a",%%"REG_d",4)    \n\t"
> +
> +        "mov %2, %%"REG_c"                           \n\t"
> +
> +        "paddd (%%"REG_c",%%"REG_d",4), %%xmm1       \n\t"
> +        "paddd 16(%%"REG_c",%%"REG_d",4), %%xmm3     \n\t"
> +        "paddd 32(%%"REG_c",%%"REG_d",4), %%xmm5     \n\t"
> +        "paddd 48(%%"REG_c",%%"REG_d",4), %%xmm7     \n\t"
> +
> +        "movdqa (%%"REG_b",%%"REG_d",4), %%xmm0      \n\t"
> +        "movdqa 16(%%"REG_b",%%"REG_d",4), %%xmm2    \n\t"
> +        "movdqa 32(%%"REG_b",%%"REG_d",4), %%xmm4    \n\t"
> +        "movdqa 48(%%"REG_b",%%"REG_d",4), %%xmm6    \n\t"
> +
> +        "psubd %%xmm1, %%xmm0                        \n\t"
> +        "psubd %%xmm3, %%xmm2                        \n\t"
> +        "psubd %%xmm5, %%xmm4                        \n\t"
> +        "psubd %%xmm7, %%xmm6                        \n\t"
> +
> +        "movdqa %%xmm0, (%%"REG_b",%%"REG_d",4)      \n\t"
> +        "movdqa %%xmm2, 16(%%"REG_b",%%"REG_d",4)    \n\t"
> +        "movdqa %%xmm4, 32(%%"REG_b",%%"REG_d",4)    \n\t"
> +        "movdqa %%xmm6, 48(%%"REG_b",%%"REG_d",4)    \n\t"
> +
> +        "mov %1, %%"REG_a"                           \n\t"
> +
> +        "paddd (%%"REG_a",%%"REG_d",4), %%xmm0       \n\t"
> +        "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2     \n\t"
> +        "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4     \n\t"
> +        "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6     \n\t"
> +
> +        "movdqa (%%"REG_c",%%"REG_d",4), %%xmm1      \n\t"
> +        "movdqa 16(%%"REG_c",%%"REG_d",4), %%xmm3    \n\t"
> +        "movdqa 32(%%"REG_c",%%"REG_d",4), %%xmm5    \n\t"
> +        "movdqa 48(%%"REG_c",%%"REG_d",4), %%xmm7    \n\t"
> +
> +        "pslld $2, %%xmm1                            \n\t"
> +        "pslld $2, %%xmm3                            \n\t"
> +        "pslld $2, %%xmm5                            \n\t"
> +        "pslld $2, %%xmm7                            \n\t"
> +
> +        "paddd %%xmm1, %%xmm0                        \n\t"
> +        "paddd %%xmm3, %%xmm2                        \n\t"
> +        "paddd %%xmm5, %%xmm4                        \n\t"
> +        "paddd %%xmm7, %%xmm6                        \n\t"
> +
> +        "pcmpeqd %%xmm1, %%xmm1                      \n\t"
> +        "pslld $31, %%xmm1                           \n\t"
> +        "psrld $28, %%xmm1                           \n\t"
> +        "mov %0, %%"REG_b"                           \n\t"
> +
> +        "paddd %%xmm1, %%xmm0                        \n\t"
> +        "paddd %%xmm1, %%xmm2                        \n\t"
> +        "paddd %%xmm1, %%xmm4                        \n\t"
> +        "paddd %%xmm1, %%xmm6                        \n\t"
> +
> +        "psrad $4, %%xmm0                            \n\t"
> +        "psrad $4, %%xmm2                            \n\t"
> +        "psrad $4, %%xmm4                            \n\t"
> +        "psrad $4, %%xmm6                            \n\t"
> +
> +        "paddd (%%"REG_c",%%"REG_d",4), %%xmm0       \n\t"
> +        "paddd 16(%%"REG_c",%%"REG_d",4), %%xmm2     \n\t"
> +        "paddd 32(%%"REG_c",%%"REG_d",4), %%xmm4     \n\t"
> +        "paddd 48(%%"REG_c",%%"REG_d",4), %%xmm6     \n\t"
> +
> +        "movdqa %%xmm0, (%%"REG_c",%%"REG_d",4)      \n\t"
> +        "movdqa %%xmm2, 16(%%"REG_c",%%"REG_d",4)    \n\t"
> +        "movdqa %%xmm4, 32(%%"REG_c",%%"REG_d",4)    \n\t"
> +        "movdqa %%xmm6, 48(%%"REG_c",%%"REG_d",4)    \n\t"
> +
> +        "paddd (%%"REG_b",%%"REG_d",4), %%xmm0       \n\t"
> +        "paddd 16(%%"REG_b",%%"REG_d",4), %%xmm2     \n\t"
> +        "paddd 32(%%"REG_b",%%"REG_d",4), %%xmm4     \n\t"
> +        "paddd 48(%%"REG_b",%%"REG_d",4), %%xmm6     \n\t"
> +
> +        "movdqa %%xmm0, %%xmm1                       \n\t"
> +        "movdqa %%xmm2, %%xmm3                       \n\t"
> +        "movdqa %%xmm4, %%xmm5                       \n\t"
> +        "movdqa %%xmm6, %%xmm7                       \n\t"
> +
> +        "pslld $1, %%xmm0                            \n\t"
> +        "pslld $1, %%xmm2                            \n\t"
> +        "pslld $1, %%xmm4                            \n\t"
> +        "pslld $1, %%xmm6                            \n\t"
> +
> +        "paddd %%xmm1, %%xmm0                        \n\t"
> +        "paddd %%xmm3, %%xmm2                        \n\t"
> +        "paddd %%xmm5, %%xmm4                        \n\t"
> +        "paddd %%xmm7, %%xmm6                        \n\t"
> +
> +        "psrad $1, %%xmm0                            \n\t"
> +        "psrad $1, %%xmm2                            \n\t"
> +        "psrad $1, %%xmm4                            \n\t"
> +        "psrad $1, %%xmm6                            \n\t"
> +
> +        "paddd (%%"REG_a",%%"REG_d",4), %%xmm0       \n\t"
> +        "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2     \n\t"
> +        "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4     \n\t"
> +        "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6     \n\t"
> +
> +        "movdqa %%xmm0, (%%"REG_a",%%"REG_d",4)      \n\t"
> +        "movdqa %%xmm2, 16(%%"REG_a",%%"REG_d",4)    \n\t"
> +        "movdqa %%xmm4, 32(%%"REG_a",%%"REG_d",4)    \n\t"
> +        "movdqa %%xmm6, 48(%%"REG_a",%%"REG_d",4)    \n\t"
> +
> +        "2:                                           \n\t"
> +        "sub $16, %%"REG_d"                          \n\t"
> +        "jge 1b                                      \n\t"
> +        ::
> +        "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5),"d"(end_w2):
> +        "%"REG_a"","%"REG_b"","%"REG_c"");

this code is not valid: REG_d is changed but is neither an output operand nor on the clobber list
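
i.e. end_w2 has to become a read-write operand; a dummy example only to show
the constraint, not the real loop:

static inline void clobber_example(int end_w2)
{
    asm volatile(
        "sal $4, %0     \n\t"
        : "+d"(end_w2)  /* read-write, so gcc knows REG_d gets modified */
        :
        : "cc");
}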

[...]

> +static inline void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> +                      int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
> +    int y, x;
> +    DWTELEM * dst;
> +    DWTELEM * * dst_array = sb->line + src_y;
> +
> +    asm volatile(
> +             "mov  %5, %%ebx                 \n\t"
> +             "mov  %3, %%"REG_S"             \n\t"
> +             "pcmpeqd %%xmm4, %%xmm4         \n\t"
> +             "pslld $31, %%xmm4              \n\t"
> +             "pxor %%xmm7, %%xmm7            \n\t" /* 0 */
> +             "psrld $24, %%xmm4              \n\t" /* FRAC_BITS >> 1 */
> +
> +             "1:                              \n\t"
> +             "movq (%%"REG_S"), %%xmm0       \n\t"
> +             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> +             "punpcklbw %%xmm7, %%xmm0       \n\t"
> +             "movq 8(%%"REG_S"), %%xmm1      \n\t"
> +             "punpcklbw %%xmm7, %%xmm1       \n\t"
> +             "movq (%%"REG_d"), %%xmm5       \n\t"
> +             "mov %1, %%"REG_D"              \n\t"
> +             "punpcklbw %%xmm7, %%xmm5       \n\t"
> +             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> +             "movq (%%"REG_d"), %%xmm6       \n\t"
> +             "pmullw %%xmm0, %%xmm5          \n\t"
> +             "punpcklbw %%xmm7, %%xmm6       \n\t"
> +             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> +             "mov (%%"REG_D"), %%"REG_D"     \n\t"
> +
> +             "movq 128(%%"REG_S"), %%xmm0    \n\t"
> +             "pmullw %%xmm1, %%xmm6          \n\t"
> +             "punpcklbw %%xmm7, %%xmm0       \n\t"
> +             "movq 136(%%"REG_S"), %%xmm1    \n\t"
> +             "add %2, %%"REG_D"              \n\t"
> +             "punpcklbw %%xmm7, %%xmm1       \n\t"
> +             "movq (%%"REG_d"), %%xmm2       \n\t"
> +             "punpcklbw %%xmm7, %%xmm2       \n\t"
> +             "mov (%%"REG_a"), %%"REG_d"     \n\t"
> +             "paddusw %%xmm5, %%xmm6         \n\t"
> +             "pmullw %%xmm0, %%xmm2          \n\t"
> +             "movq (%%"REG_d"), %%xmm3       \n\t"
> +             "mov %0, %%"REG_d"              \n\t"
> +             "punpcklbw %%xmm7, %%xmm3       \n\t"
> +             "paddusw %%xmm2, %%xmm6         \n\t"
> +             "pmullw %%xmm1, %%xmm3          \n\t"
> +             "paddusw %%xmm3, %%xmm6         \n\t"
> +
> +             "movdqa (%%"REG_D"), %%xmm3     \n\t"
> +             "movdqa %%xmm6, %%xmm0          \n\t"
> +             "movdqa 16(%%"REG_D"), %%xmm5   \n\t"
> +             "punpckhwd %%xmm7, %%xmm6       \n\t"
> +             "movq 24(%%"REG_S"), %%xmm1     \n\t"
> +             "punpcklwd %%xmm7, %%xmm0       \n\t"
> +             "paddd %%xmm0, %%xmm3           \n\t"
> +             "paddd %%xmm6, %%xmm5           \n\t"
> +             "punpcklbw %%xmm7, %%xmm1       \n\t"
> +             "paddd %%xmm4, %%xmm3           \n\t"
> +             "paddd %%xmm4, %%xmm5           \n\t"
> +             "movq 16(%%"REG_S"), %%xmm0     \n\t"
> +             "psrad $8, %%xmm3               \n\t" /* FRAC_BITS. */
> +             "psrad $8, %%xmm5               \n\t" /* FRAC_BITS. */
> +
> +             "packssdw %%xmm5, %%xmm3        \n\t"
> +             "mov %1, %%"REG_D"              \n\t"
> +             "packuswb %%xmm7, %%xmm3        \n\t"
> +
> +             "movq %%xmm3, (%%"REG_d")       \n\t"
> +
> +
> +             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> +             "punpcklbw %%xmm7, %%xmm0       \n\t"
> +             "movq (%%"REG_d",%%"REG_c"), %%xmm5; \n\t"
> +             "punpcklbw %%xmm7, %%xmm5       \n\t"
> +             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> +             "movq (%%"REG_d",%%"REG_c"), %%xmm6; \n\t"
> +             "pmullw %%xmm0, %%xmm5          \n\t"
> +             "punpcklbw %%xmm7, %%xmm6       \n\t"
> +
> +             "movq 144(%%"REG_S"), %%xmm0    \n\t"
> +             "pmullw %%xmm1, %%xmm6          \n\t"
> +             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> +             "punpcklbw %%xmm7, %%xmm0       \n\t"
> +             "movq 152(%%"REG_S"), %%xmm1    \n\t"
> +             "punpcklbw %%xmm7, %%xmm1       \n\t"
> +             "movq (%%"REG_d",%%"REG_c"), %%xmm2;\n\t"
> +             "punpcklbw %%xmm7, %%xmm2       \n\t"
> +             "mov (%%"REG_a"), %%"REG_d"     \n\t"
> +             "paddusw %%xmm5, %%xmm6         \n\t"
> +             "pmullw %%xmm0, %%xmm2          \n\t"
> +             "movq (%%"REG_d",%%"REG_c"), %%xmm3;\n\t"
> +             "punpcklbw %%xmm7, %%xmm3       \n\t"
> +             "paddusw %%xmm2, %%xmm6         \n\t"
> +             "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
> +             "pmullw %%xmm1, %%xmm3          \n\t"
> +             "sal $1, %%"REG_c"              \n\t"
> +             "add %2, %%"REG_D"              \n\t"
> +             "paddusw %%xmm3, %%xmm6         \n\t"
> +             "mov %0, %%"REG_d"              \n\t"
> +
> +             "movdqa (%%"REG_D"), %%xmm3     \n\t"
> +             "movdqa %%xmm6, %%xmm0          \n\t"
> +             "movdqa 16(%%"REG_D"), %%xmm5   \n\t"
> +             "punpckhwd %%xmm7, %%xmm6       \n\t"
> +             "punpcklwd %%xmm7, %%xmm0       \n\t"
> +             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
> +             "paddd %%xmm0, %%xmm3           \n\t"
> +             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
> +             "paddd %%xmm6, %%xmm5           \n\t"
> +             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
> +             "paddd %%xmm4, %%xmm3           \n\t"
> +             "add %%"REG_c", (%%"REG_a")     \n\t"
> +             "paddd %%xmm4, %%xmm5           \n\t"
> +             "psrad $8, %%xmm3               \n\t" /* FRAC_BITS. */
> +             "add $"PTR_SIZE"*2, %1          \n\t"
> +             "psrad $8, %%xmm5               \n\t" /* FRAC_BITS. */
> +             "add $32, %%"REG_S"             \n\t"
> +
> +             "packssdw %%xmm5, %%xmm3        \n\t"
> +             "add %%"REG_c", %0              \n\t"
> +             "packuswb %%xmm7, %%xmm3        \n\t"
> +
> +             "sar $1, %%"REG_c"              \n\t"
> +             "movq %%xmm3, (%%"REG_d",%%"REG_c");\n\t"
> +
> +             "sub $2, %%"REG_b"              \n\t"
> +             "jnz 1b                         \n\t"
> +             :
> +             :
> +             "m"(dst8),"m"(dst_array),"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"(b_h),"c"(src_stride):
> +             "%"REG_b"","%"REG_S"","%"REG_D"","%"REG_d"");

a minor issue: don't use ebx please, it causes the PIC fanboys to flame us
and a major one: REG_c is changed but is not an output operand or on the clobber list
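
only to illustrate the operands (dummy loop, not the real code): let gcc pick
a register for the row counter instead of hardcoding ebx, and mark the stride
as read-write since the asm shifts it:

static inline void operand_example(long src_stride, int b_h)
{
    asm volatile(
        "1:              \n\t"
        "sal $1, %1      \n\t" /* stride is temporarily doubled ...   */
        "sar $1, %1      \n\t" /* ... and restored, but still a write */
        "dec %0          \n\t"
        "jnz 1b          \n\t"
        : "+r"(b_h), "+c"(src_stride)
        :
        : "cc");
}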


[...]
> +
> +static inline void inner_add_yblock_bw_16_obmc_32_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> +                      int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
> +    int y, x;
> +    DWTELEM * dst;
> +    DWTELEM * * dst_array = sb->line + src_y;
> +
> +    asm volatile(
> +             "mov  %5, %%ebx                \n\t"
> +             "mov  %3, %%"REG_S"            \n\t"
> +             "pcmpeqd %%mm4, %%mm4          \n\t"
> +             "pslld $31, %%mm4              \n\t"
> +             "pxor %%mm7, %%mm7             \n\t" /* 0 */
> +             "psrld $24, %%mm4              \n\t" /* FRAC_BITS >> 1 */
> +
> +             "1:                              \n\t"
> +             "movd (%%"REG_S"), %%mm0        \n\t"
> +             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> +             "punpcklbw %%mm7, %%mm0        \n\t"
> +             "movd 16(%%"REG_S"), %%mm1     \n\t"
> +             "punpcklbw %%mm7, %%mm1        \n\t"
> +             "movd (%%"REG_d"), %%mm5       \n\t"
> +             "mov %1, %%"REG_D"             \n\t"
> +             "punpcklbw %%mm7, %%mm5        \n\t"
> +             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> +             "movd (%%"REG_d"), %%mm6       \n\t"
> +             "pmullw %%mm0, %%mm5           \n\t"
> +             "punpcklbw %%mm7, %%mm6        \n\t"
> +             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> +
> +             "movd 512(%%"REG_S"), %%mm0    \n\t"
> +             "pmullw %%mm1, %%mm6           \n\t"
> +             "punpcklbw %%mm7, %%mm0        \n\t"
> +             "movd 528(%%"REG_S"), %%mm1    \n\t"
> +             "punpcklbw %%mm7, %%mm1        \n\t"
> +             "movd (%%"REG_d"), %%mm2       \n\t"
> +             "punpcklbw %%mm7, %%mm2        \n\t"
> +             "mov (%%"REG_a"), %%"REG_d"    \n\t"
> +             "paddusw %%mm5, %%mm6          \n\t"
> +             "mov (%%"REG_D"), %%"REG_D"    \n\t"
> +             "pmullw %%mm0, %%mm2           \n\t"
> +             "movd (%%"REG_d"), %%mm3       \n\t"
> +             "mov %0, %%"REG_d"             \n\t"
> +             "punpcklbw %%mm7, %%mm3        \n\t"
> +             "add %2, %%"REG_D"             \n\t"
> +             "paddusw %%mm2, %%mm6          \n\t"
> +             "pmullw %%mm1, %%mm3           \n\t"
> +             "paddusw %%mm3, %%mm6          \n\t"
> +
> +             "movq (%%"REG_D"), %%mm3       \n\t"
> +             "movq %%mm6, %%mm0             \n\t"
> +             "movq 8(%%"REG_D"), %%mm5      \n\t"
> +             "punpckhwd %%mm7, %%mm6        \n\t"
> +             "movd 20(%%"REG_S"), %%mm1     \n\t"
> +             "punpcklwd %%mm7, %%mm0        \n\t"
> +             "paddd %%mm0, %%mm3            \n\t"
> +             "paddd %%mm6, %%mm5            \n\t"
> +             "punpcklbw %%mm7, %%mm1        \n\t"
> +             "paddd %%mm4, %%mm3            \n\t"
> +             "paddd %%mm4, %%mm5            \n\t"
> +             "movd 4(%%"REG_S"), %%mm0      \n\t"
> +             "psrad $8, %%mm3               \n\t" /* FRAC_BITS. */
> +             "psrad $8, %%mm5               \n\t" /* FRAC_BITS. */
> +
> +             "packssdw %%mm5, %%mm3         \n\t"
> +             "packuswb %%mm7, %%mm3         \n\t"
> +
> +             "movd %%mm3, (%%"REG_d")       \n\t"
> +
> +
> +             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> +             "punpcklbw %%mm7, %%mm0        \n\t"
> +             "movd 4(%%"REG_d"), %%mm5      \n\t"
> +             "punpcklbw %%mm7, %%mm5        \n\t"
> +             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> +             "movd 4(%%"REG_d"), %%mm6      \n\t"
> +             "pmullw %%mm0, %%mm5           \n\t"
> +             "punpcklbw %%mm7, %%mm6        \n\t"
> +
> +             "movd 516(%%"REG_S"), %%mm0    \n\t"
> +             "pmullw %%mm1, %%mm6           \n\t"
> +             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> +             "punpcklbw %%mm7, %%mm0        \n\t"
> +             "movd 532(%%"REG_S"), %%mm1    \n\t"
> +             "punpcklbw %%mm7, %%mm1        \n\t"
> +             "movd 4(%%"REG_d"), %%mm2      \n\t"
> +             "punpcklbw %%mm7, %%mm2        \n\t"
> +             "mov (%%"REG_a"), %%"REG_d"     \n\t"
> +             "paddusw %%mm5, %%mm6          \n\t"
> +             "pmullw %%mm0, %%mm2           \n\t"
> +             "movd 4(%%"REG_d"), %%mm3      \n\t"
> +             "punpcklbw %%mm7, %%mm3        \n\t"
> +             "paddusw %%mm2, %%mm6          \n\t"
> +             "pmullw %%mm1, %%mm3           \n\t"
> +             "paddusw %%mm3, %%mm6          \n\t"
> +             "mov %0, %%"REG_d"             \n\t"
> +
> +             "movq 16(%%"REG_D"), %%mm3     \n\t"
> +             "movq %%mm6, %%mm0             \n\t"
> +             "movq 24(%%"REG_D"), %%mm5     \n\t"
> +             "punpckhwd %%mm7, %%mm6        \n\t"
> +             "punpcklwd %%mm7, %%mm0        \n\t"
> +             "paddd %%mm0, %%mm3            \n\t"
> +             "paddd %%mm6, %%mm5            \n\t"
> +             "paddd %%mm4, %%mm3            \n\t"
> +             "paddd %%mm4, %%mm5            \n\t"
> +             "psrad $8, %%mm3               \n\t" /* FRAC_BITS. */
> +             "psrad $8, %%mm5               \n\t" /* FRAC_BITS. */
> +
> +             "packssdw %%mm5, %%mm3         \n\t"
> +             "packuswb %%mm7, %%mm3         \n\t"
> +
> +             "movd %%mm3, 4(%%"REG_d")      \n\t"
> +
> +
> +
> +             "movd 8(%%"REG_S"), %%mm0      \n\t"
> +             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> +             "punpcklbw %%mm7, %%mm0        \n\t"
> +             "movd 24(%%"REG_S"), %%mm1     \n\t"
> +             "punpcklbw %%mm7, %%mm1        \n\t"
> +             "movd 8(%%"REG_d"), %%mm5      \n\t"
> +             "punpcklbw %%mm7, %%mm5        \n\t"
> +             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> +             "movd 8(%%"REG_d"), %%mm6      \n\t"
> +             "pmullw %%mm0, %%mm5           \n\t"
> +             "punpcklbw %%mm7, %%mm6        \n\t"
> +             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> +
> +             "movd 520(%%"REG_S"), %%mm0    \n\t"
> +             "pmullw %%mm1, %%mm6           \n\t"
> +             "punpcklbw %%mm7, %%mm0        \n\t"
> +             "movd 536(%%"REG_S"), %%mm1    \n\t"
> +             "punpcklbw %%mm7, %%mm1        \n\t"
> +             "movd 8(%%"REG_d"), %%mm2      \n\t"
> +             "punpcklbw %%mm7, %%mm2        \n\t"
> +             "mov (%%"REG_a"), %%"REG_d"    \n\t"
> +             "paddusw %%mm5, %%mm6          \n\t"
> +             "pmullw %%mm0, %%mm2           \n\t"
> +             "movd 8(%%"REG_d"), %%mm3      \n\t"
> +             "mov %0, %%"REG_d"             \n\t"
> +             "punpcklbw %%mm7, %%mm3        \n\t"
> +             "paddusw %%mm2, %%mm6          \n\t"
> +             "pmullw %%mm1, %%mm3           \n\t"
> +             "paddusw %%mm3, %%mm6          \n\t"
> +
> +             "movq 32(%%"REG_D"), %%mm3     \n\t"
> +             "movq %%mm6, %%mm0             \n\t"
> +             "movq 40(%%"REG_D"), %%mm5     \n\t"
> +             "punpckhwd %%mm7, %%mm6        \n\t"
> +             "movd 28(%%"REG_S"), %%mm1     \n\t"
> +             "punpcklwd %%mm7, %%mm0        \n\t"
> +             "paddd %%mm0, %%mm3            \n\t"
> +             "paddd %%mm6, %%mm5            \n\t"
> +             "punpcklbw %%mm7, %%mm1        \n\t"
> +             "paddd %%mm4, %%mm3            \n\t"
> +             "paddd %%mm4, %%mm5            \n\t"
> +             "movd 12(%%"REG_S"), %%mm0     \n\t"
> +             "psrad $8, %%mm3               \n\t" /* FRAC_BITS. */
> +             "psrad $8, %%mm5               \n\t" /* FRAC_BITS. */
> +
> +             "packssdw %%mm5, %%mm3         \n\t"
> +             "packuswb %%mm7, %%mm3         \n\t"
> +
> +             "movd %%mm3, 8(%%"REG_d")      \n\t"
> +
> +
> +             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> +             "punpcklbw %%mm7, %%mm0        \n\t"
> +             "movd 12(%%"REG_d"), %%mm5     \n\t"
> +             "punpcklbw %%mm7, %%mm5        \n\t"
> +             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> +             "movd 12(%%"REG_d"), %%mm6     \n\t"
> +             "pmullw %%mm0, %%mm5           \n\t"
> +             "punpcklbw %%mm7, %%mm6        \n\t"
> +
> +             "movd 524(%%"REG_S"), %%mm0    \n\t"
> +             "pmullw %%mm1, %%mm6           \n\t"
> +             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> +             "punpcklbw %%mm7, %%mm0        \n\t"
> +             "movd 540(%%"REG_S"), %%mm1    \n\t"
> +             "punpcklbw %%mm7, %%mm1        \n\t"
> +             "movd 12(%%"REG_d"), %%mm2     \n\t"
> +             "punpcklbw %%mm7, %%mm2        \n\t"
> +             "mov (%%"REG_a"), %%"REG_d"    \n\t"
> +             "paddusw %%mm5, %%mm6          \n\t"
> +             "pmullw %%mm0, %%mm2           \n\t"
> +             "movd 12(%%"REG_d"), %%mm3     \n\t"
> +             "punpcklbw %%mm7, %%mm3        \n\t"
> +             "paddusw %%mm2, %%mm6          \n\t"
> +             "pmullw %%mm1, %%mm3           \n\t"
> +             "paddusw %%mm3, %%mm6          \n\t"
> +             "mov %0, %%"REG_d"             \n\t"
> +
> +             "movq 48(%%"REG_D"), %%mm3     \n\t"
> +             "movq %%mm6, %%mm0             \n\t"
> +             "movq 56(%%"REG_D"), %%mm5     \n\t"
> +             "punpckhwd %%mm7, %%mm6        \n\t"
> +             "punpcklwd %%mm7, %%mm0        \n\t"
> +             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
> +             "paddd %%mm0, %%mm3            \n\t"
> +             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
> +             "paddd %%mm6, %%mm5            \n\t"
> +             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
> +             "paddd %%mm4, %%mm3            \n\t"
> +             "add %%"REG_c", (%%"REG_a")    \n\t"
> +             "paddd %%mm4, %%mm5            \n\t"
> +             "psrad $8, %%mm3               \n\t" /* FRAC_BITS. */
> +             "add $"PTR_SIZE"*1, %1         \n\t"
> +             "psrad $8, %%mm5               \n\t" /* FRAC_BITS. */
> +             "add $32, %%"REG_S"            \n\t"
> +
> +             "packssdw %%mm5, %%mm3         \n\t"
> +             "add %%"REG_c", %0             \n\t"
> +             "packuswb %%mm7, %%mm3         \n\t"
> +
> +             "movd %%mm3, 12(%%"REG_d")     \n\t"
> +
> +             "dec %%"REG_b"                 \n\t"
> +             "jnz 1b                        \n\t"
> +             "emms                          \t\t"

is the emms here really needed?

[...]

-- 
Michael




