[FFmpeg-devel] [PATCH] MMX/SSE2 qpel functions for RV40

Michael Niedermayer michaelni
Thu Jan 8 00:06:52 CET 2009


On Mon, Jan 05, 2009 at 07:35:30PM +0100, Mathieu Velten wrote:
> 2009/1/5 Michael Niedermayer <michaelni at gmx.at>:
> > you can access constants like ff_pw_5 through MANGLE() thus bypassing
> > gccs register deallocator
> 
> thanks, new patch attached based on the first version with MANGLE(ff_pw_5).
[...]


> +    for(i=0; i<h; i++) {\
> +        __asm__ volatile(\
> +            "pxor %%mm7, %%mm7             \n\t"\

This (zeroing mm7) can be done outside the loop; also, the loop should be in asm, not C.


> +            "movq "MANGLE(ff_pw_5)", %%mm6 \n\t"  /* mm6 = ff_pw_5 */\
> +            "movq  -2(%0), %%mm0           \n\t"\
> +            "movq   3(%0), %%mm3           \n\t"\
> +            "movq %%mm0, %%mm2             \n\t"\
> +            "movq %%mm3, %%mm1             \n\t"\
> +            "punpcklbw %%mm7, %%mm0        \n\t"  /* mm0 = src[-2..1] */\
> +            "punpcklbw %%mm7, %%mm3        \n\t"  /* mm3 = src[3..6] */\
> +            "punpckhbw %%mm7, %%mm2        \n\t"  /* mm2 = src[2..5] */\
> +            "punpckhbw %%mm7, %%mm1        \n\t"  /* mm1 = src[7..10] */\
> +            "paddw %%mm3, %%mm0            \n\t"\
> +            "paddw %%mm2, %%mm1            \n\t"  /* mm0/mm1 = src[-2..5] + src[3..10] */\
> +            "movd   -1(%0), %%mm4          \n\t"\
> +            "movd    6(%0), %%mm5          \n\t"\
> +            "punpcklbw %%mm7, %%mm4        \n\t"  /* mm4 = src[-1..2] */\
> +            "punpcklbw %%mm7, %%mm5        \n\t"  /* mm5 = src[6..9] */\
> +            "paddw %%mm4, %%mm2            \n\t"\
> +            "paddw %%mm5, %%mm3            \n\t"\
> +            "pmullw %%mm6, %%mm2           \n\t"\
> +            "pmullw %%mm6, %%mm3           \n\t"  /* mm2/mm3 = (src[-1..6]+src[2..9]) * 5 */\
> +            "movq %2, %%mm6                \n\t"  /* mm6 = ff_pw_C1 */\

> +            "movq   0(%0), %%mm4           \n\t"\

the 0 is unneeded


> +            "movq %%mm4, %%mm5             \n\t"\
> +            "punpcklbw %%mm7, %%mm4        \n\t"  /* mm4 = src[0..3] */\
> +            "punpckhbw %%mm7, %%mm5        \n\t"  /* mm5 = src[4..7] */\
> +            "pmullw %%mm6, %%mm4           \n\t"\
> +            "pmullw %%mm6, %%mm5           \n\t"  /* mm4/mm5 = src[0..7] * C1 */\
> +            "movq %3, %%mm6                \n\t"  /* mm6 = ff_pw_C2 */\
> +            "paddw %%mm4, %%mm0            \n\t"\
> +            "paddw %%mm5, %%mm1            \n\t"  /* mm0/mm1 += src[0..7] * C1 */\
> +            "movq   1(%0), %%mm4           \n\t"\
> +            "movq %%mm4, %%mm5             \n\t"\
> +            "punpcklbw %%mm7, %%mm4        \n\t"  /* mm4 = src[1..4] */\
> +            "punpckhbw %%mm7, %%mm5        \n\t"  /* mm5 = src[5..8] */\
> +            "pmullw %%mm6, %%mm4           \n\t"\
> +            "pmullw %%mm6, %%mm5           \n\t"  /* mm4/mm5 = src[1..8] * C2 */\
> +            "movq %4, %%mm6                \n\t"  /* mm6 = rnd_reg */\
> +            "paddw %%mm4, %%mm0            \n\t"\
> +            "paddw %%mm5, %%mm1            \n\t"  /* mm0/mm1 += src[1..8] * C2 */\
> +            "movd %5, %%mm5                \n\t"  /* mm5 = SHIFT */\
> +            "psubw %%mm2, %%mm0            \n\t"\
> +            "psubw %%mm3, %%mm1            \n\t"  /* mm0/mm1 -= (src[-1..6]+src[2..9]) * 5 */\
> +            "paddw %%mm6, %%mm0            \n\t"\
> +            "paddw %%mm6, %%mm1            \n\t"  /* mm0/mm1 += rnd_reg */\

> +            "psraw %%mm5, %%mm0            \n\t"\
> +            "psraw %%mm5, %%mm1            \n\t"  /* mm0/mm1 >>= SHIFT */\

I wonder if SHIFT would not be better as a constant ...
(No, I am not saying change it; only do so if it's faster and doesn't lead to
 much larger object files.)


> +            "packuswb %%mm1, %%mm0         \n\t"\
> +            OP(%%mm0, (%1),%%mm5, q)\
> +            : "+a"(src), "+c"(dst)\
> +            : "m"(*C1_reg), "m"(*C2_reg), "m"(*rnd_reg), "D"((x86_reg)SHIFT)\
> +            : "memory"\

C1_reg, C2_reg, rnd_reg and SHIFT could be in a table, thus requiring only 1 reg
to access them.

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

I have often repented speaking, but never of holding my tongue.
-- Xenocrates
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090108/b275b106/attachment.pgp>



More information about the ffmpeg-devel mailing list