[Ffmpeg-devel] [PATCH] minor H.264 asm optimization
Reimar Döffinger
Reimar.Doeffinger
Thu Feb 22 12:07:18 CET 2007
Hello,
attached patch reduces code size quite a lot for me, since my gcc (4.1.2)
stupidly does loop unrolling.
Note that this was tested only quickly, only on AMD64 and not properly
benchmarked.
But since I do not have much experience with asm, I wanted to suggest
this here before putting too much effort into it.
Greetings,
Reimar Döffinger
P.S.: I will never get used to having to write $4 and not 4... the first try
always gives beautiful crashes - well, at least that way I can see that my
sample actually uses the code ;-)
-------------- next part --------------
Index: libavcodec/i386/h264dsp_mmx.c
===================================================================
--- libavcodec/i386/h264dsp_mmx.c (revision 8067)
+++ libavcodec/i386/h264dsp_mmx.c (working copy)
@@ -670,18 +670,18 @@
"add %3, %1 \n\t"
#define QPEL_H264HV(A,B,C,D,E,F,OF)\
- "movd (%0), "#F" \n\t"\
+ "movd (%%"REG_a"), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"paddw "#D", %%mm6 \n\t"\
"psllw $2, %%mm6 \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psubw "#E", %%mm6 \n\t"\
- "pmullw %3, %%mm6 \n\t"\
- "add %2, %0 \n\t"\
+ "pmullw "MANGLE(ff_pw_5)", %%mm6\n\t"\
+ "add %%"REG_S", %%"REG_a" \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"paddw "#F", "#A" \n\t"\
"paddw "#A", %%mm6 \n\t"\
- "movq %%mm6, "#OF"(%1) \n\t"
+ "movq %%mm6, "#OF"(%%"REG_c")\n\t"
#define QPEL_H264(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
@@ -800,20 +800,23 @@
static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
int h=4;\
int w=3;\
- src -= 2*srcStride+2;\
- while(w--){\
- asm volatile(\
+ uint8_t *stmp;\
+ src -= 2*srcStride+2;\
+ stmp = src;\
+ asm volatile(\
+ "1: \n\t"\
+ "add $4, %3 \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
- "add %2, %0 \n\t"\
+ "add %4, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
- "add %2, %0 \n\t"\
+ "add %4, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
- "add %2, %0 \n\t"\
+ "add %4, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
- "add %2, %0 \n\t"\
+ "add %4, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
- "add %2, %0 \n\t"\
+ "add %4, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
@@ -823,14 +826,15 @@
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
+ "add $8, %1 \n\t"\
+ "mov %3, %0 \n\t"\
+ "dec %2 \n\t"\
+ "jnz 1b \n\t"\
\
- : "+a"(src)\
- : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
+ : "+a"(src), "+c"(tmp), "+g"(w), "+r"(stmp)\
+ : "S"((long)srcStride)\
: "memory"\
- );\
- tmp += 4;\
- src += 4 - 9*srcStride;\
- }\
+ );\
tmp -= 3*4;\
asm volatile(\
"movq %4, %%mm6 \n\t"\
More information about the ffmpeg-devel
mailing list