[Ffmpeg-devel] [PATCH] minor H.264 asm optimization

Reimar Döffinger Reimar.Doeffinger
Thu Feb 22 12:07:18 CET 2007


Hello,
the attached patch reduces code size quite a lot for me, since my gcc (4.1.2)
stupidly does loop unrolling.
Note that this was tested only quickly, only on AMD64, and not properly
benchmarked.
But since I do not have that much experience with asm, I wanted to suggest
this here before making too much of an effort.

Greetings,
Reimar Döffinger

P.S.: I will never get used to having to write $4 and not 4... the first try
always gives beautiful crashes - well, at least that way I can see that my
sample actually uses the code ;-)
-------------- next part --------------
Index: libavcodec/i386/h264dsp_mmx.c
===================================================================
--- libavcodec/i386/h264dsp_mmx.c	(revision 8067)
+++ libavcodec/i386/h264dsp_mmx.c	(working copy)
@@ -670,18 +670,18 @@
         "add %3, %1                 \n\t"
 
 #define QPEL_H264HV(A,B,C,D,E,F,OF)\
-        "movd (%0), "#F"            \n\t"\
+        "movd (%%"REG_a"), "#F"     \n\t"\
         "movq "#C", %%mm6           \n\t"\
         "paddw "#D", %%mm6          \n\t"\
         "psllw $2, %%mm6            \n\t"\
         "psubw "#B", %%mm6          \n\t"\
         "psubw "#E", %%mm6          \n\t"\
-        "pmullw %3, %%mm6           \n\t"\
-        "add %2, %0                 \n\t"\
+        "pmullw "MANGLE(ff_pw_5)", %%mm6\n\t"\
+        "add %%"REG_S", %%"REG_a"   \n\t"\
         "punpcklbw %%mm7, "#F"      \n\t"\
         "paddw "#F", "#A"           \n\t"\
         "paddw "#A", %%mm6          \n\t"\
-        "movq %%mm6, "#OF"(%1)      \n\t"
+        "movq %%mm6, "#OF"(%%"REG_c")\n\t"
 
 #define QPEL_H264(OPNAME, OP, MMX)\
 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
@@ -800,20 +800,23 @@
 static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
     int h=4;\
     int w=3;\
-    src -= 2*srcStride+2;\
-    while(w--){\
-        asm volatile(\
+    uint8_t *stmp;\
+    src -= 2*srcStride+2;\  
+    stmp = src;\
+    asm volatile(\
+            "1:                     \n\t"\
+            "add $4, %3             \n\t"\
             "pxor %%mm7, %%mm7      \n\t"\
             "movd (%0), %%mm0       \n\t"\
-            "add %2, %0             \n\t"\
+            "add %4, %0             \n\t"\
             "movd (%0), %%mm1       \n\t"\
-            "add %2, %0             \n\t"\
+            "add %4, %0             \n\t"\
             "movd (%0), %%mm2       \n\t"\
-            "add %2, %0             \n\t"\
+            "add %4, %0             \n\t"\
             "movd (%0), %%mm3       \n\t"\
-            "add %2, %0             \n\t"\
+            "add %4, %0             \n\t"\
             "movd (%0), %%mm4       \n\t"\
-            "add %2, %0             \n\t"\
+            "add %4, %0             \n\t"\
             "punpcklbw %%mm7, %%mm0 \n\t"\
             "punpcklbw %%mm7, %%mm1 \n\t"\
             "punpcklbw %%mm7, %%mm2 \n\t"\
@@ -823,14 +826,15 @@
             QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
             QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
             QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
+            "add $8, %1             \n\t"\
+            "mov %3, %0             \n\t"\
+            "dec %2                 \n\t"\
+            "jnz 1b                 \n\t"\
              \
-            : "+a"(src)\
-            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
+            : "+a"(src), "+c"(tmp), "+g"(w), "+r"(stmp)\
+            : "S"((long)srcStride)\
             : "memory"\
-        );\
-        tmp += 4;\
-        src += 4 - 9*srcStride;\
-    }\
+    );\
     tmp -= 3*4;\
     asm volatile(\
         "movq %4, %%mm6             \n\t"\



More information about the ffmpeg-devel mailing list