[FFmpeg-devel] [PATCH 5/6] x86: huffyuvdsp: MMX add_hfyu_left_pred_bgr32

Christophe Gisquet christophe.gisquet at gmail.com
Thu May 29 11:10:40 CEST 2014


          C    MMX  MMX2
Cycles: 3092  1356  1060

Unrolling is a ~15% gain.
---
 libavcodec/x86/huffyuvdsp.asm    | 37 +++++++++++++++++++++++++++++++++++++
 libavcodec/x86/huffyuvdsp_init.c | 10 +++++++++-
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
index 9806fed..38309b7 100644
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -217,3 +217,40 @@ cglobal add_bytes, 3,4,4, dst, src, w, size
     jmp .2
 .end:
     REP_RET
+
+; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
+;                               intptr_t w, uint8_t *left)
+%macro LEFT_BGR32 0
+cglobal add_hfyu_left_pred_bgr32, 4,4,0, dst, src, w, left ; running bytewise sum of 4-byte groups from src into dst, seeded by and written back to *left
+    shl           wq, 2                  ; w *= 4: element count -> byte count (4 bytes per element)
+    movd          m0, [leftq]            ; m0 low dword = the 4 bytes at *left (initial running sum)
+    lea         dstq, [dstq + wq]        ; dst now points one past the last output byte
+    lea         srcq, [srcq + wq]        ; src now points one past the last input byte
+    neg           wq                     ; wq = negative byte offset from the end, counts up toward 0
+.loop:                                   ; each pass consumes/produces mmsize bytes (two 4-byte groups with INIT_MMX)
+    movd          m1, [srcq+wq]          ; m1 = first 4-byte group (s0), upper half zeroed by movd
+    movd          m2, [srcq+wq+mmsize/2] ; m2 = second 4-byte group (s1), upper half zeroed by movd
+    paddb         m0, m1                 ; low dword: o0 = previous sum + s0 (per byte, wrapping)
+    punpckldq     m0, m0                 ; copy o0 into both dwords; discards the stale high dword
+    paddb         m0, m2                 ; low dword: o1 = o0 + s1; high dword stays o0 (m2 high half is 0)
+%if cpuflag(mmxext)
+    pshufw        m1, m0, q1032          ; swap the two dwords: m1 = o0 (low), o1 (high) = memory order
+%else
+    mova          m1, m0                 ; plain-MMX dword swap: take two copies of m0 ...
+    mova          m2, m0                 ; ... (m2 is free again, s1 was already added)
+    psllq         m1, 32                 ; m1 = o1 moved to the high dword, low dword 0
+    psrlq         m2, 32                 ; m2 = o0 moved to the low dword, high dword 0
+    por           m1, m2                 ; m1 = o0 (low), o1 (high); m0 still has o1 in its low dword for the next pass
+%endif
+    mova   [dstq+wq], m1                 ; store both results (mmsize bytes) to dst
+    add           wq, mmsize             ; advance the offset; sets the flags tested by jl
+    jl         .loop                     ; repeat while offset < 0. NOTE(review): body always runs once, and an odd w touches 4 bytes past src/dst end - presumably buffers are padded; confirm
+    movd          m0, [dstq-4]           ; reload the last in-range 4-byte output from dst (not from m0)
+    movd     [leftq], m0                 ; write it back to *left for the caller
+    REP_RET
+%endmacro
+
+INIT_MMX mmx                             ; plain MMX build: the %else dword-swap path is assembled
+LEFT_BGR32
+INIT_MMX mmxext                          ; MMXEXT build: the pshufw path is assembled
+LEFT_BGR32
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
index 1a42b87..2abd8d5 100644
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -40,6 +40,11 @@ int  ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
 int  ff_add_hfyu_left_pred_sse4(uint8_t *dst, const uint8_t *src,
                                 int w, int left);
 
+void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src,
+                                     intptr_t w, uint8_t *left);
+void ff_add_hfyu_left_pred_bgr32_mmxext(uint8_t *dst, const uint8_t *src,
+                                        intptr_t w, uint8_t *left);
+
 av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -49,13 +54,16 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
         c->add_hfyu_median_pred = ff_add_hfyu_median_pred_cmov;
 #endif
 
-    if (EXTERNAL_MMX(cpu_flags))
+    if (EXTERNAL_MMX(cpu_flags)) {
         c->add_bytes = ff_add_bytes_mmx;
+        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx;
+    }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         /* slower than cmov version on AMD */
         if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
             c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
+        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmxext;
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
-- 
1.8.0.msysgit.0



More information about the ffmpeg-devel mailing list