[FFmpeg-devel] [PATCH] x86/synth_filter: Revert the switch to float ops with SSE2

James Almer jamrial at gmail.com
Sun Mar 2 03:46:27 CET 2014


This reverts the changes that commits 64672098361361cd15d37e36f747ab44de5b80ca
and 68c3ed936a76c3ff7738f602fa90237ac7e3ce08 made to the SSE2 version,
which caused a slowdown of about 5 cycles.

Signed-off-by: James Almer <jamrial at gmail.com>
---
I screwed up my earlier tests: I only measured SSE performance against SSE2
performance of the function after the SSE implementation was added, rather
than comparing SSE2 before and after it to see whether the switch from pxor
to xorps and from pshufd to shufps had any negative effect.
---
 libavcodec/x86/dcadsp.asm | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 972ce1e..a6a4582 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -199,6 +199,14 @@ INIT_XMM sse
 DCA_LFE_FIR 0
 DCA_LFE_FIR 1
 
+%macro SETZERO 1
+%if cpuflag(sse2) && notcpuflag(avx)
+    pxor          %1, %1
+%else
+    xorps         %1, %1, %1
+%endif
+%endmacro
+
 %macro SHUF 3
 %if cpuflag(avx)
     mova          %3, [%2 - 16]
@@ -265,7 +273,12 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                               synth_buf, synth_buf2, window, out, off, scale
 %define scale m0
 %if ARCH_X86_32 || WIN64
+%if cpuflag(sse2) && notcpuflag(avx)
+    movd          m0, scalem
+    SPLATD        m0
+%else
     VBROADCASTSS  m0, scalem
+%endif
 ; Make sure offset is in a register and not on the stack
 %define OFFQ  r4q
 %else
@@ -290,8 +303,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
 %endif
 .mainloop
     ; m1 = a  m2 = b  m3 = c  m4 = d
-    xorps         m3, m3, m3
-    xorps         m4, m4, m4
+    SETZERO       m3
+    SETZERO       m4
     mova          m1, [buf2 + i]
     mova          m2, [buf2 + i + 16 * 4]
 %if ARCH_X86_32
@@ -308,8 +321,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
 %define ptr2     r7q ; must be loaded
 %define win      r8q
 %define j        r9q
-    xorps         m9, m9, m9
-    xorps        m10, m10, m10
+    SETZERO       m9
+    SETZERO      m10
     mova          m7, [buf2 + i + mmsize]
     mova          m8, [buf2 + i + mmsize + 16 * 4]
     lea          win, [windowq + i]
-- 
1.8.3.2



More information about the ffmpeg-devel mailing list