[FFmpeg-devel] [PATCH] x86/synth_filter: Revert the switch to float ops with SSE2
James Almer
jamrial at gmail.com
Sun Mar 2 03:46:27 CET 2014
This reverts the changes that commits 64672098361361cd15d37e36f747ab44de5b80ca
and 68c3ed936a76c3ff7738f602fa90237ac7e3ce08 made to the SSE2 version,
which slowed it down by about 5 cycles.
Signed-off-by: James Almer <jamrial at gmail.com>
---
I screwed up my earlier tests: I only compared the SSE performance against the
SSE2 performance of the function after the SSE implementation was added, rather
than comparing SSE2 before and after it to see whether the switch from pxor to
xorps and from pshufd to shufps had any negative effect.
---
libavcodec/x86/dcadsp.asm | 21 +++++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 972ce1e..a6a4582 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -199,6 +199,14 @@ INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1
+%macro SETZERO 1
+%if cpuflag(sse2) && notcpuflag(avx)
+ pxor %1, %1
+%else
+ xorps %1, %1, %1
+%endif
+%endmacro
+
%macro SHUF 3
%if cpuflag(avx)
mova %3, [%2 - 16]
@@ -265,7 +273,12 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
+%if cpuflag(sse2) && notcpuflag(avx)
+ movd m0, scalem
+ SPLATD m0
+%else
VBROADCASTSS m0, scalem
+%endif
; Make sure offset is in a register and not on the stack
%define OFFQ r4q
%else
@@ -290,8 +303,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
%endif
.mainloop
; m1 = a m2 = b m3 = c m4 = d
- xorps m3, m3, m3
- xorps m4, m4, m4
+ SETZERO m3
+ SETZERO m4
mova m1, [buf2 + i]
mova m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
@@ -308,8 +321,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
%define ptr2 r7q ; must be loaded
%define win r8q
%define j r9q
- xorps m9, m9, m9
- xorps m10, m10, m10
+ SETZERO m9
+ SETZERO m10
mova m7, [buf2 + i + mmsize]
mova m8, [buf2 + i + mmsize + 16 * 4]
lea win, [windowq + i]
--
1.8.3.2
More information about the ffmpeg-devel mailing list