[FFmpeg-cvslog] x86/synth_filter: Revert the switch to float ops with SSE2
James Almer
git at videolan.org
Sun Mar 2 12:28:05 CET 2014
ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Sat Mar 1 23:46:27 2014 -0300| [884e085d1ea34f2f773b9589ae8e8aa9ca91b358] | committer: Michael Niedermayer
x86/synth_filter: Revert the switch to float ops with SSE2
This reverts the changes that commits 64672098361361cd15d37e36f747ab44de5b80ca
and 68c3ed936a76c3ff7738f602fa90237ac7e3ce08 made to the SSE2 version,
which caused a slowdown of about 5 cycles.
Signed-off-by: James Almer <jamrial at gmail.com>
Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=884e085d1ea34f2f773b9589ae8e8aa9ca91b358
---
libavcodec/x86/dcadsp.asm | 21 +++++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 972ce1e..a6a4582 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -199,6 +199,14 @@ INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1
+%macro SETZERO 1 ; SETZERO reg: clear the register passed as %1 to all-zero bits
+%if cpuflag(sse2) && notcpuflag(avx)
+ pxor %1, %1 ; SSE2-without-AVX build: zero via the integer XOR instruction
+%else
+ xorps %1, %1, %1 ; all other builds: zero via the float XOR, three-operand form
+%endif
+%endmacro
+
%macro SHUF 3
%if cpuflag(avx)
mova %3, [%2 - 16]
@@ -265,7 +273,12 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
+%if cpuflag(sse2) && notcpuflag(avx)
+ movd m0, scalem
+ SPLATD m0
+%else
VBROADCASTSS m0, scalem
+%endif
; Make sure offset is in a register and not on the stack
%define OFFQ r4q
%else
@@ -290,8 +303,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
%endif
.mainloop
; m1 = a m2 = b m3 = c m4 = d
- xorps m3, m3, m3
- xorps m4, m4, m4
+ SETZERO m3
+ SETZERO m4
mova m1, [buf2 + i]
mova m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
@@ -308,8 +321,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
%define ptr2 r7q ; must be loaded
%define win r8q
%define j r9q
- xorps m9, m9, m9
- xorps m10, m10, m10
+ SETZERO m9
+ SETZERO m10
mova m7, [buf2 + i + mmsize]
mova m8, [buf2 + i + mmsize + 16 * 4]
lea win, [windowq + i]
More information about the ffmpeg-cvslog
mailing list