[FFmpeg-cvslog] x86: move XOP emulation code back to x86inc

James Almer git at videolan.org
Mon Aug 3 22:12:26 CEST 2015


ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Mon Aug  3 03:28:37 2015 -0300| [5750d6c5e9d184488f4dc0f9e81cbcc28cb2f2d1] | committer: James Almer

x86: move XOP emulation code back to x86inc

Only two functions that use xop multiply-accumulate instructions where the
first operand is the same as the fourth actually took advantage of the macros.

This further reduces differences with x264's x86inc.

Reviewed-by: Ronald S. Bultje <rsbultje at gmail.com>
Signed-off-by: James Almer <jamrial at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5750d6c5e9d184488f4dc0f9e81cbcc28cb2f2d1
---

 libavcodec/x86/flacdsp.asm     |    9 +++++++++
 libavutil/x86/x86inc.asm       |   16 ++++++++++++++++
 libavutil/x86/x86util.asm      |   19 -------------------
 libswresample/x86/resample.asm |    7 ++++++-
 4 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
index 901c440..7138611 100644
--- a/libavcodec/x86/flacdsp.asm
+++ b/libavcodec/x86/flacdsp.asm
@@ -25,6 +25,15 @@
 
 SECTION .text
 
+%macro PMACSDQL 5 ; multiply-accumulate of %2 and %3 into %1; %4 and %5 are accepted but not referenced in the body
+%if cpuflag(xop)
+    pmacsdql %1, %2, %3, %1 ; XOP path: 4-operand form with %1 as both destination and addend
+%else
+    pmuldq   %2, %3 ; fallback: multiply in place, which overwrites %2 with the product
+    paddq    %1, %2 ; then add that product into the accumulator %1
+%endif
+%endmacro
+
 %macro LPC_32 1
 INIT_XMM %1
 cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index d4c5e69..28a2d87 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1427,6 +1427,22 @@ AVX_INSTR pfmul, 3dnow, 1, 0, 1
 %undef i
 %undef j
 
+%macro FMA_INSTR 3 ; %1 = XOP mnemonic without the v prefix (also the name of the macro defined below), %2 = multiply insn, %3 = add insn
+    %macro %1 4-7 %1, %2, %3
+        %if cpuflag(xop) ; XOP enabled: emit the 4-operand instruction itself
+            v%5 %1, %2, %3, %4
+        %else ; otherwise emulate it with the multiply followed by the add
+            %6 %1, %2, %3 ; the product is written to the 1st operand
+            %7 %1, %4 ; the 4th operand is added in. NOTE(review): no special case for a 4th operand equal to the 1st; the multiply above would overwrite it before this add
+        %endif
+    %endmacro
+%endmacro
+
+FMA_INSTR  pmacsww,  pmullw, paddw ; fallback pair: pmullw then paddw
+FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmadcswd, pmaddwd, paddd ; fallback pair: pmaddwd then paddd
+
 ; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
 ; This lets us use tzcnt without bumping the yasm version requirement yet.
 %define tzcnt rep bsf
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index d6702c1..bf64d17 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -765,25 +765,6 @@
 %endif
 %endmacro
 
-%macro PMA_EMU 4
-    %macro %1 5-8 %2, %3, %4
-        %if cpuflag(xop)
-            v%6 %1, %2, %3, %4
-        %elifidn %1, %4
-            %7 %5, %2, %3
-            %8 %1, %4, %5
-        %else
-            %7 %1, %2, %3
-            %8 %1, %4
-        %endif
-    %endmacro
-%endmacro
-
-PMA_EMU  PMACSWW,  pmacsww,  pmullw, paddw
-PMA_EMU  PMACSDD,  pmacsdd,  pmulld, paddd ; sse4 emulation
-PMA_EMU PMACSDQL, pmacsdql,  pmuldq, paddq ; sse4 emulation
-PMA_EMU PMADCSWD, pmadcswd, pmaddwd, paddd
-
 ; Wrapper for non-FMA version of fmaddps
 %macro FMULADD_PS 5
     %if cpuflag(fma3) || cpuflag(fma4)
diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
index a57ff37..4989aa6 100644
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@@ -176,7 +176,12 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
 .inner_loop:
     movu                          m1, [srcq+min_filter_count_x4q*1]
 %ifidn %1, int16
-    PMADCSWD                      m0, m1, [filterq+min_filter_count_x4q*1], m0, m1
+%if cpuflag(xop)
+    vpmadcswd                     m0, m1, [filterq+min_filter_count_x4q*1], m0
+%else
+    pmaddwd                       m1, [filterq+min_filter_count_x4q*1]
+    paddd                         m0, m1
+%endif
 %else ; float/double
 %if cpuflag(fma4) || cpuflag(fma3)
     fmaddp%4                      m0, m1, [filterq+min_filter_count_x4q*1], m0



More information about the ffmpeg-cvslog mailing list