[FFmpeg-cvslog] x86inc: activate REP_RET automatically

Tue Oct 8 10:33:58 CEST 2013

ffmpeg | branch: master | Loren Merritt <pengvado at akuvian.org> | Wed Sep 11 17:49:20 2013 +0200| [25cb0c1a1e66edacc1667acf6818f524c0997f10] | committer: Derek Buitenhuis

x86inc: activate REP_RET automatically

Now RET checks whether it immediately follows a branch, so the
programmer doesn't have to keep track of that condition. REP_RET
is still needed manually when it's a branch target, but that's
much rarer.

The implementation involves lots of spurious labels, but that's OK
because we strip them.

Signed-off-by: Derek Buitenhuis <derek.buitenhuis at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=25cb0c1a1e66edacc1667acf6818f524c0997f10
---

 libavutil/x86/x86inc.asm |   36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index f4be5d3..07512d1 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -135,8 +135,7 @@ CPUNOP amdnop
 ; Pops anything that was pushed by PROLOGUE, and returns.
 
 ; REP_RET:
-; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
-; which are slow when a normal ret follows a branch.
+; Use this instead of RET if it's a branch target.
 
 ; registers:
 ; rN and rNq are the native-size register holding function argument N
@@ -484,7 +483,7 @@ DECLARE_REG 14, R15, 120
 %if mmsize == 32
     vzeroupper
 %endif
-    ret
+    AUTO_REP_RET
 %endmacro
 
 %elif ARCH_X86_64 ; *nix x64 ;=============================================
@@ -531,7 +530,7 @@ DECLARE_REG 14, R15, 72
 %if mmsize == 32
     vzeroupper
 %endif
-    ret
+    AUTO_REP_RET
 %endmacro
 
 %else ; X86_32 ;==============================================================
@@ -587,7 +586,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %if mmsize == 32
     vzeroupper
 %endif
-    ret
+    AUTO_REP_RET
 %endmacro
 
 %endif ;======================================================================
@@ -601,6 +600,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %endmacro
 %endif
 
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
 %macro REP_RET 0
     %if has_epilogue
         RET
@@ -609,6 +612,29 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
     %endif
 %endmacro
 
+%define last_branch_adr $$
+%macro AUTO_REP_RET 0
+    %ifndef cpuflags
+        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
+    %elif notcpuflag(ssse3)
+        times ((last_branch_adr-$)>>31)+1 rep
+    %endif
+    ret
+%endmacro
+
+%macro BRANCH_INSTR 0-* ; args: the branch mnemonics to wrap, one wrapper macro is defined per name
+    %rep %0
+        %macro %1 1-2 %1
+            %2 %1 ; emit the real instruction: mnemonic (param 2, defaulted to the macro's own name) then operand (param 1)
+            %%branch_instr: ; macro-local label placed directly after the branch just emitted
+            %xdefine last_branch_adr %%branch_instr
+        %endmacro
+        %rotate 1
+    %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
 %macro TAIL_CALL 2 ; callee, is_nonadjacent
     %if has_epilogue
         call %1