diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2013-10-08 10:27:20 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2013-10-08 10:27:30 +0200 |
commit | c1488fab3dd402d6d26b2b9124ab448715a8f01f (patch) | |
tree | 1078413947569e0b50703b94a0304d78002d8166 /libavutil/x86/x86inc.asm | |
parent | ba9c557b92c40ba97753354299780aa6ed646ff1 (diff) | |
parent | 25cb0c1a1e66edacc1667acf6818f524c0997f10 (diff) | |
download | ffmpeg-c1488fab3dd402d6d26b2b9124ab448715a8f01f.tar.gz |
Merge commit '25cb0c1a1e66edacc1667acf6818f524c0997f10'
* commit '25cb0c1a1e66edacc1667acf6818f524c0997f10':
x86inc: activate REP_RET automatically
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavutil/x86/x86inc.asm')
-rw-r--r-- | libavutil/x86/x86inc.asm | 36 |
1 files changed, 31 insertions, 5 deletions
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index b9f5e818bc..ab3ca6c186 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -154,8 +154,7 @@ CPUNOP amdnop ; Pops anything that was pushed by PROLOGUE, and returns. ; REP_RET: -; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons -; which are slow when a normal ret follows a branch. +; Use this instead of RET if it's a branch target. ; registers: ; rN and rNq are the native-size register holding function argument N @@ -503,7 +502,7 @@ DECLARE_REG 14, R15, 120 %if mmsize == 32 vzeroupper %endif - ret + AUTO_REP_RET %endmacro %elif ARCH_X86_64 ; *nix x64 ;============================================= @@ -550,7 +549,7 @@ DECLARE_REG 14, R15, 72 %if mmsize == 32 vzeroupper %endif - ret + AUTO_REP_RET %endmacro %else ; X86_32 ;============================================================== @@ -606,7 +605,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %if mmsize == 32 vzeroupper %endif - ret + AUTO_REP_RET %endmacro %endif ;====================================================================== @@ -620,6 +619,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %endmacro %endif +; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either +; a branch or a branch target. So switch to a 2-byte form of ret in that case. +; We can automatically detect "follows a branch", but not a branch target. +; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) %macro REP_RET 0 %if has_epilogue RET @@ -628,6 +631,29 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %endif %endmacro +%define last_branch_adr $$ +%macro AUTO_REP_RET 0 + %ifndef cpuflags + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. + %elif notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep + %endif + ret +%endmacro + +%macro BRANCH_INSTR 0-* + %rep %0 + %macro %1 1-2 %1 + %2 %1 + %%branch_instr: + %xdefine last_branch_adr %%branch_instr + %endmacro + %rotate 1 + %endrep +%endmacro + +BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp + %macro TAIL_CALL 2 ; callee, is_nonadjacent %if has_epilogue call %1 |