diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:17 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:17 +0300 |
commit | d3a398281c6fd1d3672036cb2d63f842d2cb28c5 (patch) | |
tree | dd4bd3ca0f36b817e96812825ffaf10d645803f2 /contrib/libs/cxxsupp/builtins/i386/floatundisf.S | |
parent | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (diff) | |
download | ydb-d3a398281c6fd1d3672036cb2d63f842d2cb28c5.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/cxxsupp/builtins/i386/floatundisf.S')
-rw-r--r-- | contrib/libs/cxxsupp/builtins/i386/floatundisf.S | 210 |
1 files changed, 105 insertions, 105 deletions
diff --git a/contrib/libs/cxxsupp/builtins/i386/floatundisf.S b/contrib/libs/cxxsupp/builtins/i386/floatundisf.S
index b7db958283..94c97e25aa 100644
--- a/contrib/libs/cxxsupp/builtins/i386/floatundisf.S
+++ b/contrib/libs/cxxsupp/builtins/i386/floatundisf.S
@@ -1,105 +1,105 @@
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
-
-#include "../assembly.h"
-
-// float __floatundisf(du_int a); -- convert the 64-bit unsigned integer a to float (i386, AT&T syntax).
-
-// Note that there is a hardware instruction, fildll, that does most of what
-// this function needs to do. However, because of our ia32 ABI, it will take
-// a write-small read-large stall, so the software implementation here is
-// actually several cycles faster.
-
-// This is a branch-free implementation. A branchy implementation might be
-// faster for the common case if you know something a priori about the input
-// distribution.
-
-/* branch-free x87 implementation - one cycle slower than without x87. (Disabled: everything up to the closing marker is a comment.)
-
-#ifdef __i386__
-
-CONST_SECTION
-.balign 3 // NOTE(review): 3 is not a power of two; 8 bytes (.p2align 3) was presumably intended -- confirm before enabling
-
-    .quad 0x43f0000000000000 // 0x1.0p64 as a double; TWOp64 reads this (twop64-8) when %eax == -1
-twop64: .quad 0x0000000000000000 // 0.0; TWOp64 reads this when %eax == 0
-
-#define TWOp64 twop64-0b(%ecx,%eax,8)
-
-.text
-.balign 4
-DEFINE_COMPILERRT_FUNCTION(__floatundisf)
-    movl 8(%esp), %eax // eax = high 32 bits of a
-    movd 8(%esp), %xmm1 // xmm1 = high 32 bits of a
-    movd 4(%esp), %xmm0 // xmm0 = low 32 bits of a
-    punpckldq %xmm1, %xmm0 // low 64 bits of xmm0 = a
-    calll 0f // pushes the address of label 0
-0:  popl %ecx // ecx = address of label 0, the base used by TWOp64
-    sarl $31, %eax // (top bit of a set) ? -1 : 0
-    movq %xmm0, 4(%esp) // rewrite a with a single 64-bit store before the 64-bit load
-    fildll 4(%esp) // load a as a signed 64-bit integer
-    faddl TWOp64 // add 0x1.0p64 when the signed load saw a negative value, else 0.0
-    fstps 4(%esp) // round to single precision through memory
-    flds 4(%esp) // reload the float onto the x87 stack as the result
-    ret
-END_COMPILERRT_FUNCTION(__floatundisf)
-
-#endif // __i386__
-
-*/
-
-/* branch-free, x87-free implementation - faster at the expense of code size */
-
-#ifdef __i386__
-
-CONST_SECTION
-
-    .balign 16
-twop52: // 16 bytes of constants; sticky follows immediately because of the 16-byte alignment
-    .quad 0x4330000000000000 // 0x1.0p52 as a double (TWOp52)
-    .quad 0x0000000000000fff // low-12-bit mask; STICKY reads this (sticky-8) when %eax == -1
-
-    .balign 16
-sticky:
-    .quad 0x0000000000000000 // zero mask; STICKY reads this when %eax == 0
-    .long 0x00000012 // not referenced by the code in this file
-
-    .balign 16
-twelve:
-    .long 0x00000000 // not referenced by the code in this file
-
-#define TWOp52 twop52-0b(%ecx)
-#define STICKY sticky-0b(%ecx,%eax,8)
-
-.text
-.balign 4
-DEFINE_COMPILERRT_FUNCTION(__floatundisf)
-    movl 8(%esp), %eax // eax = high 32 bits of input
-    movd 8(%esp), %xmm1 // xmm1 = high 32 bits of input
-    movd 4(%esp), %xmm0 // xmm0 = low 32 bits of input
-    punpckldq %xmm1, %xmm0 // low 64 bits of xmm0 = input
-
-    calll 0f // pushes the address of label 0
-0:  popl %ecx // ecx = address of label 0, the base used by TWOp52 and STICKY
-    shrl %eax // high 31 bits of input as sint32
-    addl $0x7ff80000, %eax // sign bit becomes set iff input >= 2^52 ("big input")
-    sarl $31, %eax // (big input) ? -1 : 0
-    movsd STICKY, %xmm1 // (big input) ? 0xfff : 0
-    movl $12, %edx
-    andl %eax, %edx // (big input) ? 12 : 0
-    movd %edx, %xmm3
-    andpd %xmm0, %xmm1 // (big input) ? input & 0xfff : 0
-    movsd TWOp52, %xmm2 // 0x1.0p52
-    psrlq %xmm3, %xmm0 // (big input) ? input >> 12 : input
-    orpd %xmm2, %xmm1 // 0x1.0p52 | ((big input) ? input & 0xfff : 0)
-    orpd %xmm1, %xmm0 // 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
-    subsd %xmm2, %xmm0 // (double)((big input) ? (input >> 12 | input & 0xfff) : input)
-    cvtsd2ss %xmm0, %xmm0 // (float)((big input) ? (input >> 12 | input & 0xfff) : input)
-    pslld $23, %xmm3 // (big input) ? 12 << 23 : 0, i.e. 12 positioned at the float exponent field
-    paddd %xmm3, %xmm0 // (float)input
-    movd %xmm0, 4(%esp) // store the result bits over the argument slot
-    flds 4(%esp) // reload them onto the x87 stack as the float result
-    ret
-END_COMPILERRT_FUNCTION(__floatundisf)
-
-#endif // __i386__
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+
+#include "../assembly.h"
+
+// float __floatundisf(du_int a); -- convert the 64-bit unsigned integer a to float (i386, AT&T syntax).
+
+// Note that there is a hardware instruction, fildll, that does most of what
+// this function needs to do. However, because of our ia32 ABI, it will take
+// a write-small read-large stall, so the software implementation here is
+// actually several cycles faster.
+
+// This is a branch-free implementation. A branchy implementation might be
+// faster for the common case if you know something a priori about the input
+// distribution.
+
+/* branch-free x87 implementation - one cycle slower than without x87. (Disabled: everything up to the closing marker is a comment.)
+
+#ifdef __i386__
+
+CONST_SECTION
+.balign 3 // NOTE(review): 3 is not a power of two; 8 bytes (.p2align 3) was presumably intended -- confirm before enabling
+
+    .quad 0x43f0000000000000 // 0x1.0p64 as a double; TWOp64 reads this (twop64-8) when %eax == -1
+twop64: .quad 0x0000000000000000 // 0.0; TWOp64 reads this when %eax == 0
+
+#define TWOp64 twop64-0b(%ecx,%eax,8)
+
+.text
+.balign 4
+DEFINE_COMPILERRT_FUNCTION(__floatundisf)
+    movl 8(%esp), %eax // eax = high 32 bits of a
+    movd 8(%esp), %xmm1 // xmm1 = high 32 bits of a
+    movd 4(%esp), %xmm0 // xmm0 = low 32 bits of a
+    punpckldq %xmm1, %xmm0 // low 64 bits of xmm0 = a
+    calll 0f // pushes the address of label 0
+0:  popl %ecx // ecx = address of label 0, the base used by TWOp64
+    sarl $31, %eax // (top bit of a set) ? -1 : 0
+    movq %xmm0, 4(%esp) // rewrite a with a single 64-bit store before the 64-bit load
+    fildll 4(%esp) // load a as a signed 64-bit integer
+    faddl TWOp64 // add 0x1.0p64 when the signed load saw a negative value, else 0.0
+    fstps 4(%esp) // round to single precision through memory
+    flds 4(%esp) // reload the float onto the x87 stack as the result
+    ret
+END_COMPILERRT_FUNCTION(__floatundisf)
+
+#endif // __i386__
+
+*/
+
+/* branch-free, x87-free implementation - faster at the expense of code size */
+
+#ifdef __i386__
+
+CONST_SECTION
+
+    .balign 16
+twop52: // 16 bytes of constants; sticky follows immediately because of the 16-byte alignment
+    .quad 0x4330000000000000 // 0x1.0p52 as a double (TWOp52)
+    .quad 0x0000000000000fff // low-12-bit mask; STICKY reads this (sticky-8) when %eax == -1
+
+    .balign 16
+sticky:
+    .quad 0x0000000000000000 // zero mask; STICKY reads this when %eax == 0
+    .long 0x00000012 // not referenced by the code in this file
+
+    .balign 16
+twelve:
+    .long 0x00000000 // not referenced by the code in this file
+
+#define TWOp52 twop52-0b(%ecx)
+#define STICKY sticky-0b(%ecx,%eax,8)
+
+.text
+.balign 4
+DEFINE_COMPILERRT_FUNCTION(__floatundisf)
+    movl 8(%esp), %eax // eax = high 32 bits of input
+    movd 8(%esp), %xmm1 // xmm1 = high 32 bits of input
+    movd 4(%esp), %xmm0 // xmm0 = low 32 bits of input
+    punpckldq %xmm1, %xmm0 // low 64 bits of xmm0 = input
+
+    calll 0f // pushes the address of label 0
+0:  popl %ecx // ecx = address of label 0, the base used by TWOp52 and STICKY
+    shrl %eax // high 31 bits of input as sint32
+    addl $0x7ff80000, %eax // sign bit becomes set iff input >= 2^52 ("big input")
+    sarl $31, %eax // (big input) ? -1 : 0
+    movsd STICKY, %xmm1 // (big input) ? 0xfff : 0
+    movl $12, %edx
+    andl %eax, %edx // (big input) ? 12 : 0
+    movd %edx, %xmm3
+    andpd %xmm0, %xmm1 // (big input) ? input & 0xfff : 0
+    movsd TWOp52, %xmm2 // 0x1.0p52
+    psrlq %xmm3, %xmm0 // (big input) ? input >> 12 : input
+    orpd %xmm2, %xmm1 // 0x1.0p52 | ((big input) ? input & 0xfff : 0)
+    orpd %xmm1, %xmm0 // 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
+    subsd %xmm2, %xmm0 // (double)((big input) ? (input >> 12 | input & 0xfff) : input)
+    cvtsd2ss %xmm0, %xmm0 // (float)((big input) ? (input >> 12 | input & 0xfff) : input)
+    pslld $23, %xmm3 // (big input) ? 12 << 23 : 0, i.e. 12 positioned at the float exponent field
+    paddd %xmm3, %xmm0 // (float)input
+    movd %xmm0, 4(%esp) // store the result bits over the argument slot
+    flds 4(%esp) // reload them onto the x87 stack as the float result
+    ret
+END_COMPILERRT_FUNCTION(__floatundisf)
+
+#endif // __i386__