commit 397cbe258b9e064f49c4ca575279f02f39fef76e (patch)
author:    heretic <heretic@yandex-team.ru>             2022-02-10 16:45:43 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:45:43 +0300
tree:      a0b0eb3cca6a14e4e8ea715393637672fa651284
parent:    43f5a35593ebc9f6bcea619bb170394ea7ae468e

    Restoring authorship annotation for <heretic@yandex-team.ru>. Commit 1 of 2.

Diffstat (limited to 'contrib/libs/llvm12/lib/Target/X86/README-SSE.txt'):

    contrib/libs/llvm12/lib/Target/X86/README-SSE.txt | 1656
    1 file changed, 828 insertions, 828 deletions
//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

SSE variable shift can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	.byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
             _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
}

//===---------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should pattern
match them. For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32:                                   ## @f32
	pshufd	$1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
	addss	%xmm0, %xmm1
	pshufd	$3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
	movhlps	%xmm0, %xmm0            ## xmm0 = xmm0[1,1]
	movaps	%xmm0, %xmm3
	addss	%xmm1, %xmm3
	movdqa	%xmm2, %xmm0
	addss	%xmm3, %xmm0
	ret

Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32:                                   ## @f32
	movdqa	%xmm0, %xmm2
	addss	%xmm1, %xmm2
	pshufd	$1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
	pshufd	$1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
	addss	%xmm1, %xmm3
	movaps	%xmm2, %xmm0
	unpcklps	%xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
	ret

seems silly when it could just be one addps.

//===---------------------------------------------------------------------===//

Expand libm rounding functions inline: significant speedups are possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.
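
As a concrete picture of that lowering, here is a hand-written sketch of a
small fixed-size copy done with SSE intrinsics (a minimal sketch: the 64-byte
size and the function name are illustrative, and unaligned pointers are
assumed):

#include <emmintrin.h>

/* Copy 64 bytes as four 16-byte SSE moves instead of a rep-movs
   sequence or a libc call. */
void copy64(void *dst, const void *src) {
  const __m128i *s = (const __m128i *)src;
  __m128i *d = (__m128i *)dst;
  for (int i = 0; i < 4; ++i)
    _mm_storeu_si128(d + i, _mm_loadu_si128(s + i));
}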

//===---------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if (x^y & mask)
when using SSE.

//===---------------------------------------------------------------------===//

Use movhps to update the upper 64 bits of a v4sf value, and movlps for the
lower half.

//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:

        movaps    (%edx), %xmm2                         #59.21
        movaps    (%edx), %xmm5                         #60.21
        movaps    (%edx), %xmm4                         #61.21
        movaps    (%edx), %xmm3                         #62.21
        movl      40(%ecx), %ebp                        #69.49
        shufps    $0, %xmm2, %xmm5                      #60.21
        movl      100(%esp), %ebx                       #69.20
        movl      (%ebx), %edi                          #69.20
        imull     %ebp, %edi                            #69.49
        addl      (%eax), %edi                          #70.33
        shufps    $85, %xmm2, %xmm4                     #61.21
        shufps    $170, %xmm2, %xmm3                    #62.21
        shufps    $255, %xmm2, %xmm2                    #63.21
        lea       (%ebp,%ebp,2), %ebx                   #69.49
        negl      %ebx                                  #69.49
        lea       -3(%edi,%ebx), %ebx                   #70.33
        shll      $4, %ebx                              #68.37
        addl      32(%ecx), %ebx                        #68.37
        testb     $15, %bl                              #91.13
        jne       L_B1.24       # Prob 5%               #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
	%reg1078 = MOV32ri -3
	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
	%reg1037 = MOV32rm %reg1024, 1, %noreg, 40
	%reg1080 = IMUL32rr %reg1079, %reg1037
	%reg1081 = MOV32rm %reg1058, 1, %noreg, 0
	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
	%reg1036 = MOV32rm %reg1024, 1, %noreg, 32
	%reg1082 = SHL32ri %reg1038, 4
	%reg1039 = ADD32rr %reg1036, %reg1082
	%reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
	%reg1040 = MOV32rr %reg1039
	%reg1084 = AND32ri8 %reg1039, 15
	CMP32ri8 %reg1084, 0
	JE mbb<cond_next204,0xa914d30>

Still ok. After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
	%eax = MOV32ri -3
	%edx = MOV32rm %stack.3, 1, %noreg, 0
	ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
	%edx = MOV32rm %stack.7, 1, %noreg, 0
	%edx = MOV32rm %edx, 1, %noreg, 40
	IMUL32rr %eax<def&use>, %edx
	%esi = MOV32rm %stack.5, 1, %noreg, 0
	%esi = MOV32rm %esi, 1, %noreg, 0
	MOV32mr %stack.4, 1, %noreg, 0, %esi
	%eax = LEA32r %esi, 1, %eax, -3
	%esi = MOV32rm %stack.7, 1, %noreg, 0
	%esi = MOV32rm %esi, 1, %noreg, 32
	%edi = MOV32rr %eax
	SHL32ri %edi<def&use>, 4
	ADD32rr %edi<def&use>, %esi
	%xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
	%xmm1 = MOVAPSrr %xmm0
	SHUFPSrr %xmm1<def&use>, %xmm1, 170
	%xmm2 = MOVAPSrr %xmm0
	SHUFPSrr %xmm2<def&use>, %xmm2, 0
	%xmm3 = MOVAPSrr %xmm0
	SHUFPSrr %xmm3<def&use>, %xmm3, 255
	SHUFPSrr %xmm0<def&use>, %xmm0, 85
	%ebx = MOV32rr %edi
	AND32ri8 %ebx<def&use>, 15
	CMP32ri8 %ebx, 0
	JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode:
since it appears as operand two in more than one shufps op, it results in a
number of copies. Note that icc suffers from the same problem. Either the
instruction selector should select pshufd, or the register allocator should
make the two-address to three-address transformation.
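
For contrast, a hedged source-level sketch of the pshufd form (the function
name is illustrative): pshufd writes an independent destination register, so
a broadcast needs no preparatory copy, while shufps ties its first operand to
the destination and forces one.

#include <emmintrin.h>

/* Broadcast lane 0 with a single pshufd; shufps with tied operands
   would require a register copy first. */
__m128i broadcast_lane0(__m128i v) {
  return _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 0, 0));
}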

It also exposes some other problems. See MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

movss 4(%esp), %xmm1
mulss %xmm1, %xmm1
xorps %xmm0, %xmm0
movss %xmm1, %xmm0
ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zeroed. We could compile this to:

movss 4(%esp), %xmm0
mulss %xmm0, %xmm0
ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea. Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

movaps c(%esp), %xmm1
xorps %xmm0, %xmm0
movss %xmm1, %xmm0
ret

Now consider if the ... code caused xmm1 to get spilled. This might produce
this code:

movaps c(%esp), %xmm1
movaps %xmm1, c2(%esp)
...

xorps %xmm0, %xmm0
movaps c2(%esp), %xmm1
movss %xmm1, %xmm0
ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

movaps c(%esp), %xmm1
movaps %xmm1, c2(%esp)
...

movss c2(%esp), %xmm0
ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in three zeros and the one element instead of all four
elements. This can be used to simplify a variety of shuffle operations where
the other elements are fixed zeros.

//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %tmp = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates:

_test:
	movl	8(%esp), %eax
	movaps	(%eax), %xmm0
	pxor	%xmm1, %xmm1
	movaps	%xmm0, %xmm2
	shufps	$50, %xmm1, %xmm2
	shufps	$132, %xmm2, %xmm0
	movaps	%xmm0, (%eax)
	ret

Would it be better to generate:

_test:
	movl	8(%esp), %ecx
	movaps	(%ecx), %xmm0
	xor	%eax, %eax
	pinsrw	$6, %eax, %xmm0
	pinsrw	$7, %eax, %xmm0
	movaps	%xmm0, (%ecx)
	ret

?

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Apply the same transformation that merged four floats into a single 128-bit
load to loads from the constant pool.

//===---------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-math is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.
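
To make the non-commutativity concrete, a hedged scalar sketch (the function
name is illustrative): maxss returns its second operand when an input is NaN,
so the two operand orders differ only for NaN and signed-zero inputs, which is
exactly the difference unsafe-fp-math licenses us to ignore.

#include <xmmintrin.h>

/* maxss keeps the upper lanes of its first operand and returns the
   second operand for NaN inputs; swapping a and b is therefore only
   valid once NaN/-0.0 semantics are relaxed. */
float fmax_ss(float a, float b) {
  return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(a), _mm_set_ss(b)));
}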

//===---------------------------------------------------------------------===//

We should materialize vector constants like "all ones" and "signbit" with
code like:

     cmpeqps xmm1, xmm1   ; xmm1 = all-ones

and:

     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
     pslld xmm1, 31       ; xmm1 = all 100000000000...

instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.

//===---------------------------------------------------------------------===//

These functions:

#include <emmintrin.h>
__m128i a;
void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

compile to (-O3 -static -fomit-frame-pointer):
_x:
	movzwl	4(%esp), %eax
	movd	%eax, %xmm0
	movaps	_a, %xmm1
	pslld	%xmm0, %xmm1
	movaps	%xmm1, _a
	ret
_y:
	movd	4(%esp), %xmm0
	movaps	_a, %xmm1
	pslld	%xmm0, %xmm1
	movaps	%xmm1, _a
	ret

"y" looks good, but "x" does a silly movzwl round-trip through a GPR. It seems
like movd would be sufficient in both cases, as the value is already
zero-extended in the 32-bit stack slot IIRC. For signed short it should also
be safe, as a genuinely negative value would be undefined for pslld anyway.

//===---------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:
	subl	$12, %esp
	movsd	16(%esp), %xmm0
	movsd	%xmm0, (%esp)
	movl	4(%esp), %eax
	shrl	$31, %eax
	addl	$12, %esp
	ret

We should use movmskp{s|d} instead.

//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load. This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner. This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack. It should be generalized
   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load. If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.

//===---------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load. For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
 %tmp6 = fsub float -0.000000e+00, %z.1         ; <float> [#uses=1]
 %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
 ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:					#  <4 x float>
	.long	2147483648	# float -0
	.long	2147483648	# float -0
	.long	2147483648	# float -0
	.long	2147483648	# float -0
_ccosf:
	subl	$12, %esp
	movss	16(%esp), %xmm0
	movss	%xmm0, 4(%esp)
	movss	20(%esp), %xmm0
	xorps	LCPI1_0, %xmm0
	movss	%xmm0, (%esp)
	call	L_ccoshf$stub
	addl	$12, %esp
	ret

Note the load into xmm0, then xor (to negate), then store. In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.
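
A hedged scalar sketch of the rewrite (the function name is illustrative; the
real transform would run on MachineInstrs, not C):

#include <stdint.h>
#include <string.h>

/* store(fneg(load p), q) without touching the FP unit or a constant
   pool: fneg is just a sign-bit flip, so do an integer load, xor the
   sign bit, and an integer store. */
void store_fneg(const float *p, float *q) {
  uint32_t bits;
  memcpy(&bits, p, sizeof bits);
  bits ^= 0x80000000u;
  memcpy(q, &bits, sizeof bits);
}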

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point the fneg has been custom expanded into
an X86 fxor. This means that we need to handle this case in the x86 backend
instead of in target-independent code.

//===---------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the
destination is memory.

//===---------------------------------------------------------------------===//

INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and can
insert any number of 0.0 lanes simultaneously. Currently we only use it for
simple insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.

//===---------------------------------------------------------------------===//

On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything. This is probably related
to <2 x i64> ops being so bad.

//===---------------------------------------------------------------------===//

LLVM currently generates stack realignment code when it is not actually
needed. The problem is that we need to know about stack alignment too early,
before RA runs.

At that point we don't know whether there will be a vector spill or not.
The stack realignment logic is overly conservative here, but otherwise we can
produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:

#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
                          -22725, -12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}

Generated code (x86-32, linux):
madd:
        pushl   %ebp
        movl    %esp, %ebp
        andl    $-16, %esp
        movaps  .LCPI1_0, %xmm1
        pmaddwd %xmm1, %xmm0
        movl    %ebp, %esp
        popl    %ebp
        ret

//===---------------------------------------------------------------------===//
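
Returning to the INSERTPS note a few items up, a hedged SSE4.1 illustration of
how much a single insertps can express (the function name is illustrative; the
imm8 packs the source lane, destination lane, and a zero mask):

#include <smmintrin.h>  /* SSE4.1 */

/* One insertps: copy lane 0 of b into lane 2 of a, and zero lane 3.
   imm8 = (src lane << 6) | (dst lane << 4) | zero-mask. */
__m128 insert_and_zero(__m128 a, __m128 b) {
  return _mm_insert_ps(a, b, (0 << 6) | (2 << 4) | 0x8);
}

//===---------------------------------------------------------------------===//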

Consider:

#include <emmintrin.h>
__m128 foo2 (float x) {
 return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

_foo2:
	movss	4(%esp), %xmm0
	pshufd	$81, %xmm0, %xmm0
	ret

In x86-64 mode, we generate this code, which could be better:

_foo2:
	xorps	%xmm1, %xmm1
	movss	%xmm0, %xmm1
	pshufd	$81, %xmm1, %xmm0
	ret

In SSE4 mode, we could use insertps to make both better.

Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
 return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
       insertps $0x10, x2(%rip), %xmm0
       insertps $0x10, x3(%rip), %xmm1
       movaps  %xmm1, %xmm2
       movlhps %xmm0, %xmm2
       movaps  %xmm2, %xmm0
       ret

//===---------------------------------------------------------------------===//

We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind {
  %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
  ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0:					##  <4 x i32>
	.long	10
	.long	10
	.long	10
	.long	10
	.text
	.align	4,0x90
	.globl	_f
_f:
	pshufd	$3, %xmm0, %xmm1
	movd	%xmm1, %eax
	imull	LCPI1_0+12, %eax
	movd	%eax, %xmm1
	pshufd	$1, %xmm0, %xmm2
	movd	%xmm2, %eax
	imull	LCPI1_0+4, %eax
	movd	%eax, %xmm2
	punpckldq	%xmm1, %xmm2
	movd	%xmm0, %eax
	imull	LCPI1_0, %eax
	movd	%eax, %xmm1
	movhlps	%xmm0, %xmm0
	movd	%xmm0, %eax
	imull	LCPI1_0+8, %eax
	movd	%eax, %xmm0
	punpckldq	%xmm0, %xmm1
	movaps	%xmm1, %xmm0
	punpckldq	%xmm2, %xmm0
	ret

It would be better to synthesize integer vector multiplication by constants
using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.
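
A hedged sketch of the synthesized form for this particular constant (the
function name is illustrative; 10 = 8 + 2, so two shifts and one add):

#include <emmintrin.h>

/* x*10 per lane as (x << 3) + (x << 1): pslld/pslld/paddd instead of
   four scalar imulls and the lane shuffling around them. */
__m128i mul10_epi32(__m128i x) {
  return _mm_add_epi32(_mm_slli_epi32(x, 3), _mm_slli_epi32(x, 1));
}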

//===---------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:
	movl	$1, %eax
	xorps	%xmm0, %xmm0
	pinsrw	$2, %eax, %xmm0
	movzbl	4(%esp), %eax
	pinsrw	$3, %eax, %xmm0
	movl	$256, %eax
	pinsrw	$7, %eax, %xmm0
	ret

gcc-4.2:
	subl	$12, %esp
	movzbl	16(%esp), %eax
	movdqa	LC0, %xmm0
	pinsrw	$3, %eax, %xmm0
	addl	$12, %esp
	ret
	.const
	.align 4
LC0:
	.word	0
	.word	0
	.word	1
	.word	0
	.word	0
	.word	0
	.word	0
	.word	256

With SSE4, it should be:
	movdqa	.LC0(%rip), %xmm0
	pinsrb	$6, %edi, %xmm0

//===---------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants. Likewise, insertelement of a constant into a vector of constants
should result in a vector of constants; e.g. 2008-06-25-VecISelBug.ll.

We compiled it to something horrible:

	.align	4
LCPI1_1:					##  float
	.long	1065353216	## float 1

	.const

	.align	4
LCPI1_0:					##  <4 x float>
	.space	4
	.long	1065353216	## float 1
	.space	4
	.long	1065353216	## float 1
	.text
	.align	4,0x90
	.globl	_t
_t:
	xorps	%xmm0, %xmm0
	movhps	LCPI1_0, %xmm0
	movss	LCPI1_1, %xmm1
	movaps	%xmm0, %xmm2
	shufps	$2, %xmm1, %xmm2
	shufps	$132, %xmm2, %xmm0
	movaps	%xmm0, 0

//===---------------------------------------------------------------------===//
rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

compiles to (x86-32):

define float @foo(i8 zeroext %x) nounwind {
	%tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
	ret float %tmp12
}

which compiles to:

_foo:
	subl	$4, %esp
	movzbl	8(%esp), %eax
	cvtsi2ss	%eax, %xmm0
	movss	%xmm0, (%esp)
	flds	(%esp)
	addl	$4, %esp
	ret

We should be able to use:
	cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===---------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement
(scalar_to_vector (loadf64)) when code size is critical. movlps is slower
than movsd on Core 2, but it's one byte shorter.

//===---------------------------------------------------------------------===//

We should use a dynamic programming based approach to tell when using FPStack
operations is cheaper than SSE. SciMark's MonteCarlo contains code like this,
for example:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples)* 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
	.long	1082130432	## float 4.000000e+00
_MonteCarlo_num_flops:
	subl	$4, %esp
	movl	8(%esp), %eax
	movl	%eax, (%esp)
	fildl	(%esp)
	fmuls	LCPI1_0
	addl	$4, %esp
	ret

in SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
	subl	$12, %esp
	cvtsi2sd	16(%esp), %xmm0
	mulsd	LCPI1_0, %xmm0
	movsd	%xmm0, (%esp)
	fldl	(%esp)
	addl	$12, %esp
	ret

There are also other cases in SciMark where using the FP stack is better; it
is cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done on the FP stack, etc.

//===---------------------------------------------------------------------===//
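
Back on the _mm_set_epi8 example above: a hedged SSE4.1 source-level sketch of
the movdqa + pinsrb sequence (the function name and the pre-materialized
constant argument are illustrative):

#include <smmintrin.h>  /* SSE4.1 */

/* Insert x into byte 6 of an already-loaded constant vector; on top
   of the constant load this is a single pinsrb. */
__m128i set_byte6(__m128i constant, char x) {
  return _mm_insert_epi8(constant, x, 6);
}

//===---------------------------------------------------------------------===//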
- -define float @foo(float %x) nounwind { - %t = bitcast float %x to i32 - %s = and i32 %t, 2147483647 - %d = bitcast i32 %s to float - ret float %d -} - -declare float @fabsf(float %n) -define float @bar(float %x) nounwind { - %d = call float @fabsf(float %x) - ret float %d -} - -//===---------------------------------------------------------------------===// - -This IR (from PR6194): - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-apple-darwin10.0.0" - -%0 = type { double, double } -%struct.float3 = type { float, float, float } - -define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp { -entry: - %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1] - %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1] - %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1] - %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1] - %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1] - %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1] - %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1] - store float %tmp12, float* %tmp5 - ret void -} - -Compiles to: - -_test: ## @test - movd %xmm0, %rax - shrq $32, %rax - movl %eax, 4(%rdi) - ret - -This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and -doing a shuffle from v[1] to v[0] then a float store. - -//===---------------------------------------------------------------------===// - -[UNSAFE FP] - -void foo(double, double, double); -void norm(double x, double y, double z) { - double scale = __builtin_sqrt(x*x + y*y + z*z); - foo(x/scale, y/scale, z/scale); -} - -We currently generate an sqrtsd and 3 divsd instructions. This is bad, fp div is -slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first -and emit 3 mulsd in place of the divs. This can be done as a target-independent -transform. - -If we're dealing with floats instead of doubles we could even replace the sqrtss -and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the -cost of reduced accuracy. - -//===---------------------------------------------------------------------===// + +typedef short vSInt16 __attribute__ ((__vector_size__ (16))); + +static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873, +- 22725, - 12873};; + +vSInt16 madd(vSInt16 b) +{ + return _mm_madd_epi16(a, b); +} + +Generated code (x86-32, linux): +madd: + pushl %ebp + movl %esp, %ebp + andl $-16, %esp + movaps .LCPI1_0, %xmm1 + pmaddwd %xmm1, %xmm0 + movl %ebp, %esp + popl %ebp + ret + +//===---------------------------------------------------------------------===// + +Consider: +#include <emmintrin.h> +__m128 foo2 (float x) { + return _mm_set_ps (0, 0, x, 0); +} + +In x86-32 mode, we generate this spiffy code: + +_foo2: + movss 4(%esp), %xmm0 + pshufd $81, %xmm0, %xmm0 + ret + +in x86-64 mode, we generate this code, which could be better: + +_foo2: + xorps %xmm1, %xmm1 + movss %xmm0, %xmm1 + pshufd $81, %xmm1, %xmm0 + ret + +In sse4 mode, we could use insertps to make both better. 