author | shadchin <shadchin@yandex-team.ru> | 2022-02-10 16:44:39 +0300
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:39 +0300
commit | e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0 (patch)
tree | 64175d5cadab313b3e7039ebaa06c5bc3295e274 /contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp
parent | 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (diff)
download | ydb-e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0.tar.gz
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp')
-rw-r--r-- | contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp | 4034 |
1 file changed, 2017 insertions, 2017 deletions
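For context on what the re-annotated file does (the commit itself only restores authorship; the code is otherwise unchanged): one of the folds in the diff below, `simplifyX86MaskedLoad`, rewrites x86 masked-load intrinsics whose mask is constant or sign-extended from a bool vector into the generic `llvm.masked.load` with align 1 and a zero pass-through, mirroring the `ZeroVec` argument passed to `CreateMaskedLoad`. The following is a minimal hand-written IR sketch of that rewrite; the function names and test snippet are illustrative only, and the intrinsic manglings assume the typed-pointer form used around LLVM 12.

```llvm
; Illustrative only - not part of this commit. Hypothetical test functions;
; intrinsic manglings assume LLVM 12 typed pointers.

; Before: x86-specific masked load, mask sign-extended from a <4 x i1>.
define <4 x float> @load_before(i8* %p, <4 x i1> %cond) {
  %mask = sext <4 x i1> %cond to <4 x i32>
  %v = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %p, <4 x i32> %mask)
  ret <4 x float> %v
}

; After simplifyX86MaskedLoad: the scalar pointer is bitcast to a vector
; pointer and the generic llvm.masked.load is used with align 1 and a
; zero pass-through vector.
define <4 x float> @load_after(i8* %p, <4 x i1> %cond) {
  %castvec = bitcast i8* %p to <4 x float>*
  %v = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %castvec, i32 1, <4 x i1> %cond, <4 x float> zeroinitializer)
  ret <4 x float> %v
}

declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
```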
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp
index c6388617c6..c4150ed528 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -1,2017 +1,2017 @@
-//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file implements a TargetTransformInfo analysis pass specific to the
-/// X86 target machine. It uses the target's detailed information to provide
-/// more precise answers to certain TTI queries, while letting the target
-/// independent and default TTI implementations handle the rest.
-///
-//===----------------------------------------------------------------------===//
-
-#include "X86TargetTransformInfo.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IntrinsicsX86.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/InstCombine/InstCombiner.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "x86tti"
-
-/// Return a constant boolean vector that has true elements in all positions
-/// where the input constant data vector has an element with the sign bit set.
-static Constant *getNegativeIsTrueBoolVec(Constant *V) {
-  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
-  V = ConstantExpr::getBitCast(V, IntTy);
-  V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
-                            V);
-  return V;
-}
-
-/// Convert the x86 XMM integer vector mask to a vector of bools based on
-/// each element's most significant bit (the sign bit).
-static Value *getBoolVecFromMask(Value *Mask) {
-  // Fold Constant Mask.
-  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
-    return getNegativeIsTrueBoolVec(ConstantMask);
-
-  // Mask was extended from a boolean vector.
-  Value *ExtMask;
-  if (PatternMatch::match(
-          Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
-      ExtMask->getType()->isIntOrIntVectorTy(1))
-    return ExtMask;
-
-  return nullptr;
-}
-
-// TODO: If the x86 backend knew how to convert a bool vector mask back to an
-// XMM register mask efficiently, we could transform all x86 masked intrinsics
-// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
-static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
-  Value *Ptr = II.getOperand(0);
-  Value *Mask = II.getOperand(1);
-  Constant *ZeroVec = Constant::getNullValue(II.getType());
-
-  // Zero Mask - masked load instruction creates a zero vector.
-  if (isa<ConstantAggregateZero>(Mask))
-    return IC.replaceInstUsesWith(II, ZeroVec);
-
-  // The mask is constant or extended from a bool vector. Convert this x86
-  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
-  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
-    // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
-    // the LLVM intrinsic definition for the pointer argument.
-    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
-    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
-    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
-
-    // The pass-through vector for an x86 masked load is a zero vector.
-    CallInst *NewMaskedLoad =
-        IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
-    return IC.replaceInstUsesWith(II, NewMaskedLoad);
-  }
-
-  return nullptr;
-}
-
-// TODO: If the x86 backend knew how to convert a bool vector mask back to an
-// XMM register mask efficiently, we could transform all x86 masked intrinsics
-// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
-static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
-  Value *Ptr = II.getOperand(0);
-  Value *Mask = II.getOperand(1);
-  Value *Vec = II.getOperand(2);
-
-  // Zero Mask - this masked store instruction does nothing.
-  if (isa<ConstantAggregateZero>(Mask)) {
-    IC.eraseInstFromFunction(II);
-    return true;
-  }
-
-  // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
-  // anything else at this level.
-  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
-    return false;
-
-  // The mask is constant or extended from a bool vector. Convert this x86
-  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
-  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
-    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
-    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
-    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
-
-    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
-
-    // 'Replace uses' doesn't work for stores. Erase the original masked store.
- IC.eraseInstFromFunction(II); - return true; - } - - return false; -} - -static Value *simplifyX86immShift(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - bool LogicalShift = false; - bool ShiftLeft = false; - bool IsImm = false; - - switch (II.getIntrinsicID()) { - default: - llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx512_psrai_q_128: - case Intrinsic::x86_avx512_psrai_q_256: - case Intrinsic::x86_avx512_psrai_d_512: - case Intrinsic::x86_avx512_psrai_q_512: - case Intrinsic::x86_avx512_psrai_w_512: - IsImm = true; - LLVM_FALLTHROUGH; - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_avx2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx512_psra_q_128: - case Intrinsic::x86_avx512_psra_q_256: - case Intrinsic::x86_avx512_psra_d_512: - case Intrinsic::x86_avx512_psra_q_512: - case Intrinsic::x86_avx512_psra_w_512: - LogicalShift = false; - ShiftLeft = false; - break; - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx512_psrli_d_512: - case Intrinsic::x86_avx512_psrli_q_512: - case Intrinsic::x86_avx512_psrli_w_512: - IsImm = true; - LLVM_FALLTHROUGH; - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx512_psrl_d_512: - case Intrinsic::x86_avx512_psrl_q_512: - case Intrinsic::x86_avx512_psrl_w_512: - LogicalShift = true; - ShiftLeft = false; - break; - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx512_pslli_d_512: - case Intrinsic::x86_avx512_pslli_q_512: - case Intrinsic::x86_avx512_pslli_w_512: - IsImm = true; - LLVM_FALLTHROUGH; - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx512_psll_d_512: - case Intrinsic::x86_avx512_psll_q_512: - case Intrinsic::x86_avx512_psll_w_512: - LogicalShift = true; - ShiftLeft = true; - break; - } - assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); - - auto Vec = II.getArgOperand(0); - auto Amt = II.getArgOperand(1); - auto VT = cast<FixedVectorType>(Vec->getType()); - auto SVT = VT->getElementType(); - auto AmtVT = Amt->getType(); - unsigned VWidth = VT->getNumElements(); - unsigned BitWidth = SVT->getPrimitiveSizeInBits(); - - // If the shift amount is guaranteed to be in-range we can replace it with a - // generic shift. If its guaranteed to be out of range, logical shifts combine - // to zero and arithmetic shifts are clamped to (BitWidth - 1). 
- if (IsImm) { - assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type"); - KnownBits KnownAmtBits = - llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); - if (KnownAmtBits.getMaxValue().ult(BitWidth)) { - Amt = Builder.CreateZExtOrTrunc(Amt, SVT); - Amt = Builder.CreateVectorSplat(VWidth, Amt); - return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) - : Builder.CreateLShr(Vec, Amt)) - : Builder.CreateAShr(Vec, Amt)); - } - if (KnownAmtBits.getMinValue().uge(BitWidth)) { - if (LogicalShift) - return ConstantAggregateZero::get(VT); - Amt = ConstantInt::get(SVT, BitWidth - 1); - return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); - } - } else { - // Ensure the first element has an in-range value and the rest of the - // elements in the bottom 64 bits are zero. - assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && - cast<VectorType>(AmtVT)->getElementType() == SVT && - "Unexpected shift-by-scalar type"); - unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements(); - APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); - APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); - KnownBits KnownLowerBits = llvm::computeKnownBits( - Amt, DemandedLower, II.getModule()->getDataLayout()); - KnownBits KnownUpperBits = llvm::computeKnownBits( - Amt, DemandedUpper, II.getModule()->getDataLayout()); - if (KnownLowerBits.getMaxValue().ult(BitWidth) && - (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) { - SmallVector<int, 16> ZeroSplat(VWidth, 0); - Amt = Builder.CreateShuffleVector(Amt, ZeroSplat); - return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) - : Builder.CreateLShr(Vec, Amt)) - : Builder.CreateAShr(Vec, Amt)); - } - } - - // Simplify if count is constant vector. - auto CDV = dyn_cast<ConstantDataVector>(Amt); - if (!CDV) - return nullptr; - - // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector - // operand to compute the shift amount. - assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && - cast<VectorType>(AmtVT)->getElementType() == SVT && - "Unexpected shift-by-scalar type"); - - // Concatenate the sub-elements to create the 64-bit value. - APInt Count(64, 0); - for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { - unsigned SubEltIdx = (NumSubElts - 1) - i; - auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); - Count <<= BitWidth; - Count |= SubElt->getValue().zextOrTrunc(64); - } - - // If shift-by-zero then just return the original value. - if (Count.isNullValue()) - return Vec; - - // Handle cases when Shift >= BitWidth. - if (Count.uge(BitWidth)) { - // If LogicalShift - just return zero. - if (LogicalShift) - return ConstantAggregateZero::get(VT); - - // If ArithmeticShift - clamp Shift to (BitWidth - 1). - Count = APInt(64, BitWidth - 1); - } - - // Get a constant vector of the same type as the first operand. - auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); - auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); - - if (ShiftLeft) - return Builder.CreateShl(Vec, ShiftVec); - - if (LogicalShift) - return Builder.CreateLShr(Vec, ShiftVec); - - return Builder.CreateAShr(Vec, ShiftVec); -} - -// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. -// Unlike the generic IR shifts, the intrinsics have defined behaviour for out -// of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 
-static Value *simplifyX86varShift(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - bool LogicalShift = false; - bool ShiftLeft = false; - - switch (II.getIntrinsicID()) { - default: - llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: - case Intrinsic::x86_avx512_psrav_q_128: - case Intrinsic::x86_avx512_psrav_q_256: - case Intrinsic::x86_avx512_psrav_d_512: - case Intrinsic::x86_avx512_psrav_q_512: - case Intrinsic::x86_avx512_psrav_w_128: - case Intrinsic::x86_avx512_psrav_w_256: - case Intrinsic::x86_avx512_psrav_w_512: - LogicalShift = false; - ShiftLeft = false; - break; - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_q_256: - case Intrinsic::x86_avx512_psrlv_d_512: - case Intrinsic::x86_avx512_psrlv_q_512: - case Intrinsic::x86_avx512_psrlv_w_128: - case Intrinsic::x86_avx512_psrlv_w_256: - case Intrinsic::x86_avx512_psrlv_w_512: - LogicalShift = true; - ShiftLeft = false; - break; - case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_q_256: - case Intrinsic::x86_avx512_psllv_d_512: - case Intrinsic::x86_avx512_psllv_q_512: - case Intrinsic::x86_avx512_psllv_w_128: - case Intrinsic::x86_avx512_psllv_w_256: - case Intrinsic::x86_avx512_psllv_w_512: - LogicalShift = true; - ShiftLeft = true; - break; - } - assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); - - auto Vec = II.getArgOperand(0); - auto Amt = II.getArgOperand(1); - auto VT = cast<FixedVectorType>(II.getType()); - auto SVT = VT->getElementType(); - int NumElts = VT->getNumElements(); - int BitWidth = SVT->getIntegerBitWidth(); - - // If the shift amount is guaranteed to be in-range we can replace it with a - // generic shift. - APInt UpperBits = - APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth)); - if (llvm::MaskedValueIsZero(Amt, UpperBits, - II.getModule()->getDataLayout())) { - return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) - : Builder.CreateLShr(Vec, Amt)) - : Builder.CreateAShr(Vec, Amt)); - } - - // Simplify if all shift amounts are constant/undef. - auto *CShift = dyn_cast<Constant>(Amt); - if (!CShift) - return nullptr; - - // Collect each element's shift amount. - // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. - bool AnyOutOfRange = false; - SmallVector<int, 8> ShiftAmts; - for (int I = 0; I < NumElts; ++I) { - auto *CElt = CShift->getAggregateElement(I); - if (isa_and_nonnull<UndefValue>(CElt)) { - ShiftAmts.push_back(-1); - continue; - } - - auto *COp = dyn_cast_or_null<ConstantInt>(CElt); - if (!COp) - return nullptr; - - // Handle out of range shifts. - // If LogicalShift - set to BitWidth (special case). - // If ArithmeticShift - set to (BitWidth - 1) (sign splat). - APInt ShiftVal = COp->getValue(); - if (ShiftVal.uge(BitWidth)) { - AnyOutOfRange = LogicalShift; - ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); - continue; - } - - ShiftAmts.push_back((int)ShiftVal.getZExtValue()); - } - - // If all elements out of range or UNDEF, return vector of zeros/undefs. - // ArithmeticShift should only hit this if they are all UNDEF. 
- auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; - if (llvm::all_of(ShiftAmts, OutOfRange)) { - SmallVector<Constant *, 8> ConstantVec; - for (int Idx : ShiftAmts) { - if (Idx < 0) { - ConstantVec.push_back(UndefValue::get(SVT)); - } else { - assert(LogicalShift && "Logical shift expected"); - ConstantVec.push_back(ConstantInt::getNullValue(SVT)); - } - } - return ConstantVector::get(ConstantVec); - } - - // We can't handle only some out of range values with generic logical shifts. - if (AnyOutOfRange) - return nullptr; - - // Build the shift amount constant vector. - SmallVector<Constant *, 8> ShiftVecAmts; - for (int Idx : ShiftAmts) { - if (Idx < 0) - ShiftVecAmts.push_back(UndefValue::get(SVT)); - else - ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); - } - auto ShiftVec = ConstantVector::get(ShiftVecAmts); - - if (ShiftLeft) - return Builder.CreateShl(Vec, ShiftVec); - - if (LogicalShift) - return Builder.CreateLShr(Vec, ShiftVec); - - return Builder.CreateAShr(Vec, ShiftVec); -} - -static Value *simplifyX86pack(IntrinsicInst &II, - InstCombiner::BuilderTy &Builder, bool IsSigned) { - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - Type *ResTy = II.getType(); - - // Fast all undef handling. - if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) - return UndefValue::get(ResTy); - - auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); - unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; - unsigned NumSrcElts = ArgTy->getNumElements(); - assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) && - "Unexpected packing types"); - - unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; - unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); - unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); - assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && - "Unexpected packing types"); - - // Constant folding. - if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) - return nullptr; - - // Clamp Values - signed/unsigned both use signed clamp values, but they - // differ on the min/max values. - APInt MinValue, MaxValue; - if (IsSigned) { - // PACKSS: Truncate signed value with signed saturation. - // Source values less than dst minint are saturated to minint. - // Source values greater than dst maxint are saturated to maxint. - MinValue = - APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); - MaxValue = - APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); - } else { - // PACKUS: Truncate signed value with unsigned saturation. - // Source values less than zero are saturated to zero. - // Source values greater than dst maxuint are saturated to maxuint. - MinValue = APInt::getNullValue(SrcScalarSizeInBits); - MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); - } - - auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); - auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); - Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); - Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); - Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); - Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); - - // Shuffle clamped args together at the lane level. 
- SmallVector<int, 32> PackMask; - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) - PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); - for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) - PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); - } - auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); - - // Truncate to dst size. - return Builder.CreateTrunc(Shuffle, ResTy); -} - -static Value *simplifyX86movmsk(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Value *Arg = II.getArgOperand(0); - Type *ResTy = II.getType(); - - // movmsk(undef) -> zero as we must ensure the upper bits are zero. - if (isa<UndefValue>(Arg)) - return Constant::getNullValue(ResTy); - - auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType()); - // We can't easily peek through x86_mmx types. - if (!ArgTy) - return nullptr; - - // Expand MOVMSK to compare/bitcast/zext: - // e.g. PMOVMSKB(v16i8 x): - // %cmp = icmp slt <16 x i8> %x, zeroinitializer - // %int = bitcast <16 x i1> %cmp to i16 - // %res = zext i16 %int to i32 - unsigned NumElts = ArgTy->getNumElements(); - Type *IntegerVecTy = VectorType::getInteger(ArgTy); - Type *IntegerTy = Builder.getIntNTy(NumElts); - - Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); - Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); - Res = Builder.CreateBitCast(Res, IntegerTy); - Res = Builder.CreateZExtOrTrunc(Res, ResTy); - return Res; -} - -static Value *simplifyX86addcarry(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Value *CarryIn = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - Value *Op2 = II.getArgOperand(2); - Type *RetTy = II.getType(); - Type *OpTy = Op1->getType(); - assert(RetTy->getStructElementType(0)->isIntegerTy(8) && - RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && - "Unexpected types for x86 addcarry"); - - // If carry-in is zero, this is just an unsigned add with overflow. - if (match(CarryIn, PatternMatch::m_ZeroInt())) { - Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, - {Op1, Op2}); - // The types have to be adjusted to match the x86 call types. - Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); - Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), - Builder.getInt8Ty()); - Value *Res = UndefValue::get(RetTy); - Res = Builder.CreateInsertValue(Res, UAddOV, 0); - return Builder.CreateInsertValue(Res, UAddResult, 1); - } - - return nullptr; -} - -static Value *simplifyX86insertps(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); - if (!CInt) - return nullptr; - - auto *VecTy = cast<FixedVectorType>(II.getType()); - assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); - - // The immediate permute control byte looks like this: - // [3:0] - zero mask for each 32-bit lane - // [5:4] - select one 32-bit destination lane - // [7:6] - select one 32-bit source lane - - uint8_t Imm = CInt->getZExtValue(); - uint8_t ZMask = Imm & 0xf; - uint8_t DestLane = (Imm >> 4) & 0x3; - uint8_t SourceLane = (Imm >> 6) & 0x3; - - ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); - - // If all zero mask bits are set, this was just a weird way to - // generate a zero vector. - if (ZMask == 0xf) - return ZeroVector; - - // Initialize by passing all of the first source bits through. 
- int ShuffleMask[4] = {0, 1, 2, 3}; - - // We may replace the second operand with the zero vector. - Value *V1 = II.getArgOperand(1); - - if (ZMask) { - // If the zero mask is being used with a single input or the zero mask - // overrides the destination lane, this is a shuffle with the zero vector. - if ((II.getArgOperand(0) == II.getArgOperand(1)) || - (ZMask & (1 << DestLane))) { - V1 = ZeroVector; - // We may still move 32-bits of the first source vector from one lane - // to another. - ShuffleMask[DestLane] = SourceLane; - // The zero mask may override the previous insert operation. - for (unsigned i = 0; i < 4; ++i) - if ((ZMask >> i) & 0x1) - ShuffleMask[i] = i + 4; - } else { - // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? - return nullptr; - } - } else { - // Replace the selected destination lane with the selected source lane. - ShuffleMask[DestLane] = SourceLane + 4; - } - - return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); -} - -/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding -/// or conversion to a shuffle vector. -static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, - ConstantInt *CILength, ConstantInt *CIIndex, - InstCombiner::BuilderTy &Builder) { - auto LowConstantHighUndef = [&](uint64_t Val) { - Type *IntTy64 = Type::getInt64Ty(II.getContext()); - Constant *Args[] = {ConstantInt::get(IntTy64, Val), - UndefValue::get(IntTy64)}; - return ConstantVector::get(Args); - }; - - // See if we're dealing with constant values. - Constant *C0 = dyn_cast<Constant>(Op0); - ConstantInt *CI0 = - C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) - : nullptr; - - // Attempt to constant fold. - if (CILength && CIIndex) { - // From AMD documentation: "The bit index and field length are each six - // bits in length other bits of the field are ignored." - APInt APIndex = CIIndex->getValue().zextOrTrunc(6); - APInt APLength = CILength->getValue().zextOrTrunc(6); - - unsigned Index = APIndex.getZExtValue(); - - // From AMD documentation: "a value of zero in the field length is - // defined as length of 64". - unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); - - // From AMD documentation: "If the sum of the bit index + length field - // is greater than 64, the results are undefined". - unsigned End = Index + Length; - - // Note that both field index and field length are 8-bit quantities. - // Since variables 'Index' and 'Length' are unsigned values - // obtained from zero-extending field index and field length - // respectively, their sum should never wrap around. - if (End > 64) - return UndefValue::get(II.getType()); - - // If we are inserting whole bytes, we can convert this to a shuffle. - // Lowering can recognize EXTRQI shuffle masks. - if ((Length % 8) == 0 && (Index % 8) == 0) { - // Convert bit indices to byte indices. 
- Length /= 8; - Index /= 8; - - Type *IntTy8 = Type::getInt8Ty(II.getContext()); - auto *ShufTy = FixedVectorType::get(IntTy8, 16); - - SmallVector<int, 16> ShuffleMask; - for (int i = 0; i != (int)Length; ++i) - ShuffleMask.push_back(i + Index); - for (int i = Length; i != 8; ++i) - ShuffleMask.push_back(i + 16); - for (int i = 8; i != 16; ++i) - ShuffleMask.push_back(-1); - - Value *SV = Builder.CreateShuffleVector( - Builder.CreateBitCast(Op0, ShufTy), - ConstantAggregateZero::get(ShufTy), ShuffleMask); - return Builder.CreateBitCast(SV, II.getType()); - } - - // Constant Fold - shift Index'th bit to lowest position and mask off - // Length bits. - if (CI0) { - APInt Elt = CI0->getValue(); - Elt.lshrInPlace(Index); - Elt = Elt.zextOrTrunc(Length); - return LowConstantHighUndef(Elt.getZExtValue()); - } - - // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. - if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { - Value *Args[] = {Op0, CILength, CIIndex}; - Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); - return Builder.CreateCall(F, Args); - } - } - - // Constant Fold - extraction from zero is always {zero, undef}. - if (CI0 && CI0->isZero()) - return LowConstantHighUndef(0); - - return nullptr; -} - -/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant -/// folding or conversion to a shuffle vector. -static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, - APInt APLength, APInt APIndex, - InstCombiner::BuilderTy &Builder) { - // From AMD documentation: "The bit index and field length are each six bits - // in length other bits of the field are ignored." - APIndex = APIndex.zextOrTrunc(6); - APLength = APLength.zextOrTrunc(6); - - // Attempt to constant fold. - unsigned Index = APIndex.getZExtValue(); - - // From AMD documentation: "a value of zero in the field length is - // defined as length of 64". - unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); - - // From AMD documentation: "If the sum of the bit index + length field - // is greater than 64, the results are undefined". - unsigned End = Index + Length; - - // Note that both field index and field length are 8-bit quantities. - // Since variables 'Index' and 'Length' are unsigned values - // obtained from zero-extending field index and field length - // respectively, their sum should never wrap around. - if (End > 64) - return UndefValue::get(II.getType()); - - // If we are inserting whole bytes, we can convert this to a shuffle. - // Lowering can recognize INSERTQI shuffle masks. - if ((Length % 8) == 0 && (Index % 8) == 0) { - // Convert bit indices to byte indices. - Length /= 8; - Index /= 8; - - Type *IntTy8 = Type::getInt8Ty(II.getContext()); - auto *ShufTy = FixedVectorType::get(IntTy8, 16); - - SmallVector<int, 16> ShuffleMask; - for (int i = 0; i != (int)Index; ++i) - ShuffleMask.push_back(i); - for (int i = 0; i != (int)Length; ++i) - ShuffleMask.push_back(i + 16); - for (int i = Index + Length; i != 8; ++i) - ShuffleMask.push_back(i); - for (int i = 8; i != 16; ++i) - ShuffleMask.push_back(-1); - - Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), - Builder.CreateBitCast(Op1, ShufTy), - ShuffleMask); - return Builder.CreateBitCast(SV, II.getType()); - } - - // See if we're dealing with constant values. - Constant *C0 = dyn_cast<Constant>(Op0); - Constant *C1 = dyn_cast<Constant>(Op1); - ConstantInt *CI00 = - C0 ? 
dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) - : nullptr; - ConstantInt *CI10 = - C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) - : nullptr; - - // Constant Fold - insert bottom Length bits starting at the Index'th bit. - if (CI00 && CI10) { - APInt V00 = CI00->getValue(); - APInt V10 = CI10->getValue(); - APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); - V00 = V00 & ~Mask; - V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); - APInt Val = V00 | V10; - Type *IntTy64 = Type::getInt64Ty(II.getContext()); - Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), - UndefValue::get(IntTy64)}; - return ConstantVector::get(Args); - } - - // If we were an INSERTQ call, we'll save demanded elements if we convert to - // INSERTQI. - if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { - Type *IntTy8 = Type::getInt8Ty(II.getContext()); - Constant *CILength = ConstantInt::get(IntTy8, Length, false); - Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); - - Value *Args[] = {Op0, Op1, CILength, CIIndex}; - Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); - return Builder.CreateCall(F, Args); - } - - return nullptr; -} - -/// Attempt to convert pshufb* to shufflevector if the mask is constant. -static Value *simplifyX86pshufb(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); - if (!V) - return nullptr; - - auto *VecTy = cast<FixedVectorType>(II.getType()); - unsigned NumElts = VecTy->getNumElements(); - assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && - "Unexpected number of elements in shuffle mask!"); - - // Construct a shuffle mask from constant integers or UNDEFs. - int Indexes[64]; - - // Each byte in the shuffle control mask forms an index to permute the - // corresponding byte in the destination operand. - for (unsigned I = 0; I < NumElts; ++I) { - Constant *COp = V->getAggregateElement(I); - if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) - return nullptr; - - if (isa<UndefValue>(COp)) { - Indexes[I] = -1; - continue; - } - - int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); - - // If the most significant bit (bit[7]) of each byte of the shuffle - // control mask is set, then zero is written in the result byte. - // The zero vector is in the right-hand side of the resulting - // shufflevector. - - // The value of each index for the high 128-bit lane is the least - // significant 4 bits of the respective shuffle control byte. - Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0); - Indexes[I] = Index; - } - - auto V1 = II.getArgOperand(0); - auto V2 = Constant::getNullValue(VecTy); - return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts)); -} - -/// Attempt to convert vpermilvar* to shufflevector if the mask is constant. -static Value *simplifyX86vpermilvar(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); - if (!V) - return nullptr; - - auto *VecTy = cast<FixedVectorType>(II.getType()); - unsigned NumElts = VecTy->getNumElements(); - bool IsPD = VecTy->getScalarType()->isDoubleTy(); - unsigned NumLaneElts = IsPD ? 2 : 4; - assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); - - // Construct a shuffle mask from constant integers or UNDEFs. - int Indexes[16]; - - // The intrinsics only read one or two bits, clear the rest. 
- for (unsigned I = 0; I < NumElts; ++I) { - Constant *COp = V->getAggregateElement(I); - if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) - return nullptr; - - if (isa<UndefValue>(COp)) { - Indexes[I] = -1; - continue; - } - - APInt Index = cast<ConstantInt>(COp)->getValue(); - Index = Index.zextOrTrunc(32).getLoBits(2); - - // The PD variants uses bit 1 to select per-lane element index, so - // shift down to convert to generic shuffle mask index. - if (IsPD) - Index.lshrInPlace(1); - - // The _256 variants are a bit trickier since the mask bits always index - // into the corresponding 128 half. In order to convert to a generic - // shuffle, we have to make that explicit. - Index += APInt(32, (I / NumLaneElts) * NumLaneElts); - - Indexes[I] = Index.getZExtValue(); - } - - auto V1 = II.getArgOperand(0); - return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts)); -} - -/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. -static Value *simplifyX86vpermv(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - auto *V = dyn_cast<Constant>(II.getArgOperand(1)); - if (!V) - return nullptr; - - auto *VecTy = cast<FixedVectorType>(II.getType()); - unsigned Size = VecTy->getNumElements(); - assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && - "Unexpected shuffle mask size"); - - // Construct a shuffle mask from constant integers or UNDEFs. - int Indexes[64]; - - for (unsigned I = 0; I < Size; ++I) { - Constant *COp = V->getAggregateElement(I); - if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) - return nullptr; - - if (isa<UndefValue>(COp)) { - Indexes[I] = -1; - continue; - } - - uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); - Index &= Size - 1; - Indexes[I] = Index; - } - - auto V1 = II.getArgOperand(0); - return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size)); -} - -Optional<Instruction *> -X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { - auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width, - unsigned DemandedWidth) { - APInt UndefElts(Width, 0); - APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); - return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); - }; - - Intrinsic::ID IID = II.getIntrinsicID(); - switch (IID) { - case Intrinsic::x86_bmi_bextr_32: - case Intrinsic::x86_bmi_bextr_64: - case Intrinsic::x86_tbm_bextri_u32: - case Intrinsic::x86_tbm_bextri_u64: - // If the RHS is a constant we can try some simplifications. - if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { - uint64_t Shift = C->getZExtValue(); - uint64_t Length = (Shift >> 8) & 0xff; - Shift &= 0xff; - unsigned BitWidth = II.getType()->getIntegerBitWidth(); - // If the length is 0 or the shift is out of range, replace with zero. - if (Length == 0 || Shift >= BitWidth) { - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); - } - // If the LHS is also a constant, we can completely constant fold this. - if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { - uint64_t Result = InC->getZExtValue() >> Shift; - if (Length > BitWidth) - Length = BitWidth; - Result &= maskTrailingOnes<uint64_t>(Length); - return IC.replaceInstUsesWith(II, - ConstantInt::get(II.getType(), Result)); - } - // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we - // are only masking bits that a shift already cleared? 
- } - break; - - case Intrinsic::x86_bmi_bzhi_32: - case Intrinsic::x86_bmi_bzhi_64: - // If the RHS is a constant we can try some simplifications. - if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { - uint64_t Index = C->getZExtValue() & 0xff; - unsigned BitWidth = II.getType()->getIntegerBitWidth(); - if (Index >= BitWidth) { - return IC.replaceInstUsesWith(II, II.getArgOperand(0)); - } - if (Index == 0) { - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); - } - // If the LHS is also a constant, we can completely constant fold this. - if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { - uint64_t Result = InC->getZExtValue(); - Result &= maskTrailingOnes<uint64_t>(Index); - return IC.replaceInstUsesWith(II, - ConstantInt::get(II.getType(), Result)); - } - // TODO should we convert this to an AND if the RHS is constant? - } - break; - case Intrinsic::x86_bmi_pext_32: - case Intrinsic::x86_bmi_pext_64: - if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { - if (MaskC->isNullValue()) { - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); - } - if (MaskC->isAllOnesValue()) { - return IC.replaceInstUsesWith(II, II.getArgOperand(0)); - } - - if (MaskC->getValue().isShiftedMask()) { - // any single contingous sequence of 1s anywhere in the mask simply - // describes a subset of the input bits shifted to the appropriate - // position. Replace with the straight forward IR. - unsigned ShiftAmount = MaskC->getValue().countTrailingZeros(); - Value *Input = II.getArgOperand(0); - Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1)); - Value *Shifted = IC.Builder.CreateLShr(Masked, - ConstantInt::get(II.getType(), - ShiftAmount)); - return IC.replaceInstUsesWith(II, Shifted); - } - - - if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { - uint64_t Src = SrcC->getZExtValue(); - uint64_t Mask = MaskC->getZExtValue(); - uint64_t Result = 0; - uint64_t BitToSet = 1; - - while (Mask) { - // Isolate lowest set bit. - uint64_t BitToTest = Mask & -Mask; - if (BitToTest & Src) - Result |= BitToSet; - - BitToSet <<= 1; - // Clear lowest set bit. - Mask &= Mask - 1; - } - - return IC.replaceInstUsesWith(II, - ConstantInt::get(II.getType(), Result)); - } - } - break; - case Intrinsic::x86_bmi_pdep_32: - case Intrinsic::x86_bmi_pdep_64: - if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { - if (MaskC->isNullValue()) { - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); - } - if (MaskC->isAllOnesValue()) { - return IC.replaceInstUsesWith(II, II.getArgOperand(0)); - } - if (MaskC->getValue().isShiftedMask()) { - // any single contingous sequence of 1s anywhere in the mask simply - // describes a subset of the input bits shifted to the appropriate - // position. Replace with the straight forward IR. - unsigned ShiftAmount = MaskC->getValue().countTrailingZeros(); - Value *Input = II.getArgOperand(0); - Value *Shifted = IC.Builder.CreateShl(Input, - ConstantInt::get(II.getType(), - ShiftAmount)); - Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1)); - return IC.replaceInstUsesWith(II, Masked); - } - - if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { - uint64_t Src = SrcC->getZExtValue(); - uint64_t Mask = MaskC->getZExtValue(); - uint64_t Result = 0; - uint64_t BitToTest = 1; - - while (Mask) { - // Isolate lowest set bit. 
- uint64_t BitToSet = Mask & -Mask; - if (BitToTest & Src) - Result |= BitToSet; - - BitToTest <<= 1; - // Clear lowest set bit; - Mask &= Mask - 1; - } - - return IC.replaceInstUsesWith(II, - ConstantInt::get(II.getType(), Result)); - } - } - break; - - case Intrinsic::x86_sse_cvtss2si: - case Intrinsic::x86_sse_cvtss2si64: - case Intrinsic::x86_sse_cvttss2si: - case Intrinsic::x86_sse_cvttss2si64: - case Intrinsic::x86_sse2_cvtsd2si: - case Intrinsic::x86_sse2_cvtsd2si64: - case Intrinsic::x86_sse2_cvttsd2si: - case Intrinsic::x86_sse2_cvttsd2si64: - case Intrinsic::x86_avx512_vcvtss2si32: - case Intrinsic::x86_avx512_vcvtss2si64: - case Intrinsic::x86_avx512_vcvtss2usi32: - case Intrinsic::x86_avx512_vcvtss2usi64: - case Intrinsic::x86_avx512_vcvtsd2si32: - case Intrinsic::x86_avx512_vcvtsd2si64: - case Intrinsic::x86_avx512_vcvtsd2usi32: - case Intrinsic::x86_avx512_vcvtsd2usi64: - case Intrinsic::x86_avx512_cvttss2si: - case Intrinsic::x86_avx512_cvttss2si64: - case Intrinsic::x86_avx512_cvttss2usi: - case Intrinsic::x86_avx512_cvttss2usi64: - case Intrinsic::x86_avx512_cvttsd2si: - case Intrinsic::x86_avx512_cvttsd2si64: - case Intrinsic::x86_avx512_cvttsd2usi: - case Intrinsic::x86_avx512_cvttsd2usi64: { - // These intrinsics only demand the 0th element of their input vectors. If - // we can simplify the input based on that, do so now. - Value *Arg = II.getArgOperand(0); - unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements(); - if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { - return IC.replaceOperand(II, 0, V); - } - break; - } - - case Intrinsic::x86_mmx_pmovmskb: - case Intrinsic::x86_sse_movmsk_ps: - case Intrinsic::x86_sse2_movmsk_pd: - case Intrinsic::x86_sse2_pmovmskb_128: - case Intrinsic::x86_avx_movmsk_pd_256: - case Intrinsic::x86_avx_movmsk_ps_256: - case Intrinsic::x86_avx2_pmovmskb: - if (Value *V = simplifyX86movmsk(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_sse_comieq_ss: - case Intrinsic::x86_sse_comige_ss: - case Intrinsic::x86_sse_comigt_ss: - case Intrinsic::x86_sse_comile_ss: - case Intrinsic::x86_sse_comilt_ss: - case Intrinsic::x86_sse_comineq_ss: - case Intrinsic::x86_sse_ucomieq_ss: - case Intrinsic::x86_sse_ucomige_ss: - case Intrinsic::x86_sse_ucomigt_ss: - case Intrinsic::x86_sse_ucomile_ss: - case Intrinsic::x86_sse_ucomilt_ss: - case Intrinsic::x86_sse_ucomineq_ss: - case Intrinsic::x86_sse2_comieq_sd: - case Intrinsic::x86_sse2_comige_sd: - case Intrinsic::x86_sse2_comigt_sd: - case Intrinsic::x86_sse2_comile_sd: - case Intrinsic::x86_sse2_comilt_sd: - case Intrinsic::x86_sse2_comineq_sd: - case Intrinsic::x86_sse2_ucomieq_sd: - case Intrinsic::x86_sse2_ucomige_sd: - case Intrinsic::x86_sse2_ucomigt_sd: - case Intrinsic::x86_sse2_ucomile_sd: - case Intrinsic::x86_sse2_ucomilt_sd: - case Intrinsic::x86_sse2_ucomineq_sd: - case Intrinsic::x86_avx512_vcomi_ss: - case Intrinsic::x86_avx512_vcomi_sd: - case Intrinsic::x86_avx512_mask_cmp_ss: - case Intrinsic::x86_avx512_mask_cmp_sd: { - // These intrinsics only demand the 0th element of their input vectors. If - // we can simplify the input based on that, do so now. 
- bool MadeChange = false; - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements(); - if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { - IC.replaceOperand(II, 0, V); - MadeChange = true; - } - if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { - IC.replaceOperand(II, 1, V); - MadeChange = true; - } - if (MadeChange) { - return &II; - } - break; - } - - case Intrinsic::x86_avx512_add_ps_512: - case Intrinsic::x86_avx512_div_ps_512: - case Intrinsic::x86_avx512_mul_ps_512: - case Intrinsic::x86_avx512_sub_ps_512: - case Intrinsic::x86_avx512_add_pd_512: - case Intrinsic::x86_avx512_div_pd_512: - case Intrinsic::x86_avx512_mul_pd_512: - case Intrinsic::x86_avx512_sub_pd_512: - // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular - // IR operations. - if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) { - if (R->getValue() == 4) { - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - - Value *V; - switch (IID) { - default: - llvm_unreachable("Case stmts out of sync!"); - case Intrinsic::x86_avx512_add_ps_512: - case Intrinsic::x86_avx512_add_pd_512: - V = IC.Builder.CreateFAdd(Arg0, Arg1); - break; - case Intrinsic::x86_avx512_sub_ps_512: - case Intrinsic::x86_avx512_sub_pd_512: - V = IC.Builder.CreateFSub(Arg0, Arg1); - break; - case Intrinsic::x86_avx512_mul_ps_512: - case Intrinsic::x86_avx512_mul_pd_512: - V = IC.Builder.CreateFMul(Arg0, Arg1); - break; - case Intrinsic::x86_avx512_div_ps_512: - case Intrinsic::x86_avx512_div_pd_512: - V = IC.Builder.CreateFDiv(Arg0, Arg1); - break; - } - - return IC.replaceInstUsesWith(II, V); - } - } - break; - - case Intrinsic::x86_avx512_mask_add_ss_round: - case Intrinsic::x86_avx512_mask_div_ss_round: - case Intrinsic::x86_avx512_mask_mul_ss_round: - case Intrinsic::x86_avx512_mask_sub_ss_round: - case Intrinsic::x86_avx512_mask_add_sd_round: - case Intrinsic::x86_avx512_mask_div_sd_round: - case Intrinsic::x86_avx512_mask_mul_sd_round: - case Intrinsic::x86_avx512_mask_sub_sd_round: - // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular - // IR operations. - if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) { - if (R->getValue() == 4) { - // Extract the element as scalars. - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); - Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); - - Value *V; - switch (IID) { - default: - llvm_unreachable("Case stmts out of sync!"); - case Intrinsic::x86_avx512_mask_add_ss_round: - case Intrinsic::x86_avx512_mask_add_sd_round: - V = IC.Builder.CreateFAdd(LHS, RHS); - break; - case Intrinsic::x86_avx512_mask_sub_ss_round: - case Intrinsic::x86_avx512_mask_sub_sd_round: - V = IC.Builder.CreateFSub(LHS, RHS); - break; - case Intrinsic::x86_avx512_mask_mul_ss_round: - case Intrinsic::x86_avx512_mask_mul_sd_round: - V = IC.Builder.CreateFMul(LHS, RHS); - break; - case Intrinsic::x86_avx512_mask_div_ss_round: - case Intrinsic::x86_avx512_mask_div_sd_round: - V = IC.Builder.CreateFDiv(LHS, RHS); - break; - } - - // Handle the masking aspect of the intrinsic. - Value *Mask = II.getArgOperand(3); - auto *C = dyn_cast<ConstantInt>(Mask); - // We don't need a select if we know the mask bit is a 1. - if (!C || !C->getValue()[0]) { - // Cast the mask to an i1 vector and then extract the lowest element. 
- auto *MaskTy = FixedVectorType::get( - IC.Builder.getInt1Ty(), - cast<IntegerType>(Mask->getType())->getBitWidth()); - Mask = IC.Builder.CreateBitCast(Mask, MaskTy); - Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0); - // Extract the lowest element from the passthru operand. - Value *Passthru = - IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0); - V = IC.Builder.CreateSelect(Mask, V, Passthru); - } - - // Insert the result back into the original argument 0. - V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0); - - return IC.replaceInstUsesWith(II, V); - } - } - break; - - // Constant fold ashr( <A x Bi>, Ci ). - // Constant fold lshr( <A x Bi>, Ci ). - // Constant fold shl( <A x Bi>, Ci ). - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx512_psrai_q_128: - case Intrinsic::x86_avx512_psrai_q_256: - case Intrinsic::x86_avx512_psrai_d_512: - case Intrinsic::x86_avx512_psrai_q_512: - case Intrinsic::x86_avx512_psrai_w_512: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx512_psrli_d_512: - case Intrinsic::x86_avx512_psrli_q_512: - case Intrinsic::x86_avx512_psrli_w_512: - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx512_pslli_d_512: - case Intrinsic::x86_avx512_pslli_q_512: - case Intrinsic::x86_avx512_pslli_w_512: - if (Value *V = simplifyX86immShift(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_avx2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx512_psra_q_128: - case Intrinsic::x86_avx512_psra_q_256: - case Intrinsic::x86_avx512_psra_d_512: - case Intrinsic::x86_avx512_psra_q_512: - case Intrinsic::x86_avx512_psra_w_512: - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx512_psrl_d_512: - case Intrinsic::x86_avx512_psrl_q_512: - case Intrinsic::x86_avx512_psrl_w_512: - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx512_psll_d_512: - case Intrinsic::x86_avx512_psll_q_512: - case Intrinsic::x86_avx512_psll_w_512: { - if (Value *V = simplifyX86immShift(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - - // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector - // operand to compute the shift amount. 
- Value *Arg1 = II.getArgOperand(1); - assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && - "Unexpected packed shift size"); - unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements(); - - if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { - return IC.replaceOperand(II, 1, V); - } - break; - } - - case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_q_256: - case Intrinsic::x86_avx512_psllv_d_512: - case Intrinsic::x86_avx512_psllv_q_512: - case Intrinsic::x86_avx512_psllv_w_128: - case Intrinsic::x86_avx512_psllv_w_256: - case Intrinsic::x86_avx512_psllv_w_512: - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: - case Intrinsic::x86_avx512_psrav_q_128: - case Intrinsic::x86_avx512_psrav_q_256: - case Intrinsic::x86_avx512_psrav_d_512: - case Intrinsic::x86_avx512_psrav_q_512: - case Intrinsic::x86_avx512_psrav_w_128: - case Intrinsic::x86_avx512_psrav_w_256: - case Intrinsic::x86_avx512_psrav_w_512: - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_q_256: - case Intrinsic::x86_avx512_psrlv_d_512: - case Intrinsic::x86_avx512_psrlv_q_512: - case Intrinsic::x86_avx512_psrlv_w_128: - case Intrinsic::x86_avx512_psrlv_w_256: - case Intrinsic::x86_avx512_psrlv_w_512: - if (Value *V = simplifyX86varShift(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_sse2_packssdw_128: - case Intrinsic::x86_sse2_packsswb_128: - case Intrinsic::x86_avx2_packssdw: - case Intrinsic::x86_avx2_packsswb: - case Intrinsic::x86_avx512_packssdw_512: - case Intrinsic::x86_avx512_packsswb_512: - if (Value *V = simplifyX86pack(II, IC.Builder, true)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_sse2_packuswb_128: - case Intrinsic::x86_sse41_packusdw: - case Intrinsic::x86_avx2_packusdw: - case Intrinsic::x86_avx2_packuswb: - case Intrinsic::x86_avx512_packusdw_512: - case Intrinsic::x86_avx512_packuswb_512: - if (Value *V = simplifyX86pack(II, IC.Builder, false)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_pclmulqdq: - case Intrinsic::x86_pclmulqdq_256: - case Intrinsic::x86_pclmulqdq_512: { - if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) { - unsigned Imm = C->getZExtValue(); - - bool MadeChange = false; - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - unsigned VWidth = - cast<FixedVectorType>(Arg0->getType())->getNumElements(); - - APInt UndefElts1(VWidth, 0); - APInt DemandedElts1 = - APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1)); - if (Value *V = - IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { - IC.replaceOperand(II, 0, V); - MadeChange = true; - } - - APInt UndefElts2(VWidth, 0); - APInt DemandedElts2 = - APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); - if (Value *V = - IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { - IC.replaceOperand(II, 1, V); - MadeChange = true; - } - - // If either input elements are undef, the result is zero. 
- if (DemandedElts1.isSubsetOf(UndefElts1) || - DemandedElts2.isSubsetOf(UndefElts2)) { - return IC.replaceInstUsesWith(II, - ConstantAggregateZero::get(II.getType())); - } - - if (MadeChange) { - return &II; - } - } - break; - } - - case Intrinsic::x86_sse41_insertps: - if (Value *V = simplifyX86insertps(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_sse4a_extrq: { - Value *Op0 = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); - unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && - Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && - VWidth1 == 16 && "Unexpected operand sizes"); - - // See if we're dealing with constant values. - Constant *C1 = dyn_cast<Constant>(Op1); - ConstantInt *CILength = - C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) - : nullptr; - ConstantInt *CIIndex = - C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) - : nullptr; - - // Attempt to simplify to a constant, shuffle vector or EXTRQI call. - if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - - // EXTRQ only uses the lowest 64-bits of the first 128-bit vector - // operands and the lowest 16-bits of the second. - bool MadeChange = false; - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { - IC.replaceOperand(II, 0, V); - MadeChange = true; - } - if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { - IC.replaceOperand(II, 1, V); - MadeChange = true; - } - if (MadeChange) { - return &II; - } - break; - } - - case Intrinsic::x86_sse4a_extrqi: { - // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining - // bits of the lower 64-bits. The upper 64-bits are undefined. - Value *Op0 = II.getArgOperand(0); - unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && - "Unexpected operand size"); - - // See if we're dealing with constant values. - ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1)); - ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2)); - - // Attempt to simplify to a constant or shuffle vector. - if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - - // EXTRQI only uses the lowest 64-bits of the first 128-bit vector - // operand. - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { - return IC.replaceOperand(II, 0, V); - } - break; - } - - case Intrinsic::x86_sse4a_insertq: { - Value *Op0 = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && - Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && - cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 && - "Unexpected operand size"); - - // See if we're dealing with constant values. - Constant *C1 = dyn_cast<Constant>(Op1); - ConstantInt *CI11 = - C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) - : nullptr; - - // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 
- if (CI11) { - const APInt &V11 = CI11->getValue(); - APInt Len = V11.zextOrTrunc(6); - APInt Idx = V11.lshr(8).zextOrTrunc(6); - if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - } - - // INSERTQ only uses the lowest 64-bits of the first 128-bit vector - // operand. - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { - return IC.replaceOperand(II, 0, V); - } - break; - } - - case Intrinsic::x86_sse4a_insertqi: { - // INSERTQI: Extract lowest Length bits from lower half of second source and - // insert over first source starting at Index bit. The upper 64-bits are - // undefined. - Value *Op0 = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); - unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && - Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && - VWidth1 == 2 && "Unexpected operand sizes"); - - // See if we're dealing with constant values. - ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2)); - ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3)); - - // Attempt to simplify to a constant or shuffle vector. - if (CILength && CIIndex) { - APInt Len = CILength->getValue().zextOrTrunc(6); - APInt Idx = CIIndex->getValue().zextOrTrunc(6); - if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - } - - // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector - // operands. - bool MadeChange = false; - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { - IC.replaceOperand(II, 0, V); - MadeChange = true; - } - if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { - IC.replaceOperand(II, 1, V); - MadeChange = true; - } - if (MadeChange) { - return &II; - } - break; - } - - case Intrinsic::x86_sse41_pblendvb: - case Intrinsic::x86_sse41_blendvps: - case Intrinsic::x86_sse41_blendvpd: - case Intrinsic::x86_avx_blendv_ps_256: - case Intrinsic::x86_avx_blendv_pd_256: - case Intrinsic::x86_avx2_pblendvb: { - // fold (blend A, A, Mask) -> A - Value *Op0 = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - Value *Mask = II.getArgOperand(2); - if (Op0 == Op1) { - return IC.replaceInstUsesWith(II, Op0); - } - - // Zero Mask - select 1st argument. - if (isa<ConstantAggregateZero>(Mask)) { - return IC.replaceInstUsesWith(II, Op0); - } - - // Constant Mask - select 1st/2nd argument lane based on top bit of mask. - if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { - Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); - return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); - } - - // Convert to a vector select if we can bypass casts and find a boolean - // vector condition value. 
- Value *BoolVec; - Mask = InstCombiner::peekThroughBitcast(Mask); - if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) && - BoolVec->getType()->isVectorTy() && - BoolVec->getType()->getScalarSizeInBits() == 1) { - assert(Mask->getType()->getPrimitiveSizeInBits() == - II.getType()->getPrimitiveSizeInBits() && - "Not expecting mask and operands with different sizes"); - - unsigned NumMaskElts = - cast<FixedVectorType>(Mask->getType())->getNumElements(); - unsigned NumOperandElts = - cast<FixedVectorType>(II.getType())->getNumElements(); - if (NumMaskElts == NumOperandElts) { - return SelectInst::Create(BoolVec, Op1, Op0); - } - - // If the mask has less elements than the operands, each mask bit maps to - // multiple elements of the operands. Bitcast back and forth. - if (NumMaskElts < NumOperandElts) { - Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType()); - Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType()); - Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0); - return new BitCastInst(Sel, II.getType()); - } - } - - break; - } - - case Intrinsic::x86_ssse3_pshuf_b_128: - case Intrinsic::x86_avx2_pshuf_b: - case Intrinsic::x86_avx512_pshuf_b_512: - if (Value *V = simplifyX86pshufb(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_avx_vpermilvar_ps: - case Intrinsic::x86_avx_vpermilvar_ps_256: - case Intrinsic::x86_avx512_vpermilvar_ps_512: - case Intrinsic::x86_avx_vpermilvar_pd: - case Intrinsic::x86_avx_vpermilvar_pd_256: - case Intrinsic::x86_avx512_vpermilvar_pd_512: - if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_avx2_permd: - case Intrinsic::x86_avx2_permps: - case Intrinsic::x86_avx512_permvar_df_256: - case Intrinsic::x86_avx512_permvar_df_512: - case Intrinsic::x86_avx512_permvar_di_256: - case Intrinsic::x86_avx512_permvar_di_512: - case Intrinsic::x86_avx512_permvar_hi_128: - case Intrinsic::x86_avx512_permvar_hi_256: - case Intrinsic::x86_avx512_permvar_hi_512: - case Intrinsic::x86_avx512_permvar_qi_128: - case Intrinsic::x86_avx512_permvar_qi_256: - case Intrinsic::x86_avx512_permvar_qi_512: - case Intrinsic::x86_avx512_permvar_sf_512: - case Intrinsic::x86_avx512_permvar_si_512: - if (Value *V = simplifyX86vpermv(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_avx_maskload_ps: - case Intrinsic::x86_avx_maskload_pd: - case Intrinsic::x86_avx_maskload_ps_256: - case Intrinsic::x86_avx_maskload_pd_256: - case Intrinsic::x86_avx2_maskload_d: - case Intrinsic::x86_avx2_maskload_q: - case Intrinsic::x86_avx2_maskload_d_256: - case Intrinsic::x86_avx2_maskload_q_256: - if (Instruction *I = simplifyX86MaskedLoad(II, IC)) { - return I; - } - break; - - case Intrinsic::x86_sse2_maskmov_dqu: - case Intrinsic::x86_avx_maskstore_ps: - case Intrinsic::x86_avx_maskstore_pd: - case Intrinsic::x86_avx_maskstore_ps_256: - case Intrinsic::x86_avx_maskstore_pd_256: - case Intrinsic::x86_avx2_maskstore_d: - case Intrinsic::x86_avx2_maskstore_q: - case Intrinsic::x86_avx2_maskstore_d_256: - case Intrinsic::x86_avx2_maskstore_q_256: - if (simplifyX86MaskedStore(II, IC)) { - return nullptr; - } - break; - - case Intrinsic::x86_addcarry_32: - case Intrinsic::x86_addcarry_64: - if (Value *V = simplifyX86addcarry(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - default: - break; - } - return None; -} - -Optional<Value *> 
X86TTIImpl::simplifyDemandedUseBitsIntrinsic( - InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, - bool &KnownBitsComputed) const { - switch (II.getIntrinsicID()) { - default: - break; - case Intrinsic::x86_mmx_pmovmskb: - case Intrinsic::x86_sse_movmsk_ps: - case Intrinsic::x86_sse2_movmsk_pd: - case Intrinsic::x86_sse2_pmovmskb_128: - case Intrinsic::x86_avx_movmsk_ps_256: - case Intrinsic::x86_avx_movmsk_pd_256: - case Intrinsic::x86_avx2_pmovmskb: { - // MOVMSK copies the vector elements' sign bits to the low bits - // and zeros the high bits. - unsigned ArgWidth; - if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { - ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. - } else { - auto Arg = II.getArgOperand(0); - auto ArgType = cast<FixedVectorType>(Arg->getType()); - ArgWidth = ArgType->getNumElements(); - } - - // If we don't need any of low bits then return zero, - // we know that DemandedMask is non-zero already. - APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); - Type *VTy = II.getType(); - if (DemandedElts.isNullValue()) { - return ConstantInt::getNullValue(VTy); - } - - // We know that the upper bits are set to zero. - Known.Zero.setBitsFrom(ArgWidth); - KnownBitsComputed = true; - break; - } - } - return None; -} - -Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( - InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, - APInt &UndefElts2, APInt &UndefElts3, - std::function<void(Instruction *, unsigned, APInt, APInt &)> - simplifyAndSetOp) const { - unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements(); - switch (II.getIntrinsicID()) { - default: - break; - case Intrinsic::x86_xop_vfrcz_ss: - case Intrinsic::x86_xop_vfrcz_sd: - // The instructions for these intrinsics are speced to zero upper bits not - // pass them through like other scalar intrinsics. So we shouldn't just - // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. - // Instead we should return a zero vector. - if (!DemandedElts[0]) { - IC.addToWorklist(&II); - return ConstantAggregateZero::get(II.getType()); - } - - // Only the lower element is used. - DemandedElts = 1; - simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); - - // Only the lower element is undefined. The high elements are zero. - UndefElts = UndefElts[0]; - break; - - // Unary scalar-as-vector operations that work column-wise. - case Intrinsic::x86_sse_rcp_ss: - case Intrinsic::x86_sse_rsqrt_ss: - simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - IC.addToWorklist(&II); - return II.getArgOperand(0); - } - // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions - // checks). - break; - - // Binary scalar-as-vector operations that work column-wise. The high - // elements come from operand 0. The low element is a function of both - // operands. - case Intrinsic::x86_sse_min_ss: - case Intrinsic::x86_sse_max_ss: - case Intrinsic::x86_sse_cmp_ss: - case Intrinsic::x86_sse2_min_sd: - case Intrinsic::x86_sse2_max_sd: - case Intrinsic::x86_sse2_cmp_sd: { - simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - IC.addToWorklist(&II); - return II.getArgOperand(0); - } - - // Only lower element is used for operand 1. 
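
The demanded-elements handling around here relies on the "scalar-as-vector" lane behaviour of these instructions: only lane 0 combines both operands, while the upper lanes are passed through from operand 0, so the upper lanes of operand 1 can never be observed. A small standalone model of that rule for MINSS (illustrative names, NaN and signed-zero corner cases ignored):

// Standalone model of why only lane 0 of operand 1 is demanded for MINSS.
#include <algorithm>
#include <array>
#include <cstdio>

using V4 = std::array<float, 4>;

static V4 minss(const V4 &A, const V4 &B) {
  V4 R = A;                    // lanes 1..3 come straight from operand 0
  R[0] = std::min(A[0], B[0]); // only lane 0 combines both operands
  return R;
}

int main() {
  V4 A = {4.0f, 1.0f, 2.0f, 3.0f};
  V4 B1 = {-1.0f, 100.0f, 200.0f, 300.0f};
  V4 B2 = {-1.0f, -5.0f, -6.0f, -7.0f}; // same lane 0, different upper lanes
  std::printf("%d\n", minss(A, B1) == minss(A, B2)); // prints: 1
  return 0;
}
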
- DemandedElts = 1; - simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); - - // Lower element is undefined if both lower elements are undefined. - // Consider things like undef&0. The result is known zero, not undef. - if (!UndefElts2[0]) - UndefElts.clearBit(0); - - break; - } - - // Binary scalar-as-vector operations that work column-wise. The high - // elements come from operand 0 and the low element comes from operand 1. - case Intrinsic::x86_sse41_round_ss: - case Intrinsic::x86_sse41_round_sd: { - // Don't use the low element of operand 0. - APInt DemandedElts2 = DemandedElts; - DemandedElts2.clearBit(0); - simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - IC.addToWorklist(&II); - return II.getArgOperand(0); - } - - // Only lower element is used for operand 1. - DemandedElts = 1; - simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); - - // Take the high undef elements from operand 0 and take the lower element - // from operand 1. - UndefElts.clearBit(0); - UndefElts |= UndefElts2[0]; - break; - } - - // Three input scalar-as-vector operations that work column-wise. The high - // elements come from operand 0 and the low element is a function of all - // three inputs. - case Intrinsic::x86_avx512_mask_add_ss_round: - case Intrinsic::x86_avx512_mask_div_ss_round: - case Intrinsic::x86_avx512_mask_mul_ss_round: - case Intrinsic::x86_avx512_mask_sub_ss_round: - case Intrinsic::x86_avx512_mask_max_ss_round: - case Intrinsic::x86_avx512_mask_min_ss_round: - case Intrinsic::x86_avx512_mask_add_sd_round: - case Intrinsic::x86_avx512_mask_div_sd_round: - case Intrinsic::x86_avx512_mask_mul_sd_round: - case Intrinsic::x86_avx512_mask_sub_sd_round: - case Intrinsic::x86_avx512_mask_max_sd_round: - case Intrinsic::x86_avx512_mask_min_sd_round: - simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - IC.addToWorklist(&II); - return II.getArgOperand(0); - } - - // Only lower element is used for operand 1 and 2. - DemandedElts = 1; - simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); - simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); - - // Lower element is undefined if all three lower elements are undefined. - // Consider things like undef&0. The result is known zero, not undef. - if (!UndefElts2[0] || !UndefElts3[0]) - UndefElts.clearBit(0); - break; - - // TODO: Add fmaddsub support? - case Intrinsic::x86_sse3_addsub_pd: - case Intrinsic::x86_sse3_addsub_ps: - case Intrinsic::x86_avx_addsub_pd_256: - case Intrinsic::x86_avx_addsub_ps_256: { - // If none of the even or none of the odd lanes are required, turn this - // into a generic FP math instruction. - APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1)); - APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2)); - bool IsSubOnly = DemandedElts.isSubsetOf(SubMask); - bool IsAddOnly = DemandedElts.isSubsetOf(AddMask); - if (IsSubOnly || IsAddOnly) { - assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only"); - IRBuilderBase::InsertPointGuard Guard(IC.Builder); - IC.Builder.SetInsertPoint(&II); - Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1); - return IC.Builder.CreateBinOp( - IsSubOnly ? 
Instruction::FSub : Instruction::FAdd, Arg0, Arg1); - } - - simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); - simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); - UndefElts &= UndefElts2; - break; - } - - case Intrinsic::x86_sse2_packssdw_128: - case Intrinsic::x86_sse2_packsswb_128: - case Intrinsic::x86_sse2_packuswb_128: - case Intrinsic::x86_sse41_packusdw: - case Intrinsic::x86_avx2_packssdw: - case Intrinsic::x86_avx2_packsswb: - case Intrinsic::x86_avx2_packusdw: - case Intrinsic::x86_avx2_packuswb: - case Intrinsic::x86_avx512_packssdw_512: - case Intrinsic::x86_avx512_packsswb_512: - case Intrinsic::x86_avx512_packusdw_512: - case Intrinsic::x86_avx512_packuswb_512: { - auto *Ty0 = II.getArgOperand(0)->getType(); - unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements(); - assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); - - unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; - unsigned VWidthPerLane = VWidth / NumLanes; - unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; - - // Per lane, pack the elements of the first input and then the second. - // e.g. - // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) - // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) - for (int OpNum = 0; OpNum != 2; ++OpNum) { - APInt OpDemandedElts(InnerVWidth, 0); - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - unsigned LaneIdx = Lane * VWidthPerLane; - for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { - unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; - if (DemandedElts[Idx]) - OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); - } - } - - // Demand elements from the operand. - APInt OpUndefElts(InnerVWidth, 0); - simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); - - // Pack the operand's UNDEF elements, one lane at a time. - OpUndefElts = OpUndefElts.zext(VWidth); - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); - LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); - LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); - UndefElts |= LaneElts; - } - } - break; - } - - // PSHUFB - case Intrinsic::x86_ssse3_pshuf_b_128: - case Intrinsic::x86_avx2_pshuf_b: - case Intrinsic::x86_avx512_pshuf_b_512: - // PERMILVAR - case Intrinsic::x86_avx_vpermilvar_ps: - case Intrinsic::x86_avx_vpermilvar_ps_256: - case Intrinsic::x86_avx512_vpermilvar_ps_512: - case Intrinsic::x86_avx_vpermilvar_pd: - case Intrinsic::x86_avx_vpermilvar_pd_256: - case Intrinsic::x86_avx512_vpermilvar_pd_512: - // PERMV - case Intrinsic::x86_avx2_permd: - case Intrinsic::x86_avx2_permps: { - simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); - break; - } - - // SSE4A instructions leave the upper 64-bits of the 128-bit result - // in an undefined state. - case Intrinsic::x86_sse4a_extrq: - case Intrinsic::x86_sse4a_extrqi: - case Intrinsic::x86_sse4a_insertq: - case Intrinsic::x86_sse4a_insertqi: - UndefElts.setHighBits(VWidth / 2); - break; - } - return None; -} +//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// X86 target machine. 
It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#include "X86TargetTransformInfo.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsX86.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86tti" + +/// Return a constant boolean vector that has true elements in all positions +/// where the input constant data vector has an element with the sign bit set. +static Constant *getNegativeIsTrueBoolVec(Constant *V) { + VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType())); + V = ConstantExpr::getBitCast(V, IntTy); + V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy), + V); + return V; +} + +/// Convert the x86 XMM integer vector mask to a vector of bools based on +/// each element's most significant bit (the sign bit). +static Value *getBoolVecFromMask(Value *Mask) { + // Fold Constant Mask. + if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) + return getNegativeIsTrueBoolVec(ConstantMask); + + // Mask was extended from a boolean vector. + Value *ExtMask; + if (PatternMatch::match( + Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) && + ExtMask->getType()->isIntOrIntVectorTy(1)) + return ExtMask; + + return nullptr; +} + +// TODO: If the x86 backend knew how to convert a bool vector mask back to an +// XMM register mask efficiently, we could transform all x86 masked intrinsics +// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. +static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { + Value *Ptr = II.getOperand(0); + Value *Mask = II.getOperand(1); + Constant *ZeroVec = Constant::getNullValue(II.getType()); + + // Zero Mask - masked load instruction creates a zero vector. + if (isa<ConstantAggregateZero>(Mask)) + return IC.replaceInstUsesWith(II, ZeroVec); + + // The mask is constant or extended from a bool vector. Convert this x86 + // intrinsic to the LLVM intrinsic to allow target-independent optimizations. + if (Value *BoolMask = getBoolVecFromMask(Mask)) { + // First, cast the x86 intrinsic scalar pointer to a vector pointer to match + // the LLVM intrinsic definition for the pointer argument. + unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); + PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); + Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); + + // The pass-through vector for an x86 masked load is a zero vector. + CallInst *NewMaskedLoad = + IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec); + return IC.replaceInstUsesWith(II, NewMaskedLoad); + } + + return nullptr; +} + +// TODO: If the x86 backend knew how to convert a bool vector mask back to an +// XMM register mask efficiently, we could transform all x86 masked intrinsics +// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. +static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { + Value *Ptr = II.getOperand(0); + Value *Mask = II.getOperand(1); + Value *Vec = II.getOperand(2); + + // Zero Mask - this masked store instruction does nothing. 
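
For reference, the conversion above to llvm.masked.load (and the store conversion that follows) assumes the usual x86 lane rule: a lane is active when the sign bit of its mask element is set, and an inactive load lane yields zero rather than undef, which is why the pass-through vector is a zero vector. A standalone per-lane sketch with invented helper names (not the IR the builder emits):

// Standalone sketch of the per-lane semantics of x86 masked loads and stores.
#include <cstdint>
#include <cstdio>

static void maskedLoad4(const int32_t *Ptr, const int32_t Mask[4],
                        int32_t Out[4]) {
  for (int i = 0; i != 4; ++i)
    Out[i] = (Mask[i] < 0) ? Ptr[i] : 0; // sign bit set -> load, else zero
}

static void maskedStore4(int32_t *Ptr, const int32_t Mask[4],
                         const int32_t Val[4]) {
  for (int i = 0; i != 4; ++i)
    if (Mask[i] < 0) // sign bit set -> store, else leave memory untouched
      Ptr[i] = Val[i];
}

int main() {
  int32_t Mem[4] = {10, 20, 30, 40};
  int32_t Mask[4] = {-1, 0, -1, 0};
  int32_t Loaded[4];
  maskedLoad4(Mem, Mask, Loaded);
  int32_t NewVal[4] = {100, 200, 300, 400};
  maskedStore4(Mem, Mask, NewVal);
  for (int i = 0; i != 4; ++i)
    std::printf("%d %d\n", Loaded[i], Mem[i]);
  return 0;
}
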
+ if (isa<ConstantAggregateZero>(Mask)) { + IC.eraseInstFromFunction(II); + return true; + } + + // The SSE2 version is too weird (eg, unaligned but non-temporal) to do + // anything else at this level. + if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) + return false; + + // The mask is constant or extended from a bool vector. Convert this x86 + // intrinsic to the LLVM intrinsic to allow target-independent optimizations. + if (Value *BoolMask = getBoolVecFromMask(Mask)) { + unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); + PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); + Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); + + IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); + + // 'Replace uses' doesn't work for stores. Erase the original masked store. + IC.eraseInstFromFunction(II); + return true; + } + + return false; +} + +static Value *simplifyX86immShift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + bool IsImm = false; + + switch (II.getIntrinsicID()) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psrai_q_256: + case Intrinsic::x86_avx512_psrai_d_512: + case Intrinsic::x86_avx512_psrai_q_512: + case Intrinsic::x86_avx512_psrai_w_512: + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: + LogicalShift = false; + ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx512_psrli_d_512: + case Intrinsic::x86_avx512_psrli_q_512: + case Intrinsic::x86_avx512_psrli_w_512: + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: + LogicalShift = true; + ShiftLeft = false; + break; + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx512_pslli_d_512: + case Intrinsic::x86_avx512_pslli_q_512: + case Intrinsic::x86_avx512_pslli_w_512: + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: + LogicalShift = true; + ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) 
&& "Only logical shifts can shift left"); + + auto Vec = II.getArgOperand(0); + auto Amt = II.getArgOperand(1); + auto VT = cast<FixedVectorType>(Vec->getType()); + auto SVT = VT->getElementType(); + auto AmtVT = Amt->getType(); + unsigned VWidth = VT->getNumElements(); + unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + + // If the shift amount is guaranteed to be in-range we can replace it with a + // generic shift. If its guaranteed to be out of range, logical shifts combine + // to zero and arithmetic shifts are clamped to (BitWidth - 1). + if (IsImm) { + assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type"); + KnownBits KnownAmtBits = + llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); + if (KnownAmtBits.getMaxValue().ult(BitWidth)) { + Amt = Builder.CreateZExtOrTrunc(Amt, SVT); + Amt = Builder.CreateVectorSplat(VWidth, Amt); + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + if (KnownAmtBits.getMinValue().uge(BitWidth)) { + if (LogicalShift) + return ConstantAggregateZero::get(VT); + Amt = ConstantInt::get(SVT, BitWidth - 1); + return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); + } + } else { + // Ensure the first element has an in-range value and the rest of the + // elements in the bottom 64 bits are zero. + assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && + cast<VectorType>(AmtVT)->getElementType() == SVT && + "Unexpected shift-by-scalar type"); + unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements(); + APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); + APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); + KnownBits KnownLowerBits = llvm::computeKnownBits( + Amt, DemandedLower, II.getModule()->getDataLayout()); + KnownBits KnownUpperBits = llvm::computeKnownBits( + Amt, DemandedUpper, II.getModule()->getDataLayout()); + if (KnownLowerBits.getMaxValue().ult(BitWidth) && + (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) { + SmallVector<int, 16> ZeroSplat(VWidth, 0); + Amt = Builder.CreateShuffleVector(Amt, ZeroSplat); + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + } + + // Simplify if count is constant vector. + auto CDV = dyn_cast<ConstantDataVector>(Amt); + if (!CDV) + return nullptr; + + // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector + // operand to compute the shift amount. + assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && + cast<VectorType>(AmtVT)->getElementType() == SVT && + "Unexpected shift-by-scalar type"); + + // Concatenate the sub-elements to create the 64-bit value. + APInt Count(64, 0); + for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { + unsigned SubEltIdx = (NumSubElts - 1) - i; + auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); + Count <<= BitWidth; + Count |= SubElt->getValue().zextOrTrunc(64); + } + + // If shift-by-zero then just return the original value. + if (Count.isNullValue()) + return Vec; + + // Handle cases when Shift >= BitWidth. + if (Count.uge(BitWidth)) { + // If LogicalShift - just return zero. + if (LogicalShift) + return ConstantAggregateZero::get(VT); + + // If ArithmeticShift - clamp Shift to (BitWidth - 1). + Count = APInt(64, BitWidth - 1); + } + + // Get a constant vector of the same type as the first operand. 
+ auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); + auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. +// Unlike the generic IR shifts, the intrinsics have defined behaviour for out +// of range shift amounts (logical - set to zero, arithmetic - splat sign bit). +static Value *simplifyX86varShift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + + switch (II.getIntrinsicID()) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + case Intrinsic::x86_avx512_psrav_q_128: + case Intrinsic::x86_avx512_psrav_q_256: + case Intrinsic::x86_avx512_psrav_d_512: + case Intrinsic::x86_avx512_psrav_q_512: + case Intrinsic::x86_avx512_psrav_w_128: + case Intrinsic::x86_avx512_psrav_w_256: + case Intrinsic::x86_avx512_psrav_w_512: + LogicalShift = false; + ShiftLeft = false; + break; + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx512_psrlv_d_512: + case Intrinsic::x86_avx512_psrlv_q_512: + case Intrinsic::x86_avx512_psrlv_w_128: + case Intrinsic::x86_avx512_psrlv_w_256: + case Intrinsic::x86_avx512_psrlv_w_512: + LogicalShift = true; + ShiftLeft = false; + break; + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx512_psllv_d_512: + case Intrinsic::x86_avx512_psllv_q_512: + case Intrinsic::x86_avx512_psllv_w_128: + case Intrinsic::x86_avx512_psllv_w_256: + case Intrinsic::x86_avx512_psllv_w_512: + LogicalShift = true; + ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); + + auto Vec = II.getArgOperand(0); + auto Amt = II.getArgOperand(1); + auto VT = cast<FixedVectorType>(II.getType()); + auto SVT = VT->getElementType(); + int NumElts = VT->getNumElements(); + int BitWidth = SVT->getIntegerBitWidth(); + + // If the shift amount is guaranteed to be in-range we can replace it with a + // generic shift. + APInt UpperBits = + APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth)); + if (llvm::MaskedValueIsZero(Amt, UpperBits, + II.getModule()->getDataLayout())) { + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + + // Simplify if all shift amounts are constant/undef. + auto *CShift = dyn_cast<Constant>(Amt); + if (!CShift) + return nullptr; + + // Collect each element's shift amount. + // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. + bool AnyOutOfRange = false; + SmallVector<int, 8> ShiftAmts; + for (int I = 0; I < NumElts; ++I) { + auto *CElt = CShift->getAggregateElement(I); + if (isa_and_nonnull<UndefValue>(CElt)) { + ShiftAmts.push_back(-1); + continue; + } + + auto *COp = dyn_cast_or_null<ConstantInt>(CElt); + if (!COp) + return nullptr; + + // Handle out of range shifts. + // If LogicalShift - set to BitWidth (special case). + // If ArithmeticShift - set to (BitWidth - 1) (sign splat). 
+ APInt ShiftVal = COp->getValue(); + if (ShiftVal.uge(BitWidth)) { + AnyOutOfRange = LogicalShift; + ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); + continue; + } + + ShiftAmts.push_back((int)ShiftVal.getZExtValue()); + } + + // If all elements out of range or UNDEF, return vector of zeros/undefs. + // ArithmeticShift should only hit this if they are all UNDEF. + auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; + if (llvm::all_of(ShiftAmts, OutOfRange)) { + SmallVector<Constant *, 8> ConstantVec; + for (int Idx : ShiftAmts) { + if (Idx < 0) { + ConstantVec.push_back(UndefValue::get(SVT)); + } else { + assert(LogicalShift && "Logical shift expected"); + ConstantVec.push_back(ConstantInt::getNullValue(SVT)); + } + } + return ConstantVector::get(ConstantVec); + } + + // We can't handle only some out of range values with generic logical shifts. + if (AnyOutOfRange) + return nullptr; + + // Build the shift amount constant vector. + SmallVector<Constant *, 8> ShiftVecAmts; + for (int Idx : ShiftAmts) { + if (Idx < 0) + ShiftVecAmts.push_back(UndefValue::get(SVT)); + else + ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); + } + auto ShiftVec = ConstantVector::get(ShiftVecAmts); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +static Value *simplifyX86pack(IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, bool IsSigned) { + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + Type *ResTy = II.getType(); + + // Fast all undef handling. + if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) + return UndefValue::get(ResTy); + + auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); + unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; + unsigned NumSrcElts = ArgTy->getNumElements(); + assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) && + "Unexpected packing types"); + + unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; + unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); + unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); + assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && + "Unexpected packing types"); + + // Constant folding. + if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) + return nullptr; + + // Clamp Values - signed/unsigned both use signed clamp values, but they + // differ on the min/max values. + APInt MinValue, MaxValue; + if (IsSigned) { + // PACKSS: Truncate signed value with signed saturation. + // Source values less than dst minint are saturated to minint. + // Source values greater than dst maxint are saturated to maxint. + MinValue = + APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); + MaxValue = + APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); + } else { + // PACKUS: Truncate signed value with unsigned saturation. + // Source values less than zero are saturated to zero. + // Source values greater than dst maxuint are saturated to maxuint. 
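
The clamp constants set up around this point implement the usual saturating truncation: PACKSS saturates a signed source into the signed range of the narrower element, PACKUS saturates it into the unsigned range. A per-element sketch of that clamp-then-truncate for the 16-to-8-bit case (standalone illustration, not the IR the builder emits):

// Standalone sketch of PACKSS/PACKUS per-element saturation.
#include <cstdint>
#include <cstdio>

static int8_t packssElt(int16_t V) {  // signed saturation
  if (V < INT8_MIN) return INT8_MIN;
  if (V > INT8_MAX) return INT8_MAX;
  return (int8_t)V;
}

static uint8_t packusElt(int16_t V) { // unsigned saturation of a signed source
  if (V < 0) return 0;
  if (V > UINT8_MAX) return UINT8_MAX;
  return (uint8_t)V;
}

int main() {
  std::printf("%d %d %d %d\n", packssElt(300), packssElt(-300),
              packusElt(300), packusElt(-300));
  // prints: 127 -128 255 0
  return 0;
}
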
+ MinValue = APInt::getNullValue(SrcScalarSizeInBits); + MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); + } + + auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); + auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); + Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); + Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); + Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); + Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); + + // Shuffle clamped args together at the lane level. + SmallVector<int, 32> PackMask; + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) + PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); + for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) + PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); + } + auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); + + // Truncate to dst size. + return Builder.CreateTrunc(Shuffle, ResTy); +} + +static Value *simplifyX86movmsk(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Value *Arg = II.getArgOperand(0); + Type *ResTy = II.getType(); + + // movmsk(undef) -> zero as we must ensure the upper bits are zero. + if (isa<UndefValue>(Arg)) + return Constant::getNullValue(ResTy); + + auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType()); + // We can't easily peek through x86_mmx types. + if (!ArgTy) + return nullptr; + + // Expand MOVMSK to compare/bitcast/zext: + // e.g. PMOVMSKB(v16i8 x): + // %cmp = icmp slt <16 x i8> %x, zeroinitializer + // %int = bitcast <16 x i1> %cmp to i16 + // %res = zext i16 %int to i32 + unsigned NumElts = ArgTy->getNumElements(); + Type *IntegerVecTy = VectorType::getInteger(ArgTy); + Type *IntegerTy = Builder.getIntNTy(NumElts); + + Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); + Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); + Res = Builder.CreateBitCast(Res, IntegerTy); + Res = Builder.CreateZExtOrTrunc(Res, ResTy); + return Res; +} + +static Value *simplifyX86addcarry(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Value *CarryIn = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + Value *Op2 = II.getArgOperand(2); + Type *RetTy = II.getType(); + Type *OpTy = Op1->getType(); + assert(RetTy->getStructElementType(0)->isIntegerTy(8) && + RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && + "Unexpected types for x86 addcarry"); + + // If carry-in is zero, this is just an unsigned add with overflow. + if (match(CarryIn, PatternMatch::m_ZeroInt())) { + Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, + {Op1, Op2}); + // The types have to be adjusted to match the x86 call types. 
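
The carry-in == 0 fold here is justified by the scalar semantics of addcarry: with no incoming carry, the 8-bit carry-out is exactly the unsigned overflow of the addition, which is what llvm.uadd.with.overflow computes. A standalone sketch with an invented struct name for the result pair:

// Standalone sketch of x86 addcarry semantics; with CarryIn == 0 it reduces
// to a plain unsigned add plus its overflow flag.
#include <cstdint>
#include <cstdio>

struct AddCarryResult {
  uint8_t CarryOut;
  uint64_t Sum;
};

static AddCarryResult addcarry64(uint8_t CarryIn, uint64_t A, uint64_t B) {
  uint64_t Sum = A + B;
  uint8_t Carry = Sum < A;      // unsigned overflow of A + B
  Sum += CarryIn ? 1 : 0;
  Carry |= CarryIn && Sum == 0; // carry produced by adding the incoming carry
  return {Carry, Sum};
}

int main() {
  AddCarryResult R = addcarry64(0, ~0ULL, 1);
  std::printf("%d %llu\n", R.CarryOut, (unsigned long long)R.Sum);
  // prints: 1 0
  return 0;
}
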
+ Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); + Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), + Builder.getInt8Ty()); + Value *Res = UndefValue::get(RetTy); + Res = Builder.CreateInsertValue(Res, UAddOV, 0); + return Builder.CreateInsertValue(Res, UAddResult, 1); + } + + return nullptr; +} + +static Value *simplifyX86insertps(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); + if (!CInt) + return nullptr; + + auto *VecTy = cast<FixedVectorType>(II.getType()); + assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); + + // The immediate permute control byte looks like this: + // [3:0] - zero mask for each 32-bit lane + // [5:4] - select one 32-bit destination lane + // [7:6] - select one 32-bit source lane + + uint8_t Imm = CInt->getZExtValue(); + uint8_t ZMask = Imm & 0xf; + uint8_t DestLane = (Imm >> 4) & 0x3; + uint8_t SourceLane = (Imm >> 6) & 0x3; + + ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); + + // If all zero mask bits are set, this was just a weird way to + // generate a zero vector. + if (ZMask == 0xf) + return ZeroVector; + + // Initialize by passing all of the first source bits through. + int ShuffleMask[4] = {0, 1, 2, 3}; + + // We may replace the second operand with the zero vector. + Value *V1 = II.getArgOperand(1); + + if (ZMask) { + // If the zero mask is being used with a single input or the zero mask + // overrides the destination lane, this is a shuffle with the zero vector. + if ((II.getArgOperand(0) == II.getArgOperand(1)) || + (ZMask & (1 << DestLane))) { + V1 = ZeroVector; + // We may still move 32-bits of the first source vector from one lane + // to another. + ShuffleMask[DestLane] = SourceLane; + // The zero mask may override the previous insert operation. + for (unsigned i = 0; i < 4; ++i) + if ((ZMask >> i) & 0x1) + ShuffleMask[i] = i + 4; + } else { + // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? + return nullptr; + } + } else { + // Replace the selected destination lane with the selected source lane. + ShuffleMask[DestLane] = SourceLane + 4; + } + + return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); +} + +/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding +/// or conversion to a shuffle vector. +static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, + ConstantInt *CILength, ConstantInt *CIIndex, + InstCombiner::BuilderTy &Builder) { + auto LowConstantHighUndef = [&](uint64_t Val) { + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + }; + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + ConstantInt *CI0 = + C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + + // Attempt to constant fold. + if (CILength && CIIndex) { + // From AMD documentation: "The bit index and field length are each six + // bits in length other bits of the field are ignored." + APInt APIndex = CIIndex->getValue().zextOrTrunc(6); + APInt APLength = CILength->getValue().zextOrTrunc(6); + + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 
64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize EXTRQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + auto *ShufTy = FixedVectorType::get(IntTy8, 16); + + SmallVector<int, 16> ShuffleMask; + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back(i + Index); + for (int i = Length; i != 8; ++i) + ShuffleMask.push_back(i + 16); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(-1); + + Value *SV = Builder.CreateShuffleVector( + Builder.CreateBitCast(Op0, ShufTy), + ConstantAggregateZero::get(ShufTy), ShuffleMask); + return Builder.CreateBitCast(SV, II.getType()); + } + + // Constant Fold - shift Index'th bit to lowest position and mask off + // Length bits. + if (CI0) { + APInt Elt = CI0->getValue(); + Elt.lshrInPlace(Index); + Elt = Elt.zextOrTrunc(Length); + return LowConstantHighUndef(Elt.getZExtValue()); + } + + // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { + Value *Args[] = {Op0, CILength, CIIndex}; + Module *M = II.getModule(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); + return Builder.CreateCall(F, Args); + } + } + + // Constant Fold - extraction from zero is always {zero, undef}. + if (CI0 && CI0->isZero()) + return LowConstantHighUndef(0); + + return nullptr; +} + +/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant +/// folding or conversion to a shuffle vector. +static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, + APInt APLength, APInt APIndex, + InstCombiner::BuilderTy &Builder) { + // From AMD documentation: "The bit index and field length are each six bits + // in length other bits of the field are ignored." + APIndex = APIndex.zextOrTrunc(6); + APLength = APLength.zextOrTrunc(6); + + // Attempt to constant fold. + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize INSERTQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. 
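
The EXTRQ/EXTRQI constant fold above boils down to shifting the Index'th bit to position 0 and masking off Length bits, with a length of 0 meaning 64 and index + length > 64 undefined per the AMD documentation quoted in the comments. A minimal standalone sketch of that extract (invented helper name):

// Standalone sketch of the scalar bit-field extract behind the EXTRQ fold.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t extrqScalar(uint64_t Src, unsigned Index, unsigned Length) {
  Index &= 0x3f;
  Length &= 0x3f;
  if (Length == 0)
    Length = 64; // length 0 is defined as 64
  assert(Index + Length <= 64 && "undefined per the AMD documentation");
  uint64_t Shifted = Src >> Index;
  return Length == 64 ? Shifted : (Shifted & ((1ULL << Length) - 1));
}

int main() {
  // Extract 8 bits starting at bit 16 of the low 64-bit element.
  std::printf("%llx\n",
              (unsigned long long)extrqScalar(0x00000000BEEF0000ULL, 16, 8));
  // prints: ef
  return 0;
}
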
+ Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + auto *ShufTy = FixedVectorType::get(IntTy8, 16); + + SmallVector<int, 16> ShuffleMask; + for (int i = 0; i != (int)Index; ++i) + ShuffleMask.push_back(i); + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back(i + 16); + for (int i = Index + Length; i != 8; ++i) + ShuffleMask.push_back(i); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(-1); + + Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), + Builder.CreateBitCast(Op1, ShufTy), + ShuffleMask); + return Builder.CreateBitCast(SV, II.getType()); + } + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI00 = + C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CI10 = + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + + // Constant Fold - insert bottom Length bits starting at the Index'th bit. + if (CI00 && CI10) { + APInt V00 = CI00->getValue(); + APInt V10 = CI10->getValue(); + APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); + V00 = V00 & ~Mask; + V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); + APInt Val = V00 | V10; + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + } + + // If we were an INSERTQ call, we'll save demanded elements if we convert to + // INSERTQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Constant *CILength = ConstantInt::get(IntTy8, Length, false); + Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); + + Value *Args[] = {Op0, Op1, CILength, CIIndex}; + Module *M = II.getModule(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + return Builder.CreateCall(F, Args); + } + + return nullptr; +} + +/// Attempt to convert pshufb* to shufflevector if the mask is constant. +static Value *simplifyX86pshufb(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast<FixedVectorType>(II.getType()); + unsigned NumElts = VecTy->getNumElements(); + assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && + "Unexpected number of elements in shuffle mask!"); + + // Construct a shuffle mask from constant integers or UNDEFs. + int Indexes[64]; + + // Each byte in the shuffle control mask forms an index to permute the + // corresponding byte in the destination operand. + for (unsigned I = 0; I < NumElts; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) + return nullptr; + + if (isa<UndefValue>(COp)) { + Indexes[I] = -1; + continue; + } + + int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); + + // If the most significant bit (bit[7]) of each byte of the shuffle + // control mask is set, then zero is written in the result byte. + // The zero vector is in the right-hand side of the resulting + // shufflevector. + + // The value of each index for the high 128-bit lane is the least + // significant 4 bits of the respective shuffle control byte. + Index = ((Index < 0) ? 
NumElts : Index & 0x0F) + (I & 0xF0); + Indexes[I] = Index; + } + + auto V1 = II.getArgOperand(0); + auto V2 = Constant::getNullValue(VecTy); + return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts)); +} + +/// Attempt to convert vpermilvar* to shufflevector if the mask is constant. +static Value *simplifyX86vpermilvar(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast<FixedVectorType>(II.getType()); + unsigned NumElts = VecTy->getNumElements(); + bool IsPD = VecTy->getScalarType()->isDoubleTy(); + unsigned NumLaneElts = IsPD ? 2 : 4; + assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); + + // Construct a shuffle mask from constant integers or UNDEFs. + int Indexes[16]; + + // The intrinsics only read one or two bits, clear the rest. + for (unsigned I = 0; I < NumElts; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) + return nullptr; + + if (isa<UndefValue>(COp)) { + Indexes[I] = -1; + continue; + } + + APInt Index = cast<ConstantInt>(COp)->getValue(); + Index = Index.zextOrTrunc(32).getLoBits(2); + + // The PD variants uses bit 1 to select per-lane element index, so + // shift down to convert to generic shuffle mask index. + if (IsPD) + Index.lshrInPlace(1); + + // The _256 variants are a bit trickier since the mask bits always index + // into the corresponding 128 half. In order to convert to a generic + // shuffle, we have to make that explicit. + Index += APInt(32, (I / NumLaneElts) * NumLaneElts); + + Indexes[I] = Index.getZExtValue(); + } + + auto V1 = II.getArgOperand(0); + return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts)); +} + +/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. +static Value *simplifyX86vpermv(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + auto *V = dyn_cast<Constant>(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast<FixedVectorType>(II.getType()); + unsigned Size = VecTy->getNumElements(); + assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && + "Unexpected shuffle mask size"); + + // Construct a shuffle mask from constant integers or UNDEFs. + int Indexes[64]; + + for (unsigned I = 0; I < Size; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) + return nullptr; + + if (isa<UndefValue>(COp)) { + Indexes[I] = -1; + continue; + } + + uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); + Index &= Size - 1; + Indexes[I] = Index; + } + + auto V1 = II.getArgOperand(0); + return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size)); +} + +Optional<Instruction *> +X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width, + unsigned DemandedWidth) { + APInt UndefElts(Width, 0); + APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); + return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); + }; + + Intrinsic::ID IID = II.getIntrinsicID(); + switch (IID) { + case Intrinsic::x86_bmi_bextr_32: + case Intrinsic::x86_bmi_bextr_64: + case Intrinsic::x86_tbm_bextri_u32: + case Intrinsic::x86_tbm_bextri_u64: + // If the RHS is a constant we can try some simplifications. 
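
The BEXTR/BEXTRI fold that follows decodes the control operand as the start bit in bits [7:0] and the field length in bits [15:8], returns zero for a zero length or an out-of-range start, and clamps the length to the operand width. A standalone 32-bit sketch of that decoding (illustrative only):

// Standalone sketch of the BEXTR semantics mirrored by the constant fold below.
#include <cstdint>
#include <cstdio>

static uint32_t bextr32(uint32_t Src, uint32_t Control) {
  unsigned Start = Control & 0xff;         // bits [7:0]: start bit
  unsigned Length = (Control >> 8) & 0xff; // bits [15:8]: field length
  if (Length == 0 || Start >= 32)
    return 0;
  if (Length > 32)
    Length = 32;                           // clamp to the operand width
  uint32_t Field = Src >> Start;
  return Length >= 32 ? Field : (Field & ((1u << Length) - 1));
}

int main() {
  // Extract 4 bits starting at bit 8: (0xabcd >> 8) & 0xf == 0xb.
  std::printf("%x\n", bextr32(0xabcd, (4u << 8) | 8u));
  return 0;
}
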
+ if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { + uint64_t Shift = C->getZExtValue(); + uint64_t Length = (Shift >> 8) & 0xff; + Shift &= 0xff; + unsigned BitWidth = II.getType()->getIntegerBitWidth(); + // If the length is 0 or the shift is out of range, replace with zero. + if (Length == 0 || Shift >= BitWidth) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + // If the LHS is also a constant, we can completely constant fold this. + if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { + uint64_t Result = InC->getZExtValue() >> Shift; + if (Length > BitWidth) + Length = BitWidth; + Result &= maskTrailingOnes<uint64_t>(Length); + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we + // are only masking bits that a shift already cleared? + } + break; + + case Intrinsic::x86_bmi_bzhi_32: + case Intrinsic::x86_bmi_bzhi_64: + // If the RHS is a constant we can try some simplifications. + if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { + uint64_t Index = C->getZExtValue() & 0xff; + unsigned BitWidth = II.getType()->getIntegerBitWidth(); + if (Index >= BitWidth) { + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + if (Index == 0) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + // If the LHS is also a constant, we can completely constant fold this. + if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { + uint64_t Result = InC->getZExtValue(); + Result &= maskTrailingOnes<uint64_t>(Index); + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + // TODO should we convert this to an AND if the RHS is constant? + } + break; + case Intrinsic::x86_bmi_pext_32: + case Intrinsic::x86_bmi_pext_64: + if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { + if (MaskC->isNullValue()) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + if (MaskC->isAllOnesValue()) { + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + + if (MaskC->getValue().isShiftedMask()) { + // any single contingous sequence of 1s anywhere in the mask simply + // describes a subset of the input bits shifted to the appropriate + // position. Replace with the straight forward IR. + unsigned ShiftAmount = MaskC->getValue().countTrailingZeros(); + Value *Input = II.getArgOperand(0); + Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1)); + Value *Shifted = IC.Builder.CreateLShr(Masked, + ConstantInt::get(II.getType(), + ShiftAmount)); + return IC.replaceInstUsesWith(II, Shifted); + } + + + if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToSet = 1; + + while (Mask) { + // Isolate lowest set bit. + uint64_t BitToTest = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToSet <<= 1; + // Clear lowest set bit. 
+ Mask &= Mask - 1; + } + + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + } + break; + case Intrinsic::x86_bmi_pdep_32: + case Intrinsic::x86_bmi_pdep_64: + if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { + if (MaskC->isNullValue()) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + if (MaskC->isAllOnesValue()) { + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + if (MaskC->getValue().isShiftedMask()) { + // any single contingous sequence of 1s anywhere in the mask simply + // describes a subset of the input bits shifted to the appropriate + // position. Replace with the straight forward IR. + unsigned ShiftAmount = MaskC->getValue().countTrailingZeros(); + Value *Input = II.getArgOperand(0); + Value *Shifted = IC.Builder.CreateShl(Input, + ConstantInt::get(II.getType(), + ShiftAmount)); + Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1)); + return IC.replaceInstUsesWith(II, Masked); + } + + if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToTest = 1; + + while (Mask) { + // Isolate lowest set bit. + uint64_t BitToSet = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToTest <<= 1; + // Clear lowest set bit; + Mask &= Mask - 1; + } + + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + } + break; + + case Intrinsic::x86_sse_cvtss2si: + case Intrinsic::x86_sse_cvtss2si64: + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse2_cvtsd2si: + case Intrinsic::x86_sse2_cvtsd2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: + case Intrinsic::x86_avx512_vcvtss2si32: + case Intrinsic::x86_avx512_vcvtss2si64: + case Intrinsic::x86_avx512_vcvtss2usi32: + case Intrinsic::x86_avx512_vcvtss2usi64: + case Intrinsic::x86_avx512_vcvtsd2si32: + case Intrinsic::x86_avx512_vcvtsd2si64: + case Intrinsic::x86_avx512_vcvtsd2usi32: + case Intrinsic::x86_avx512_vcvtsd2usi64: + case Intrinsic::x86_avx512_cvttss2si: + case Intrinsic::x86_avx512_cvttss2si64: + case Intrinsic::x86_avx512_cvttss2usi: + case Intrinsic::x86_avx512_cvttss2usi64: + case Intrinsic::x86_avx512_cvttsd2si: + case Intrinsic::x86_avx512_cvttsd2si64: + case Intrinsic::x86_avx512_cvttsd2usi: + case Intrinsic::x86_avx512_cvttsd2usi64: { + // These intrinsics only demand the 0th element of their input vectors. If + // we can simplify the input based on that, do so now. 
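
The PEXT/PDEP folds above use exactly these bit-by-bit loops for full constant folding, and the shifted-mask shortcut they replace with plain IR follows from them: for a mask that is a single contiguous run of ones, PEXT is (Src & Mask) >> tz and PDEP is (Src << tz) & Mask, where tz is the number of trailing zeros in the mask. A standalone sketch (invented helper names):

// Standalone sketch of the PEXT/PDEP bit loops and the contiguous-mask shortcut.
#include <cstdint>
#include <cstdio>

static uint64_t pext64(uint64_t Src, uint64_t Mask) {
  uint64_t Result = 0, BitToSet = 1;
  while (Mask) {
    uint64_t BitToTest = Mask & -Mask; // isolate lowest set bit of the mask
    if (Src & BitToTest)
      Result |= BitToSet;
    BitToSet <<= 1;
    Mask &= Mask - 1;                  // clear lowest set bit
  }
  return Result;
}

static uint64_t pdep64(uint64_t Src, uint64_t Mask) {
  uint64_t Result = 0, BitToTest = 1;
  while (Mask) {
    uint64_t BitToSet = Mask & -Mask;  // isolate lowest set bit of the mask
    if (Src & BitToTest)
      Result |= BitToSet;
    BitToTest <<= 1;
    Mask &= Mask - 1;                  // clear lowest set bit
  }
  return Result;
}

int main() {
  uint64_t Mask = 0x0ff0;              // one contiguous run of ones, tz = 4
  uint64_t Src = 0xabcd;
  std::printf("%d %d\n",
              pext64(Src, Mask) == ((Src & Mask) >> 4),
              pdep64(Src, Mask) == ((Src << 4) & Mask));
  // prints: 1 1
  return 0;
}
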
+ Value *Arg = II.getArgOperand(0); + unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { + return IC.replaceOperand(II, 0, V); + } + break; + } + + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_avx2_pmovmskb: + if (Value *V = simplifyX86movmsk(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse_comieq_ss: + case Intrinsic::x86_sse_comige_ss: + case Intrinsic::x86_sse_comigt_ss: + case Intrinsic::x86_sse_comile_ss: + case Intrinsic::x86_sse_comilt_ss: + case Intrinsic::x86_sse_comineq_ss: + case Intrinsic::x86_sse_ucomieq_ss: + case Intrinsic::x86_sse_ucomige_ss: + case Intrinsic::x86_sse_ucomigt_ss: + case Intrinsic::x86_sse_ucomile_ss: + case Intrinsic::x86_sse_ucomilt_ss: + case Intrinsic::x86_sse_ucomineq_ss: + case Intrinsic::x86_sse2_comieq_sd: + case Intrinsic::x86_sse2_comige_sd: + case Intrinsic::x86_sse2_comigt_sd: + case Intrinsic::x86_sse2_comile_sd: + case Intrinsic::x86_sse2_comilt_sd: + case Intrinsic::x86_sse2_comineq_sd: + case Intrinsic::x86_sse2_ucomieq_sd: + case Intrinsic::x86_sse2_ucomige_sd: + case Intrinsic::x86_sse2_ucomigt_sd: + case Intrinsic::x86_sse2_ucomile_sd: + case Intrinsic::x86_sse2_ucomilt_sd: + case Intrinsic::x86_sse2_ucomineq_sd: + case Intrinsic::x86_avx512_vcomi_ss: + case Intrinsic::x86_avx512_vcomi_sd: + case Intrinsic::x86_avx512_mask_cmp_ss: + case Intrinsic::x86_avx512_mask_cmp_sd: { + // These intrinsics only demand the 0th element of their input vectors. If + // we can simplify the input based on that, do so now. + bool MadeChange = false; + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + if (MadeChange) { + return &II; + } + break; + } + + case Intrinsic::x86_avx512_add_ps_512: + case Intrinsic::x86_avx512_div_ps_512: + case Intrinsic::x86_avx512_mul_ps_512: + case Intrinsic::x86_avx512_sub_ps_512: + case Intrinsic::x86_avx512_add_pd_512: + case Intrinsic::x86_avx512_div_pd_512: + case Intrinsic::x86_avx512_mul_pd_512: + case Intrinsic::x86_avx512_sub_pd_512: + // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular + // IR operations. 
+ if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) { + if (R->getValue() == 4) { + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + + Value *V; + switch (IID) { + default: + llvm_unreachable("Case stmts out of sync!"); + case Intrinsic::x86_avx512_add_ps_512: + case Intrinsic::x86_avx512_add_pd_512: + V = IC.Builder.CreateFAdd(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_sub_ps_512: + case Intrinsic::x86_avx512_sub_pd_512: + V = IC.Builder.CreateFSub(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_mul_ps_512: + case Intrinsic::x86_avx512_mul_pd_512: + V = IC.Builder.CreateFMul(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_div_ps_512: + case Intrinsic::x86_avx512_div_pd_512: + V = IC.Builder.CreateFDiv(Arg0, Arg1); + break; + } + + return IC.replaceInstUsesWith(II, V); + } + } + break; + + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular + // IR operations. + if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) { + if (R->getValue() == 4) { + // Extract the element as scalars. + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); + Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); + + Value *V; + switch (IID) { + default: + llvm_unreachable("Case stmts out of sync!"); + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + V = IC.Builder.CreateFAdd(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + V = IC.Builder.CreateFSub(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + V = IC.Builder.CreateFMul(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + V = IC.Builder.CreateFDiv(LHS, RHS); + break; + } + + // Handle the masking aspect of the intrinsic. + Value *Mask = II.getArgOperand(3); + auto *C = dyn_cast<ConstantInt>(Mask); + // We don't need a select if we know the mask bit is a 1. + if (!C || !C->getValue()[0]) { + // Cast the mask to an i1 vector and then extract the lowest element. + auto *MaskTy = FixedVectorType::get( + IC.Builder.getInt1Ty(), + cast<IntegerType>(Mask->getType())->getBitWidth()); + Mask = IC.Builder.CreateBitCast(Mask, MaskTy); + Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0); + // Extract the lowest element from the passthru operand. + Value *Passthru = + IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0); + V = IC.Builder.CreateSelect(Mask, V, Passthru); + } + + // Insert the result back into the original argument 0. + V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0); + + return IC.replaceInstUsesWith(II, V); + } + } + break; + + // Constant fold ashr( <A x Bi>, Ci ). + // Constant fold lshr( <A x Bi>, Ci ). + // Constant fold shl( <A x Bi>, Ci ). 
+ case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psrai_q_256: + case Intrinsic::x86_avx512_psrai_d_512: + case Intrinsic::x86_avx512_psrai_q_512: + case Intrinsic::x86_avx512_psrai_w_512: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx512_psrli_d_512: + case Intrinsic::x86_avx512_psrli_q_512: + case Intrinsic::x86_avx512_psrli_w_512: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx512_pslli_d_512: + case Intrinsic::x86_avx512_pslli_q_512: + case Intrinsic::x86_avx512_pslli_w_512: + if (Value *V = simplifyX86immShift(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: { + if (Value *V = simplifyX86immShift(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + + // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector + // operand to compute the shift amount. 
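+ // e.g. for psrl.d the hardware reads the count from bits [63:0] of the
+ // second operand, so only the low half of that vector's elements is
+ // demanded below and the upper elements may be simplified away.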
+ Value *Arg1 = II.getArgOperand(1); + assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && + "Unexpected packed shift size"); + unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements(); + + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { + return IC.replaceOperand(II, 1, V); + } + break; + } + + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx512_psllv_d_512: + case Intrinsic::x86_avx512_psllv_q_512: + case Intrinsic::x86_avx512_psllv_w_128: + case Intrinsic::x86_avx512_psllv_w_256: + case Intrinsic::x86_avx512_psllv_w_512: + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + case Intrinsic::x86_avx512_psrav_q_128: + case Intrinsic::x86_avx512_psrav_q_256: + case Intrinsic::x86_avx512_psrav_d_512: + case Intrinsic::x86_avx512_psrav_q_512: + case Intrinsic::x86_avx512_psrav_w_128: + case Intrinsic::x86_avx512_psrav_w_256: + case Intrinsic::x86_avx512_psrav_w_512: + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx512_psrlv_d_512: + case Intrinsic::x86_avx512_psrlv_q_512: + case Intrinsic::x86_avx512_psrlv_w_128: + case Intrinsic::x86_avx512_psrlv_w_256: + case Intrinsic::x86_avx512_psrlv_w_512: + if (Value *V = simplifyX86varShift(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + case Intrinsic::x86_avx512_packssdw_512: + case Intrinsic::x86_avx512_packsswb_512: + if (Value *V = simplifyX86pack(II, IC.Builder, true)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx512_packusdw_512: + case Intrinsic::x86_avx512_packuswb_512: + if (Value *V = simplifyX86pack(II, IC.Builder, false)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_pclmulqdq: + case Intrinsic::x86_pclmulqdq_256: + case Intrinsic::x86_pclmulqdq_512: { + if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) { + unsigned Imm = C->getZExtValue(); + + bool MadeChange = false; + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + unsigned VWidth = + cast<FixedVectorType>(Arg0->getType())->getNumElements(); + + APInt UndefElts1(VWidth, 0); + APInt DemandedElts1 = + APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1)); + if (Value *V = + IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + + APInt UndefElts2(VWidth, 0); + APInt DemandedElts2 = + APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); + if (Value *V = + IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + + // If either input elements are undef, the result is zero. 
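+ // Illustrative reading of the immediate: bit 0 selects which i64 half of
+ // each 128-bit lane of operand 0 is multiplied (bit set -> high half), and
+ // bit 4 does the same for operand 1, which is why the demanded-elements
+ // masks above splat either 0b01 or 0b10 per lane. An undef selected half
+ // may be treated as zero, and a carry-less multiply by zero is zero.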
+ if (DemandedElts1.isSubsetOf(UndefElts1) || + DemandedElts2.isSubsetOf(UndefElts2)) { + return IC.replaceInstUsesWith(II, + ConstantAggregateZero::get(II.getType())); + } + + if (MadeChange) { + return &II; + } + } + break; + } + + case Intrinsic::x86_sse41_insertps: + if (Value *V = simplifyX86insertps(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse4a_extrq: { + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); + unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 16 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CILength = + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CIIndex = + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or EXTRQI call. + if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + + // EXTRQ only uses the lowest 64-bits of the first 128-bit vector + // operands and the lowest 16-bits of the second. + bool MadeChange = false; + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + if (MadeChange) { + return &II; + } + break; + } + + case Intrinsic::x86_sse4a_extrqi: { + // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining + // bits of the lower 64-bits. The upper 64-bits are undefined. + Value *Op0 = II.getArgOperand(0); + unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2)); + + // Attempt to simplify to a constant or shuffle vector. + if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + + // EXTRQI only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + return IC.replaceOperand(II, 0, V); + } + break; + } + + case Intrinsic::x86_sse4a_insertq: { + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI11 = + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 
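+ // Illustrative encoding (value chosen for exposition): the upper i64 of
+ // the second operand packs the field description, bits [5:0] = length and
+ // bits [13:8] = bit index, so V11 == 0x0408 means "insert 8 bits starting
+ // at bit 4", which is what Len and Idx below extract.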
+ if (CI11) { + const APInt &V11 = CI11->getValue(); + APInt Len = V11.zextOrTrunc(6); + APInt Idx = V11.lshr(8).zextOrTrunc(6); + if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + } + + // INSERTQ only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + return IC.replaceOperand(II, 0, V); + } + break; + } + + case Intrinsic::x86_sse4a_insertqi: { + // INSERTQI: Extract lowest Length bits from lower half of second source and + // insert over first source starting at Index bit. The upper 64-bits are + // undefined. + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); + unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 2 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3)); + + // Attempt to simplify to a constant or shuffle vector. + if (CILength && CIIndex) { + APInt Len = CILength->getValue().zextOrTrunc(6); + APInt Idx = CIIndex->getValue().zextOrTrunc(6); + if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + } + + // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector + // operands. + bool MadeChange = false; + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + if (MadeChange) { + return &II; + } + break; + } + + case Intrinsic::x86_sse41_pblendvb: + case Intrinsic::x86_sse41_blendvps: + case Intrinsic::x86_sse41_blendvpd: + case Intrinsic::x86_avx_blendv_ps_256: + case Intrinsic::x86_avx_blendv_pd_256: + case Intrinsic::x86_avx2_pblendvb: { + // fold (blend A, A, Mask) -> A + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + Value *Mask = II.getArgOperand(2); + if (Op0 == Op1) { + return IC.replaceInstUsesWith(II, Op0); + } + + // Zero Mask - select 1st argument. + if (isa<ConstantAggregateZero>(Mask)) { + return IC.replaceInstUsesWith(II, Op0); + } + + // Constant Mask - select 1st/2nd argument lane based on top bit of mask. + if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { + Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); + return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); + } + + // Convert to a vector select if we can bypass casts and find a boolean + // vector condition value. 
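+ // Illustrative pattern (value names are placeholders): a mask of the form
+ //   %m = sext <16 x i1> %cond to <16 x i8>
+ // (possibly behind a bitcast) lets the blend become
+ //   %r = select <16 x i1> %cond, <16 x i8> %op1, <16 x i8> %op0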
+ Value *BoolVec; + Mask = InstCombiner::peekThroughBitcast(Mask); + if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) && + BoolVec->getType()->isVectorTy() && + BoolVec->getType()->getScalarSizeInBits() == 1) { + assert(Mask->getType()->getPrimitiveSizeInBits() == + II.getType()->getPrimitiveSizeInBits() && + "Not expecting mask and operands with different sizes"); + + unsigned NumMaskElts = + cast<FixedVectorType>(Mask->getType())->getNumElements(); + unsigned NumOperandElts = + cast<FixedVectorType>(II.getType())->getNumElements(); + if (NumMaskElts == NumOperandElts) { + return SelectInst::Create(BoolVec, Op1, Op0); + } + + // If the mask has less elements than the operands, each mask bit maps to + // multiple elements of the operands. Bitcast back and forth. + if (NumMaskElts < NumOperandElts) { + Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType()); + Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType()); + Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0); + return new BitCastInst(Sel, II.getType()); + } + } + + break; + } + + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: + if (Value *V = simplifyX86pshufb(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_avx_vpermilvar_ps: + case Intrinsic::x86_avx_vpermilvar_ps_256: + case Intrinsic::x86_avx512_vpermilvar_ps_512: + case Intrinsic::x86_avx_vpermilvar_pd: + case Intrinsic::x86_avx_vpermilvar_pd_256: + case Intrinsic::x86_avx512_vpermilvar_pd_512: + if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: + case Intrinsic::x86_avx512_permvar_df_256: + case Intrinsic::x86_avx512_permvar_df_512: + case Intrinsic::x86_avx512_permvar_di_256: + case Intrinsic::x86_avx512_permvar_di_512: + case Intrinsic::x86_avx512_permvar_hi_128: + case Intrinsic::x86_avx512_permvar_hi_256: + case Intrinsic::x86_avx512_permvar_hi_512: + case Intrinsic::x86_avx512_permvar_qi_128: + case Intrinsic::x86_avx512_permvar_qi_256: + case Intrinsic::x86_avx512_permvar_qi_512: + case Intrinsic::x86_avx512_permvar_sf_512: + case Intrinsic::x86_avx512_permvar_si_512: + if (Value *V = simplifyX86vpermv(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_avx_maskload_ps: + case Intrinsic::x86_avx_maskload_pd: + case Intrinsic::x86_avx_maskload_ps_256: + case Intrinsic::x86_avx_maskload_pd_256: + case Intrinsic::x86_avx2_maskload_d: + case Intrinsic::x86_avx2_maskload_q: + case Intrinsic::x86_avx2_maskload_d_256: + case Intrinsic::x86_avx2_maskload_q_256: + if (Instruction *I = simplifyX86MaskedLoad(II, IC)) { + return I; + } + break; + + case Intrinsic::x86_sse2_maskmov_dqu: + case Intrinsic::x86_avx_maskstore_ps: + case Intrinsic::x86_avx_maskstore_pd: + case Intrinsic::x86_avx_maskstore_ps_256: + case Intrinsic::x86_avx_maskstore_pd_256: + case Intrinsic::x86_avx2_maskstore_d: + case Intrinsic::x86_avx2_maskstore_q: + case Intrinsic::x86_avx2_maskstore_d_256: + case Intrinsic::x86_avx2_maskstore_q_256: + if (simplifyX86MaskedStore(II, IC)) { + return nullptr; + } + break; + + case Intrinsic::x86_addcarry_32: + case Intrinsic::x86_addcarry_64: + if (Value *V = simplifyX86addcarry(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + default: + break; + } + return None; +} + +Optional<Value *> 
X86TTIImpl::simplifyDemandedUseBitsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) const { + switch (II.getIntrinsicID()) { + default: + break; + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_avx2_pmovmskb: { + // MOVMSK copies the vector elements' sign bits to the low bits + // and zeros the high bits. + unsigned ArgWidth; + if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { + ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. + } else { + auto Arg = II.getArgOperand(0); + auto ArgType = cast<FixedVectorType>(Arg->getType()); + ArgWidth = ArgType->getNumElements(); + } + + // If we don't need any of low bits then return zero, + // we know that DemandedMask is non-zero already. + APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); + Type *VTy = II.getType(); + if (DemandedElts.isNullValue()) { + return ConstantInt::getNullValue(VTy); + } + + // We know that the upper bits are set to zero. + Known.Zero.setBitsFrom(ArgWidth); + KnownBitsComputed = true; + break; + } + } + return None; +} + +Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function<void(Instruction *, unsigned, APInt, APInt &)> + simplifyAndSetOp) const { + unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements(); + switch (II.getIntrinsicID()) { + default: + break; + case Intrinsic::x86_xop_vfrcz_ss: + case Intrinsic::x86_xop_vfrcz_sd: + // The instructions for these intrinsics are speced to zero upper bits not + // pass them through like other scalar intrinsics. So we shouldn't just + // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. + // Instead we should return a zero vector. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return ConstantAggregateZero::get(II.getType()); + } + + // Only the lower element is used. + DemandedElts = 1; + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // Only the lower element is undefined. The high elements are zero. + UndefElts = UndefElts[0]; + break; + + // Unary scalar-as-vector operations that work column-wise. + case Intrinsic::x86_sse_rcp_ss: + case Intrinsic::x86_sse_rsqrt_ss: + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions + // checks). + break; + + // Binary scalar-as-vector operations that work column-wise. The high + // elements come from operand 0. The low element is a function of both + // operands. + case Intrinsic::x86_sse_min_ss: + case Intrinsic::x86_sse_max_ss: + case Intrinsic::x86_sse_cmp_ss: + case Intrinsic::x86_sse2_min_sd: + case Intrinsic::x86_sse2_max_sd: + case Intrinsic::x86_sse2_cmp_sd: { + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + + // Only lower element is used for operand 1. 
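+ // e.g. for min.ss only lane 0 of operand 1 can reach the result (the upper
+ // lanes are passed through from operand 0), so operand 1 is re-queried
+ // below with a demanded-elements mask of just bit 0.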
+ DemandedElts = 1; + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + + // Lower element is undefined if both lower elements are undefined. + // Consider things like undef&0. The result is known zero, not undef. + if (!UndefElts2[0]) + UndefElts.clearBit(0); + + break; + } + + // Binary scalar-as-vector operations that work column-wise. The high + // elements come from operand 0 and the low element comes from operand 1. + case Intrinsic::x86_sse41_round_ss: + case Intrinsic::x86_sse41_round_sd: { + // Don't use the low element of operand 0. + APInt DemandedElts2 = DemandedElts; + DemandedElts2.clearBit(0); + simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + + // Only lower element is used for operand 1. + DemandedElts = 1; + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + + // Take the high undef elements from operand 0 and take the lower element + // from operand 1. + UndefElts.clearBit(0); + UndefElts |= UndefElts2[0]; + break; + } + + // Three input scalar-as-vector operations that work column-wise. The high + // elements come from operand 0 and the low element is a function of all + // three inputs. + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_max_ss_round: + case Intrinsic::x86_avx512_mask_min_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + case Intrinsic::x86_avx512_mask_max_sd_round: + case Intrinsic::x86_avx512_mask_min_sd_round: + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + + // Only lower element is used for operand 1 and 2. + DemandedElts = 1; + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); + + // Lower element is undefined if all three lower elements are undefined. + // Consider things like undef&0. The result is known zero, not undef. + if (!UndefElts2[0] || !UndefElts3[0]) + UndefElts.clearBit(0); + break; + + // TODO: Add fmaddsub support? + case Intrinsic::x86_sse3_addsub_pd: + case Intrinsic::x86_sse3_addsub_ps: + case Intrinsic::x86_avx_addsub_pd_256: + case Intrinsic::x86_avx_addsub_ps_256: { + // If none of the even or none of the odd lanes are required, turn this + // into a generic FP math instruction. + APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1)); + APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2)); + bool IsSubOnly = DemandedElts.isSubsetOf(SubMask); + bool IsAddOnly = DemandedElts.isSubsetOf(AddMask); + if (IsSubOnly || IsAddOnly) { + assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only"); + IRBuilderBase::InsertPointGuard Guard(IC.Builder); + IC.Builder.SetInsertPoint(&II); + Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1); + return IC.Builder.CreateBinOp( + IsSubOnly ? 
Instruction::FSub : Instruction::FAdd, Arg0, Arg1); + } + + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + UndefElts &= UndefElts2; + break; + } + + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + case Intrinsic::x86_avx2_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx512_packssdw_512: + case Intrinsic::x86_avx512_packsswb_512: + case Intrinsic::x86_avx512_packusdw_512: + case Intrinsic::x86_avx512_packuswb_512: { + auto *Ty0 = II.getArgOperand(0)->getType(); + unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements(); + assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); + + unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; + unsigned VWidthPerLane = VWidth / NumLanes; + unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; + + // Per lane, pack the elements of the first input and then the second. + // e.g. + // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) + // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) + for (int OpNum = 0; OpNum != 2; ++OpNum) { + APInt OpDemandedElts(InnerVWidth, 0); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + unsigned LaneIdx = Lane * VWidthPerLane; + for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { + unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; + if (DemandedElts[Idx]) + OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); + } + } + + // Demand elements from the operand. + APInt OpUndefElts(InnerVWidth, 0); + simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); + + // Pack the operand's UNDEF elements, one lane at a time. + OpUndefElts = OpUndefElts.zext(VWidth); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); + LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); + LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); + UndefElts |= LaneElts; + } + } + break; + } + + // PSHUFB + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: + // PERMILVAR + case Intrinsic::x86_avx_vpermilvar_ps: + case Intrinsic::x86_avx_vpermilvar_ps_256: + case Intrinsic::x86_avx512_vpermilvar_ps_512: + case Intrinsic::x86_avx_vpermilvar_pd: + case Intrinsic::x86_avx_vpermilvar_pd_256: + case Intrinsic::x86_avx512_vpermilvar_pd_512: + // PERMV + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: { + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); + break; + } + + // SSE4A instructions leave the upper 64-bits of the 128-bit result + // in an undefined state. + case Intrinsic::x86_sse4a_extrq: + case Intrinsic::x86_sse4a_extrqi: + case Intrinsic::x86_sse4a_insertq: + case Intrinsic::x86_sse4a_insertqi: + UndefElts.setHighBits(VWidth / 2); + break; + } + return None; +} |