author      heretic <heretic@yandex-team.ru>              2022-02-10 16:45:43 +0300
committer   Daniil Cherednik <dcherednik@yandex-team.ru>  2022-02-10 16:45:43 +0300
commit      397cbe258b9e064f49c4ca575279f02f39fef76e (patch)
tree        a0b0eb3cca6a14e4e8ea715393637672fa651284 /contrib/libs/llvm12/lib/Target/X86
parent      43f5a35593ebc9f6bcea619bb170394ea7ae468e (diff)
download    ydb-397cbe258b9e064f49c4ca575279f02f39fef76e.tar.gz
Restoring authorship annotation for <heretic@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Target/X86')
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/.yandex_meta/licenses.list.txt               618
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/AsmParser/.yandex_meta/licenses.list.txt      14
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make                             18
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/Disassembler/.yandex_meta/licenses.list.txt   14
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make                          18
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/.yandex_meta/licenses.list.txt   14
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make                          18
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/README-FPStack.txt                           170
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/README-SSE.txt                              1656
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/README-X86-64.txt                            368
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/README.txt                                  3588
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/TargetInfo/.yandex_meta/licenses.list.txt     14
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make                            18
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/ya.make                                       22
14 files changed, 3275 insertions(+), 3275 deletions(-)
diff --git a/contrib/libs/llvm12/lib/Target/X86/.yandex_meta/licenses.list.txt b/contrib/libs/llvm12/lib/Target/X86/.yandex_meta/licenses.list.txt
index f08f43f1d8..92fbe1c084 100644
--- a/contrib/libs/llvm12/lib/Target/X86/.yandex_meta/licenses.list.txt
+++ b/contrib/libs/llvm12/lib/Target/X86/.yandex_meta/licenses.list.txt
@@ -1,309 +1,309 @@
-====================Apache-2.0 WITH LLVM-exception====================
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-
-
-====================Apache-2.0 WITH LLVM-exception====================
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-
-====================COPYRIGHT====================
- Trampoline->setComdat(C);
- BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline);
- IRBuilder<> Builder(EntryBB);
-
-
-====================File: LICENSE.TXT====================
-==============================================================================
-The LLVM Project is under the Apache License v2.0 with LLVM Exceptions:
-==============================================================================
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-
----- LLVM Exceptions to the Apache 2.0 License ----
-
-As an exception, if, as a result of your compiling your source code, portions
-of this Software are embedded into an Object form of such source code, you
-may redistribute such embedded portions in such Object form without complying
-with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
-
-In addition, if you combine or link compiled forms of this Software with
-software that is licensed under the GPLv2 ("Combined Software") and if a
-court of competent jurisdiction determines that the patent provision (Section
-3), the indemnity provision (Section 9) or other Section of the License
-conflicts with the conditions of the GPLv2, you may retroactively and
-prospectively choose to deem waived or otherwise exclude such Section(s) of
-the License, but only in their entirety and only with respect to the Combined
-Software.
-
-==============================================================================
-Software from third parties included in the LLVM Project:
-==============================================================================
-The LLVM Project contains third party software which is under different license
-terms. All such code will be identified clearly using at least one of two
-mechanisms:
-1) It will be in a separate directory tree with its own `LICENSE.txt` or
- `LICENSE` file at the top containing the specific license and restrictions
- which apply to that software, or
-2) It will contain specific license and restriction terms at the top of every
- file.
-
-==============================================================================
-Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy):
-==============================================================================
-University of Illinois/NCSA
-Open Source License
-
-Copyright (c) 2003-2019 University of Illinois at Urbana-Champaign.
-All rights reserved.
-
-Developed by:
-
- LLVM Team
-
- University of Illinois at Urbana-Champaign
-
- http://llvm.org
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal with
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimers.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimers in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the names of the LLVM Team, University of Illinois at
- Urbana-Champaign, nor the names of its contributors may be used to
- endorse or promote products derived from this Software without specific
- prior written permission.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
-SOFTWARE.
-
-
-
-====================File: include/llvm/Support/LICENSE.TXT====================
-LLVM System Interface Library
--------------------------------------------------------------------------------
-The LLVM System Interface Library is licensed under the Illinois Open Source
-License and has the following additional copyright:
-
-Copyright (C) 2004 eXtensible Systems, Inc.
-
-
-====================NCSA====================
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+====================Apache-2.0 WITH LLVM-exception====================
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+
+
+====================Apache-2.0 WITH LLVM-exception====================
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+
+====================COPYRIGHT====================
+ Trampoline->setComdat(C);
+ BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline);
+ IRBuilder<> Builder(EntryBB);
+
+
+====================File: LICENSE.TXT====================
+==============================================================================
+The LLVM Project is under the Apache License v2.0 with LLVM Exceptions:
+==============================================================================
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+
+---- LLVM Exceptions to the Apache 2.0 License ----
+
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into an Object form of such source code, you
+may redistribute such embedded portions in such Object form without complying
+with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
+
+In addition, if you combine or link compiled forms of this Software with
+software that is licensed under the GPLv2 ("Combined Software") and if a
+court of competent jurisdiction determines that the patent provision (Section
+3), the indemnity provision (Section 9) or other Section of the License
+conflicts with the conditions of the GPLv2, you may retroactively and
+prospectively choose to deem waived or otherwise exclude such Section(s) of
+the License, but only in their entirety and only with respect to the Combined
+Software.
+
+==============================================================================
+Software from third parties included in the LLVM Project:
+==============================================================================
+The LLVM Project contains third party software which is under different license
+terms. All such code will be identified clearly using at least one of two
+mechanisms:
+1) It will be in a separate directory tree with its own `LICENSE.txt` or
+ `LICENSE` file at the top containing the specific license and restrictions
+ which apply to that software, or
+2) It will contain specific license and restriction terms at the top of every
+ file.
+
+==============================================================================
+Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy):
+==============================================================================
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2003-2019 University of Illinois at Urbana-Champaign.
+All rights reserved.
+
+Developed by:
+
+ LLVM Team
+
+ University of Illinois at Urbana-Champaign
+
+ http://llvm.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimers.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimers in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the names of the LLVM Team, University of Illinois at
+ Urbana-Champaign, nor the names of its contributors may be used to
+ endorse or promote products derived from this Software without specific
+ prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+
+
+====================File: include/llvm/Support/LICENSE.TXT====================
+LLVM System Interface Library
+-------------------------------------------------------------------------------
+The LLVM System Interface Library is licensed under the Illinois Open Source
+License and has the following additional copyright:
+
+Copyright (C) 2004 eXtensible Systems, Inc.
+
+
+====================NCSA====================
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
diff --git a/contrib/libs/llvm12/lib/Target/X86/AsmParser/.yandex_meta/licenses.list.txt b/contrib/libs/llvm12/lib/Target/X86/AsmParser/.yandex_meta/licenses.list.txt
index c62d353021..a4433625d4 100644
--- a/contrib/libs/llvm12/lib/Target/X86/AsmParser/.yandex_meta/licenses.list.txt
+++ b/contrib/libs/llvm12/lib/Target/X86/AsmParser/.yandex_meta/licenses.list.txt
@@ -1,7 +1,7 @@
-====================Apache-2.0 WITH LLVM-exception====================
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-
-
-====================Apache-2.0 WITH LLVM-exception====================
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+====================Apache-2.0 WITH LLVM-exception====================
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+
+
+====================Apache-2.0 WITH LLVM-exception====================
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make b/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make
index f88283f4e5..c30cd4cf65 100644
--- a/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make
+++ b/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make
@@ -2,15 +2,15 @@
LIBRARY()
-OWNER(
- orivej
- g:cpp-contrib
-)
-
-LICENSE(Apache-2.0 WITH LLVM-exception)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-
+OWNER(
+ orivej
+ g:cpp-contrib
+)
+
+LICENSE(Apache-2.0 WITH LLVM-exception)
+
+LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
+
PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
diff --git a/contrib/libs/llvm12/lib/Target/X86/Disassembler/.yandex_meta/licenses.list.txt b/contrib/libs/llvm12/lib/Target/X86/Disassembler/.yandex_meta/licenses.list.txt
index c62d353021..a4433625d4 100644
--- a/contrib/libs/llvm12/lib/Target/X86/Disassembler/.yandex_meta/licenses.list.txt
+++ b/contrib/libs/llvm12/lib/Target/X86/Disassembler/.yandex_meta/licenses.list.txt
@@ -1,7 +1,7 @@
-====================Apache-2.0 WITH LLVM-exception====================
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-
-
-====================Apache-2.0 WITH LLVM-exception====================
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+====================Apache-2.0 WITH LLVM-exception====================
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+
+
+====================Apache-2.0 WITH LLVM-exception====================
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make b/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make
index b55833692f..d1c75366c1 100644
--- a/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make
+++ b/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make
@@ -2,15 +2,15 @@
LIBRARY()
-OWNER(
- orivej
- g:cpp-contrib
-)
-
-LICENSE(Apache-2.0 WITH LLVM-exception)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-
+OWNER(
+ orivej
+ g:cpp-contrib
+)
+
+LICENSE(Apache-2.0 WITH LLVM-exception)
+
+LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
+
PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/.yandex_meta/licenses.list.txt b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/.yandex_meta/licenses.list.txt
index c62d353021..a4433625d4 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/.yandex_meta/licenses.list.txt
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/.yandex_meta/licenses.list.txt
@@ -1,7 +1,7 @@
-====================Apache-2.0 WITH LLVM-exception====================
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-
-
-====================Apache-2.0 WITH LLVM-exception====================
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+====================Apache-2.0 WITH LLVM-exception====================
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+
+
+====================Apache-2.0 WITH LLVM-exception====================
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make
index 8da0d02f5b..565dda72f5 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make
@@ -2,15 +2,15 @@
LIBRARY()
-OWNER(
- orivej
- g:cpp-contrib
-)
-
-LICENSE(Apache-2.0 WITH LLVM-exception)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-
+OWNER(
+ orivej
+ g:cpp-contrib
+)
+
+LICENSE(Apache-2.0 WITH LLVM-exception)
+
+LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
+
PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
diff --git a/contrib/libs/llvm12/lib/Target/X86/README-FPStack.txt b/contrib/libs/llvm12/lib/Target/X86/README-FPStack.txt
index 39efd2dbcf..aab9759b35 100644
--- a/contrib/libs/llvm12/lib/Target/X86/README-FPStack.txt
+++ b/contrib/libs/llvm12/lib/Target/X86/README-FPStack.txt
@@ -1,85 +1,85 @@
-//===---------------------------------------------------------------------===//
-// Random ideas for the X86 backend: FP stack related stuff
-//===---------------------------------------------------------------------===//
-
-//===---------------------------------------------------------------------===//
-
-Some targets (e.g. athlons) prefer freep to fstp ST(0):
-http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
-
-//===---------------------------------------------------------------------===//
-
-This should use fiadd on chips where it is profitable:
-double foo(double P, int *I) { return P+*I; }
-
-We have fiadd patterns now but the followings have the same cost and
-complexity. We need a way to specify the later is more profitable.
-
-def FpADD32m : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
- [(set RFP:$dst, (fadd RFP:$src1,
- (extloadf64f32 addr:$src2)))]>;
- // ST(0) = ST(0) + [mem32]
-
-def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
- [(set RFP:$dst, (fadd RFP:$src1,
- (X86fild addr:$src2, i32)))]>;
- // ST(0) = ST(0) + [mem32int]
-
-//===---------------------------------------------------------------------===//
-
-The FP stackifier should handle simple permutates to reduce number of shuffle
-instructions, e.g. turning:
-
-fld P -> fld Q
-fld Q fld P
-fxch
-
-or:
-
-fxch -> fucomi
-fucomi jl X
-jg X
-
-Ideas:
-http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
-
-
-//===---------------------------------------------------------------------===//
-
-Add a target specific hook to DAG combiner to handle SINT_TO_FP and
-FP_TO_SINT when the source operand is already in memory.
-
-//===---------------------------------------------------------------------===//
-
-Open code rint,floor,ceil,trunc:
-http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
-http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
-
-Opencode the sincos[f] libcall.
-
-//===---------------------------------------------------------------------===//
-
-None of the FPStack instructions are handled in
-X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
-folding spill code into the instructions.
-
-//===---------------------------------------------------------------------===//
-
-Currently the x86 codegen isn't very good at mixing SSE and FPStack
-code:
-
-unsigned int foo(double x) { return x; }
-
-foo:
- subl $20, %esp
- movsd 24(%esp), %xmm0
- movsd %xmm0, 8(%esp)
- fldl 8(%esp)
- fisttpll (%esp)
- movl (%esp), %eax
- addl $20, %esp
- ret
-
-This just requires being smarter when custom expanding fptoui.
-
-//===---------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: FP stack related stuff
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+Some targets (e.g. athlons) prefer freep to fstp ST(0):
+http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
+
+//===---------------------------------------------------------------------===//
+
+This should use fiadd on chips where it is profitable:
+double foo(double P, int *I) { return P+*I; }
+
+We have fiadd patterns now but the followings have the same cost and
+complexity. We need a way to specify the later is more profitable.
+
+def FpADD32m : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP:$dst, (fadd RFP:$src1,
+ (extloadf64f32 addr:$src2)))]>;
+ // ST(0) = ST(0) + [mem32]
+
+def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
+ [(set RFP:$dst, (fadd RFP:$src1,
+ (X86fild addr:$src2, i32)))]>;
+ // ST(0) = ST(0) + [mem32int]
+
+//===---------------------------------------------------------------------===//
+
+The FP stackifier should handle simple permutates to reduce number of shuffle
+instructions, e.g. turning:
+
+fld P -> fld Q
+fld Q fld P
+fxch
+
+or:
+
+fxch -> fucomi
+fucomi jl X
+jg X
+
+Ideas:
+http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
+
+
+//===---------------------------------------------------------------------===//
+
+Add a target specific hook to DAG combiner to handle SINT_TO_FP and
+FP_TO_SINT when the source operand is already in memory.
+
+//===---------------------------------------------------------------------===//
+
+Open code rint,floor,ceil,trunc:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
+
+Opencode the sincos[f] libcall.
+
+//===---------------------------------------------------------------------===//
+
+None of the FPStack instructions are handled in
+X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
+folding spill code into the instructions.
+
+//===---------------------------------------------------------------------===//
+
+Currently the x86 codegen isn't very good at mixing SSE and FPStack
+code:
+
+unsigned int foo(double x) { return x; }
+
+foo:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl (%esp), %eax
+ addl $20, %esp
+ ret
+
+This just requires being smarter when custom expanding fptoui.
+
+//===---------------------------------------------------------------------===//
diff --git a/contrib/libs/llvm12/lib/Target/X86/README-SSE.txt b/contrib/libs/llvm12/lib/Target/X86/README-SSE.txt
index d52840e5c4..40f526b478 100644
--- a/contrib/libs/llvm12/lib/Target/X86/README-SSE.txt
+++ b/contrib/libs/llvm12/lib/Target/X86/README-SSE.txt
@@ -1,829 +1,829 @@
-//===---------------------------------------------------------------------===//
-// Random ideas for the X86 backend: SSE-specific stuff.
-//===---------------------------------------------------------------------===//
-
-//===---------------------------------------------------------------------===//
-
-SSE Variable shift can be custom lowered to something like this, which uses a
-small table + unaligned load + shuffle instead of going through memory.
-
-__m128i_shift_right:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
-
-...
-__m128i shift_right(__m128i value, unsigned long offset) {
- return _mm_shuffle_epi8(value,
- _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset)));
-}
-
-//===---------------------------------------------------------------------===//
-
-SSE has instructions for doing operations on complex numbers, we should pattern
-match them. For example, this should turn into a horizontal add:
-
-typedef float __attribute__((vector_size(16))) v4f32;
-float f32(v4f32 A) {
- return A[0]+A[1]+A[2]+A[3];
-}
-
-Instead we get this:
-
-_f32: ## @f32
- pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0]
- addss %xmm0, %xmm1
- pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0]
- movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1]
- movaps %xmm0, %xmm3
- addss %xmm1, %xmm3
- movdqa %xmm2, %xmm0
- addss %xmm3, %xmm0
- ret
-
-Also, there are cases where some simple local SLP would improve codegen a bit.
-compiling this:
-
-_Complex float f32(_Complex float A, _Complex float B) {
- return A+B;
-}
-
-into:
-
-_f32: ## @f32
- movdqa %xmm0, %xmm2
- addss %xmm1, %xmm2
- pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0]
- pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0]
- addss %xmm1, %xmm3
- movaps %xmm2, %xmm0
- unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
- ret
-
-seems silly when it could just be one addps.
-
-
-//===---------------------------------------------------------------------===//
-
-Expand libm rounding functions inline: Significant speedups possible.
-http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
-
-//===---------------------------------------------------------------------===//
-
-When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
-other fast SSE modes.
-
-//===---------------------------------------------------------------------===//
-
-Think about doing i64 math in SSE regs on x86-32.
-
-//===---------------------------------------------------------------------===//
-
-This testcase should have no SSE instructions in it, and only one load from
-a constant pool:
-
-double %test3(bool %B) {
- %C = select bool %B, double 123.412, double 523.01123123
- ret double %C
-}
-
-Currently, the select is being lowered, which prevents the dag combiner from
-turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
-
-The pattern isel got this one right.
-
-//===---------------------------------------------------------------------===//
-
-Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
-feasible.
-
-//===---------------------------------------------------------------------===//
-
-Codegen:
- if (copysign(1.0, x) == copysign(1.0, y))
-into:
- if (x^y & mask)
-when using SSE.
-
-//===---------------------------------------------------------------------===//
-
-Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
-of a v4sf value.
-
-//===---------------------------------------------------------------------===//
-
-Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
-Perhaps use pxor / xorp* to clear a XMM register first?
-
-//===---------------------------------------------------------------------===//
-
-External test Nurbs exposed some problems. Look for
-__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
-emits:
-
- movaps (%edx), %xmm2 #59.21
- movaps (%edx), %xmm5 #60.21
- movaps (%edx), %xmm4 #61.21
- movaps (%edx), %xmm3 #62.21
- movl 40(%ecx), %ebp #69.49
- shufps $0, %xmm2, %xmm5 #60.21
- movl 100(%esp), %ebx #69.20
- movl (%ebx), %edi #69.20
- imull %ebp, %edi #69.49
- addl (%eax), %edi #70.33
- shufps $85, %xmm2, %xmm4 #61.21
- shufps $170, %xmm2, %xmm3 #62.21
- shufps $255, %xmm2, %xmm2 #63.21
- lea (%ebp,%ebp,2), %ebx #69.49
- negl %ebx #69.49
- lea -3(%edi,%ebx), %ebx #70.33
- shll $4, %ebx #68.37
- addl 32(%ecx), %ebx #68.37
- testb $15, %bl #91.13
- jne L_B1.24 # Prob 5% #91.13
-
-This is the llvm code after instruction scheduling:
-
-cond_next140 (0xa910740, LLVM BB @0xa90beb0):
- %reg1078 = MOV32ri -3
- %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
- %reg1037 = MOV32rm %reg1024, 1, %noreg, 40
- %reg1080 = IMUL32rr %reg1079, %reg1037
- %reg1081 = MOV32rm %reg1058, 1, %noreg, 0
- %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
- %reg1036 = MOV32rm %reg1024, 1, %noreg, 32
- %reg1082 = SHL32ri %reg1038, 4
- %reg1039 = ADD32rr %reg1036, %reg1082
- %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
- %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
- %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
- %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
- %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
- %reg1040 = MOV32rr %reg1039
- %reg1084 = AND32ri8 %reg1039, 15
- CMP32ri8 %reg1084, 0
- JE mbb<cond_next204,0xa914d30>
-
-Still ok. After register allocation:
-
-cond_next140 (0xa910740, LLVM BB @0xa90beb0):
- %eax = MOV32ri -3
- %edx = MOV32rm %stack.3, 1, %noreg, 0
- ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
- %edx = MOV32rm %stack.7, 1, %noreg, 0
- %edx = MOV32rm %edx, 1, %noreg, 40
- IMUL32rr %eax<def&use>, %edx
- %esi = MOV32rm %stack.5, 1, %noreg, 0
- %esi = MOV32rm %esi, 1, %noreg, 0
- MOV32mr %stack.4, 1, %noreg, 0, %esi
- %eax = LEA32r %esi, 1, %eax, -3
- %esi = MOV32rm %stack.7, 1, %noreg, 0
- %esi = MOV32rm %esi, 1, %noreg, 32
- %edi = MOV32rr %eax
- SHL32ri %edi<def&use>, 4
- ADD32rr %edi<def&use>, %esi
- %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
- %xmm1 = MOVAPSrr %xmm0
- SHUFPSrr %xmm1<def&use>, %xmm1, 170
- %xmm2 = MOVAPSrr %xmm0
- SHUFPSrr %xmm2<def&use>, %xmm2, 0
- %xmm3 = MOVAPSrr %xmm0
- SHUFPSrr %xmm3<def&use>, %xmm3, 255
- SHUFPSrr %xmm0<def&use>, %xmm0, 85
- %ebx = MOV32rr %edi
- AND32ri8 %ebx<def&use>, 15
- CMP32ri8 %ebx, 0
- JE mbb<cond_next204,0xa914d30>
-
-This looks really bad. The problem is shufps is a destructive opcode. Since it
-appears as operand two in more than one shufps ops. It resulted in a number of
-copies. Note icc also suffers from the same problem. Either the instruction
-selector should select pshufd or The register allocator can made the two-address
-to three-address transformation.
-
-It also exposes some other problems. See MOV32ri -3 and the spills.
-
-//===---------------------------------------------------------------------===//
-
-Consider:
-
-__m128 test(float a) {
- return _mm_set_ps(0.0, 0.0, 0.0, a*a);
-}
-
-This compiles into:
-
-movss 4(%esp), %xmm1
-mulss %xmm1, %xmm1
-xorps %xmm0, %xmm0
-movss %xmm1, %xmm0
-ret
-
-Because mulss doesn't modify the top 3 elements, the top elements of
-xmm1 are already zero'd. We could compile this to:
-
-movss 4(%esp), %xmm0
-mulss %xmm0, %xmm0
-ret
-
-//===---------------------------------------------------------------------===//
-
-Here's a sick and twisted idea. Consider code like this:
-
-__m128 test(__m128 a) {
- float b = *(float*)&A;
- ...
- return _mm_set_ps(0.0, 0.0, 0.0, b);
-}
-
-This might compile to this code:
-
-movaps c(%esp), %xmm1
-xorps %xmm0, %xmm0
-movss %xmm1, %xmm0
-ret
-
-Now consider if the ... code caused xmm1 to get spilled. This might produce
-this code:
-
-movaps c(%esp), %xmm1
-movaps %xmm1, c2(%esp)
-...
-
-xorps %xmm0, %xmm0
-movaps c2(%esp), %xmm1
-movss %xmm1, %xmm0
-ret
-
-However, since the reload is only used by these instructions, we could
-"fold" it into the uses, producing something like this:
-
-movaps c(%esp), %xmm1
-movaps %xmm1, c2(%esp)
-...
-
-movss c2(%esp), %xmm0
-ret
-
-... saving two instructions.
-
-The basic idea is that a reload from a spill slot, can, if only one 4-byte
-chunk is used, bring in 3 zeros the one element instead of 4 elements.
-This can be used to simplify a variety of shuffle operations, where the
-elements are fixed zeros.
-
-//===---------------------------------------------------------------------===//
-
-This code generates ugly code, probably due to costs being off or something:
-
-define void @test(float* %P, <4 x float>* %P2 ) {
- %xFloat0.688 = load float* %P
- %tmp = load <4 x float>* %P2
- %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
- store <4 x float> %inFloat3.713, <4 x float>* %P2
- ret void
-}
-
-Generates:
-
-_test:
- movl 8(%esp), %eax
- movaps (%eax), %xmm0
- pxor %xmm1, %xmm1
- movaps %xmm0, %xmm2
- shufps $50, %xmm1, %xmm2
- shufps $132, %xmm2, %xmm0
- movaps %xmm0, (%eax)
- ret
-
-Would it be better to generate:
-
-_test:
- movl 8(%esp), %ecx
- movaps (%ecx), %xmm0
- xor %eax, %eax
- pinsrw $6, %eax, %xmm0
- pinsrw $7, %eax, %xmm0
- movaps %xmm0, (%ecx)
- ret
-
-?
-
-//===---------------------------------------------------------------------===//
-
-Some useful information in the Apple Altivec / SSE Migration Guide:
-
-http://developer.apple.com/documentation/Performance/Conceptual/
-Accelerate_sse_migration/index.html
-
-e.g. SSE select using and, andnot, or. Various SSE compare translations.
-
-//===---------------------------------------------------------------------===//
-
-Add hooks to commute some CMPP operations.
-
-//===---------------------------------------------------------------------===//
-
-Apply the same transformation that merged four float into a single 128-bit load
-to loads from constant pool.
-
-//===---------------------------------------------------------------------===//
-
-Floating point max / min are commutable when -enable-unsafe-fp-path is
-specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
-nodes which are selected to max / min instructions that are marked commutable.
-
-//===---------------------------------------------------------------------===//
-
-We should materialize vector constants like "all ones" and "signbit" with
-code like:
-
- cmpeqps xmm1, xmm1 ; xmm1 = all-ones
-
-and:
- cmpeqps xmm1, xmm1 ; xmm1 = all-ones
- psrlq xmm1, 31 ; xmm1 = all 100000000000...
-
-instead of using a load from the constant pool. The later is important for
-ABS/NEG/copysign etc.
-
-//===---------------------------------------------------------------------===//
-
-These functions:
-
-#include <xmmintrin.h>
-__m128i a;
-void x(unsigned short n) {
- a = _mm_slli_epi32 (a, n);
-}
-void y(unsigned n) {
- a = _mm_slli_epi32 (a, n);
-}
-
-compile to ( -O3 -static -fomit-frame-pointer):
-_x:
- movzwl 4(%esp), %eax
- movd %eax, %xmm0
- movaps _a, %xmm1
- pslld %xmm0, %xmm1
- movaps %xmm1, _a
- ret
-_y:
- movd 4(%esp), %xmm0
- movaps _a, %xmm1
- pslld %xmm0, %xmm1
- movaps %xmm1, _a
- ret
-
-"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
-like movd would be sufficient in both cases as the value is already zero
-extended in the 32-bit stack slot IIRC. For signed short, it should also be
-save, as a really-signed value would be undefined for pslld.
-
-
-//===---------------------------------------------------------------------===//
-
-#include <math.h>
-int t1(double d) { return signbit(d); }
-
-This currently compiles to:
- subl $12, %esp
- movsd 16(%esp), %xmm0
- movsd %xmm0, (%esp)
- movl 4(%esp), %eax
- shrl $31, %eax
- addl $12, %esp
- ret
-
-We should use movmskp{s|d} instead.
-
-//===---------------------------------------------------------------------===//
-
-CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
-(aligned) vector load. This functionality has a couple of problems.
-
-1. The code to infer alignment from loads of globals is in the X86 backend,
- not the dag combiner. This is because dagcombine2 needs to be able to see
- through the X86ISD::Wrapper node, which DAGCombine can't really do.
-2. The code for turning 4 x load into a single vector load is target
- independent and should be moved to the dag combiner.
-3. The code for turning 4 x load into a vector load can only handle a direct
- load from a global or a direct load from the stack. It should be generalized
- to handle any load from P, P+4, P+8, P+12, where P can be anything.
-4. The alignment inference code cannot handle loads from globals in non-static
- mode because it doesn't look through the extra dyld stub load. If you try
- vec_align.ll without -relocation-model=static, you'll see what I mean.
-
-//===---------------------------------------------------------------------===//
-
-We should lower store(fneg(load p), q) into an integer load+xor+store, which
-eliminates a constant pool load. For example, consider:
-
-define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
-entry:
- %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1]
- %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
- ret i64 %tmp20
-}
-declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
-
-This currently compiles to:
-
-LCPI1_0: # <4 x float>
- .long 2147483648 # float -0
- .long 2147483648 # float -0
- .long 2147483648 # float -0
- .long 2147483648 # float -0
-_ccosf:
- subl $12, %esp
- movss 16(%esp), %xmm0
- movss %xmm0, 4(%esp)
- movss 20(%esp), %xmm0
- xorps LCPI1_0, %xmm0
- movss %xmm0, (%esp)
- call L_ccoshf$stub
- addl $12, %esp
- ret
-
-Note the load into xmm0, then xor (to negate), then store. In PIC mode,
-this code computes the pic base and does two loads to do the constant pool
-load, so the improvement is much bigger.
-
-The tricky part about this xform is that the argument load/store isn't exposed
-until post-legalize, and at that point, the fneg has been custom expanded into
-an X86 fxor. This means that we need to handle this case in the x86 backend
-instead of in target independent code.
-
-//===---------------------------------------------------------------------===//
-
-Non-SSE4 insert into 16 x i8 is atrociously bad.
-
-//===---------------------------------------------------------------------===//
-
-<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
-is memory.
-
-//===---------------------------------------------------------------------===//
-
-INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
-any number of 0.0 simultaneously. Currently we only use it for simple
-insertions.
-
-See comments in LowerINSERT_VECTOR_ELT_SSE4.
-
-//===---------------------------------------------------------------------===//
-
-On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
-Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
-legal, it'll just take a few extra patterns written in the .td file.
-
-Note: this is not a code quality issue; the custom lowered code happens to be
-right, but we shouldn't have to custom lower anything. This is probably related
-to <2 x i64> ops being so bad.
-
-//===---------------------------------------------------------------------===//
-
-LLVM currently generates stack realignment code when it is not really
-needed. The problem is that we need to know about stack alignment too early,
-before RA runs.
-
-At that point we don't know whether there will be a vector spill or not.
-Stack realignment logic is overly conservative here, but otherwise we can
-produce unaligned loads/stores.
-
-Fixing this will require some huge RA changes.
-
-Testcase:
-#include <emmintrin.h>
-
-typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
-
-static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
--22725, -12873};
-
-vSInt16 madd(vSInt16 b)
-{
- return _mm_madd_epi16(a, b);
-}
-
-Generated code (x86-32, linux):
-madd:
- pushl %ebp
- movl %esp, %ebp
- andl $-16, %esp
- movaps .LCPI1_0, %xmm1
- pmaddwd %xmm1, %xmm0
- movl %ebp, %esp
- popl %ebp
- ret
-
-//===---------------------------------------------------------------------===//
-
-Consider:
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: SSE-specific stuff.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+SSE Variable shift can be custom lowered to something like this, which uses a
+small table + unaligned load + shuffle instead of going through memory.
+
+__m128i_shift_right:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  .byte  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+
+...
+__m128i shift_right(__m128i value, unsigned long offset) {
+ return _mm_shuffle_epi8(value,
+               _mm_loadu_si128((__m128i *) (___m128i_shift_right + offset)));
+}
+
+//===---------------------------------------------------------------------===//
+
+SSE has instructions for doing operations on complex numbers; we should pattern
+match them. For example, this should turn into a horizontal add:
+
+typedef float __attribute__((vector_size(16))) v4f32;
+float f32(v4f32 A) {
+ return A[0]+A[1]+A[2]+A[3];
+}
+
+Instead we get this:
+
+_f32: ## @f32
+ pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0]
+ addss %xmm0, %xmm1
+ pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0]
+ movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1]
+ movaps %xmm0, %xmm3
+ addss %xmm1, %xmm3
+ movdqa %xmm2, %xmm0
+ addss %xmm3, %xmm0
+ ret
+
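+As a rough sketch (not something we generate today), the f32 reduction above
+could be done with two haddps steps on SSE3; the helper name below is invented
+for illustration:
+
+#include <pmmintrin.h>   /* SSE3: _mm_hadd_ps */
+
+float f32_hadd(__m128 a) {
+  __m128 t = _mm_hadd_ps(a, a);   /* {a0+a1, a2+a3, a0+a1, a2+a3} */
+  t = _mm_hadd_ps(t, t);          /* every lane = a0+a1+a2+a3 */
+  return _mm_cvtss_f32(t);        /* extract the low lane */
+}
+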
+Also, there are cases where some simple local SLP would improve codegen a bit,
+e.g. compiling this:
+
+_Complex float f32(_Complex float A, _Complex float B) {
+ return A+B;
+}
+
+into:
+
+_f32: ## @f32
+ movdqa %xmm0, %xmm2
+ addss %xmm1, %xmm2
+ pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0]
+ pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0]
+ addss %xmm1, %xmm3
+ movaps %xmm2, %xmm0
+ unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+ ret
+
+seems silly when it could just be one addps.
+
+
+//===---------------------------------------------------------------------===//
+
+Expand libm rounding functions inline: Significant speedups possible.
+http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
+
+//===---------------------------------------------------------------------===//
+
+When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
+other fast SSE modes.
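+
+For reference, a minimal user-level sketch of what turning those modes on looks
+like through the MXCSR intrinsics (this is an illustration, not something we
+emit today):
+
+#include <xmmintrin.h>   /* FTZ */
+#include <pmmintrin.h>   /* DAZ (SSE3 header) */
+
+void enable_fast_sse_modes(void) {
+  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);         /* flush denormal results to zero */
+  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); /* treat denormal inputs as zero */
+}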
+
+//===---------------------------------------------------------------------===//
+
+Think about doing i64 math in SSE regs on x86-32.
+
+//===---------------------------------------------------------------------===//
+
+This testcase should have no SSE instructions in it, and only one load from
+a constant pool:
+
+double %test3(bool %B) {
+ %C = select bool %B, double 123.412, double 523.01123123
+ ret double %C
+}
+
+Currently, the select is being lowered, which prevents the dag combiner from
+turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
+
+The pattern isel got this one right.
+
+//===---------------------------------------------------------------------===//
+
+Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
+feasible.
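+
+An illustrative-only sketch of the kind of expansion meant here, for a length
+that is a known multiple of 16 (the helper name is invented):
+
+#include <emmintrin.h>
+#include <stddef.h>
+
+void memset_sse(void *dst, char c, size_t n) {  /* n assumed to be a multiple of 16 */
+  __m128i v = _mm_set1_epi8(c);
+  for (size_t i = 0; i != n; i += 16)
+    _mm_storeu_si128((__m128i *)((char *)dst + i), v);
+}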
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+ if (copysign(1.0, x) == copysign(1.0, y))
+into:
+ if (x^y & mask)
+when using SSE.
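+
+This works because the comparison depends only on the sign bits; a scalar sketch
+of the idea (helper name invented):
+
+#include <stdint.h>
+#include <string.h>
+
+int same_sign(double x, double y) {
+  uint64_t xb, yb;
+  memcpy(&xb, &x, sizeof xb);   /* bit-cast without aliasing trouble */
+  memcpy(&yb, &y, sizeof yb);
+  /* copysign(1.0, x) == copysign(1.0, y) iff the sign bits agree */
+  return ((xb ^ yb) & 0x8000000000000000ULL) == 0;
+}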
+
+//===---------------------------------------------------------------------===//
+
+Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
+of a v4sf value.
+
+//===---------------------------------------------------------------------===//
+
+Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
+Perhaps use pxor / xorp* to clear an XMM register first?
+
+//===---------------------------------------------------------------------===//
+
+External test Nurbs exposed some problems. Look for
+__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
+emits:
+
+ movaps (%edx), %xmm2 #59.21
+ movaps (%edx), %xmm5 #60.21
+ movaps (%edx), %xmm4 #61.21
+ movaps (%edx), %xmm3 #62.21
+ movl 40(%ecx), %ebp #69.49
+ shufps $0, %xmm2, %xmm5 #60.21
+ movl 100(%esp), %ebx #69.20
+ movl (%ebx), %edi #69.20
+ imull %ebp, %edi #69.49
+ addl (%eax), %edi #70.33
+ shufps $85, %xmm2, %xmm4 #61.21
+ shufps $170, %xmm2, %xmm3 #62.21
+ shufps $255, %xmm2, %xmm2 #63.21
+ lea (%ebp,%ebp,2), %ebx #69.49
+ negl %ebx #69.49
+ lea -3(%edi,%ebx), %ebx #70.33
+ shll $4, %ebx #68.37
+ addl 32(%ecx), %ebx #68.37
+ testb $15, %bl #91.13
+ jne L_B1.24 # Prob 5% #91.13
+
+This is the llvm code after instruction scheduling:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+ %reg1078 = MOV32ri -3
+ %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
+ %reg1037 = MOV32rm %reg1024, 1, %noreg, 40
+ %reg1080 = IMUL32rr %reg1079, %reg1037
+ %reg1081 = MOV32rm %reg1058, 1, %noreg, 0
+ %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
+ %reg1036 = MOV32rm %reg1024, 1, %noreg, 32
+ %reg1082 = SHL32ri %reg1038, 4
+ %reg1039 = ADD32rr %reg1036, %reg1082
+ %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
+ %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
+ %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
+ %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
+ %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
+ %reg1040 = MOV32rr %reg1039
+ %reg1084 = AND32ri8 %reg1039, 15
+ CMP32ri8 %reg1084, 0
+ JE mbb<cond_next204,0xa914d30>
+
+Still ok. After register allocation:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+ %eax = MOV32ri -3
+ %edx = MOV32rm %stack.3, 1, %noreg, 0
+ ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
+ %edx = MOV32rm %stack.7, 1, %noreg, 0
+ %edx = MOV32rm %edx, 1, %noreg, 40
+ IMUL32rr %eax<def&use>, %edx
+ %esi = MOV32rm %stack.5, 1, %noreg, 0
+ %esi = MOV32rm %esi, 1, %noreg, 0
+ MOV32mr %stack.4, 1, %noreg, 0, %esi
+ %eax = LEA32r %esi, 1, %eax, -3
+ %esi = MOV32rm %stack.7, 1, %noreg, 0
+ %esi = MOV32rm %esi, 1, %noreg, 32
+ %edi = MOV32rr %eax
+ SHL32ri %edi<def&use>, 4
+ ADD32rr %edi<def&use>, %esi
+ %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
+ %xmm1 = MOVAPSrr %xmm0
+ SHUFPSrr %xmm1<def&use>, %xmm1, 170
+ %xmm2 = MOVAPSrr %xmm0
+ SHUFPSrr %xmm2<def&use>, %xmm2, 0
+ %xmm3 = MOVAPSrr %xmm0
+ SHUFPSrr %xmm3<def&use>, %xmm3, 255
+ SHUFPSrr %xmm0<def&use>, %xmm0, 85
+ %ebx = MOV32rr %edi
+ AND32ri8 %ebx<def&use>, 15
+ CMP32ri8 %ebx, 0
+ JE mbb<cond_next204,0xa914d30>
+
+This looks really bad. The problem is that shufps is a destructive opcode: since
+the same value appears as operand two in more than one shufps op, it results in
+a number of copies. Note that icc also suffers from the same problem. Either
+the instruction selector should select pshufd, or the register allocator could
+make the two-address to three-address transformation.
+
+It also exposes some other problems. See MOV32ri -3 and the spills.
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+__m128 test(float a) {
+ return _mm_set_ps(0.0, 0.0, 0.0, a*a);
+}
+
+This compiles into:
+
+movss 4(%esp), %xmm1
+mulss %xmm1, %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Because mulss doesn't modify the top 3 elements, the top elements of
+xmm1 are already zero'd. We could compile this to:
+
+movss 4(%esp), %xmm0
+mulss %xmm0, %xmm0
+ret
+
+//===---------------------------------------------------------------------===//
+
+Here's a sick and twisted idea. Consider code like this:
+
+__m128 test(__m128 a) {
+  float b = *(float*)&a;
+ ...
+ return _mm_set_ps(0.0, 0.0, 0.0, b);
+}
+
+This might compile to this code:
+
+movaps c(%esp), %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Now consider if the ... code caused xmm1 to get spilled. This might produce
+this code:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+xorps %xmm0, %xmm0
+movaps c2(%esp), %xmm1
+movss %xmm1, %xmm0
+ret
+
+However, since the reload is only used by these instructions, we could
+"fold" it into the uses, producing something like this:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+movss c2(%esp), %xmm0
+ret
+
+... saving two instructions.
+
+The basic idea is that a reload from a spill slot can, if only one 4-byte
+chunk is used, bring in 3 zeros plus the one element instead of 4 elements.
+This can be used to simplify a variety of shuffle operations, where the
+elements are fixed zeros.
+
+//===---------------------------------------------------------------------===//
+
+This code generates ugly code, probably due to costs being off or something:
+
+define void @test(float* %P, <4 x float>* %P2 ) {
+ %xFloat0.688 = load float* %P
+ %tmp = load <4 x float>* %P2
+ %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
+ store <4 x float> %inFloat3.713, <4 x float>* %P2
+ ret void
+}
+
+Generates:
+
+_test:
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ pxor %xmm1, %xmm1
+ movaps %xmm0, %xmm2
+ shufps $50, %xmm1, %xmm2
+ shufps $132, %xmm2, %xmm0
+ movaps %xmm0, (%eax)
+ ret
+
+Would it be better to generate:
+
+_test:
+ movl 8(%esp), %ecx
+ movaps (%ecx), %xmm0
+ xor %eax, %eax
+ pinsrw $6, %eax, %xmm0
+ pinsrw $7, %eax, %xmm0
+ movaps %xmm0, (%ecx)
+ ret
+
+?
+
+//===---------------------------------------------------------------------===//
+
+Some useful information in the Apple Altivec / SSE Migration Guide:
+
+http://developer.apple.com/documentation/Performance/Conceptual/
+Accelerate_sse_migration/index.html
+
+e.g. SSE select using and, andnot, or. Various SSE compare translations.
+
+//===---------------------------------------------------------------------===//
+
+Add hooks to commute some CMPP operations.
+
+//===---------------------------------------------------------------------===//
+
+Apply the same transformation that merged four float into a single 128-bit load
+to loads from constant pool.
+
+//===---------------------------------------------------------------------===//
+
+Floating point max / min are commutable when -enable-unsafe-fp-math is
+specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
+nodes which are selected to max / min instructions that are marked commutable.
+
+//===---------------------------------------------------------------------===//
+
+We should materialize vector constants like "all ones" and "signbit" with
+code like:
+
+ cmpeqps xmm1, xmm1 ; xmm1 = all-ones
+
+and:
+ cmpeqps xmm1, xmm1 ; xmm1 = all-ones
+ psrlq xmm1, 31 ; xmm1 = all 100000000000...
+
+instead of using a load from the constant pool. The latter is important for
+ABS/NEG/copysign etc.
+
+//===---------------------------------------------------------------------===//
+
+These functions:
+
+#include <xmmintrin.h>
+__m128i a;
+void x(unsigned short n) {
+ a = _mm_slli_epi32 (a, n);
+}
+void y(unsigned n) {
+ a = _mm_slli_epi32 (a, n);
+}
+
+compile to ( -O3 -static -fomit-frame-pointer):
+_x:
+ movzwl 4(%esp), %eax
+ movd %eax, %xmm0
+ movaps _a, %xmm1
+ pslld %xmm0, %xmm1
+ movaps %xmm1, _a
+ ret
+_y:
+ movd 4(%esp), %xmm0
+ movaps _a, %xmm1
+ pslld %xmm0, %xmm1
+ movaps %xmm1, _a
+ ret
+
+"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
+like movd would be sufficient in both cases as the value is already zero
+extended in the 32-bit stack slot IIRC. For signed short, it should also be
+safe, as a genuinely negative value would be undefined for pslld.
+
+
+//===---------------------------------------------------------------------===//
+
+#include <math.h>
+int t1(double d) { return signbit(d); }
+
+This currently compiles to:
+ subl $12, %esp
+ movsd 16(%esp), %xmm0
+ movsd %xmm0, (%esp)
+ movl 4(%esp), %eax
+ shrl $31, %eax
+ addl $12, %esp
+ ret
+
+We should use movmskp{s|d} instead.
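+
+For example, written with the SSE2 intrinsics the same test maps directly onto
+movmskpd (sketch only; the helper name is invented):
+
+#include <emmintrin.h>
+
+int t1_movmsk(double d) {
+  return _mm_movemask_pd(_mm_set_sd(d)) & 1;   /* bit 0 is the sign bit of d */
+}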
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
+(aligned) vector load. This functionality has a couple of problems.
+
+1. The code to infer alignment from loads of globals is in the X86 backend,
+ not the dag combiner. This is because dagcombine2 needs to be able to see
+ through the X86ISD::Wrapper node, which DAGCombine can't really do.
+2. The code for turning 4 x load into a single vector load is target
+ independent and should be moved to the dag combiner.
+3. The code for turning 4 x load into a vector load can only handle a direct
+ load from a global or a direct load from the stack. It should be generalized
+ to handle any load from P, P+4, P+8, P+12, where P can be anything.
+4. The alignment inference code cannot handle loads from globals in non-static
+ mode because it doesn't look through the extra dyld stub load. If you try
+ vec_align.ll without -relocation-model=static, you'll see what I mean.
+
+//===---------------------------------------------------------------------===//
+
+We should lower store(fneg(load p), q) into an integer load+xor+store, which
+eliminates a constant pool load. For example, consider:
+
+define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
+entry:
+ %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1]
+ %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
+ ret i64 %tmp20
+}
+declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
+
+This currently compiles to:
+
+LCPI1_0: # <4 x float>
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+_ccosf:
+ subl $12, %esp
+ movss 16(%esp), %xmm0
+ movss %xmm0, 4(%esp)
+ movss 20(%esp), %xmm0
+ xorps LCPI1_0, %xmm0
+ movss %xmm0, (%esp)
+ call L_ccoshf$stub
+ addl $12, %esp
+ ret
+
+Note the load into xmm0, then xor (to negate), then store. In PIC mode,
+this code computes the pic base and does two loads to do the constant pool
+load, so the improvement is much bigger.
+
+The tricky part about this xform is that the argument load/store isn't exposed
+until post-legalize, and at that point, the fneg has been custom expanded into
+an X86 fxor. This means that we need to handle this case in the x86 backend
+instead of in target independent code.
+
+//===---------------------------------------------------------------------===//
+
+Non-SSE4 insert into 16 x i8 is atrociously bad.
+
+//===---------------------------------------------------------------------===//
+
+<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
+is memory.
+
+//===---------------------------------------------------------------------===//
+
+INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
+any number of 0.0 simultaneously. Currently we only use it for simple
+insertions.
+
+See comments in LowerINSERT_VECTOR_ELT_SSE4.
+
+//===---------------------------------------------------------------------===//
+
+On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
+Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
+legal, it'll just take a few extra patterns written in the .td file.
+
+Note: this is not a code quality issue; the custom lowered code happens to be
+right, but we shouldn't have to custom lower anything. This is probably related
+to <2 x i64> ops being so bad.
+
+//===---------------------------------------------------------------------===//
+
+LLVM currently generates stack realignment code when it is not really
+needed. The problem is that we need to know about stack alignment too early,
+before RA runs.
+
+At that point we don't know whether there will be a vector spill or not.
+Stack realignment logic is overly conservative here, but otherwise we can
+produce unaligned loads/stores.
+
+Fixing this will require some huge RA changes.
+
+Testcase:
#include <emmintrin.h>
-__m128 foo2 (float x) {
- return _mm_set_ps (0, 0, x, 0);
-}
-
-In x86-32 mode, we generate this spiffy code:
-
-_foo2:
- movss 4(%esp), %xmm0
- pshufd $81, %xmm0, %xmm0
- ret
-
-In x86-64 mode, we generate this code, which could be better:
-
-_foo2:
- xorps %xmm1, %xmm1
- movss %xmm0, %xmm1
- pshufd $81, %xmm1, %xmm0
- ret
-
-In sse4 mode, we could use insertps to make both better.
-
-Here's another testcase that could use insertps [mem]:
-
-#include <xmmintrin.h>
-extern float x2, x3;
-__m128 foo1 (float x1, float x4) {
- return _mm_set_ps (x2, x1, x3, x4);
-}
-
-gcc mainline compiles it to:
-
-foo1:
- insertps $0x10, x2(%rip), %xmm0
- insertps $0x10, x3(%rip), %xmm1
- movaps %xmm1, %xmm2
- movlhps %xmm0, %xmm2
- movaps %xmm2, %xmm0
- ret
-
-//===---------------------------------------------------------------------===//
-
-We compile vector multiply-by-constant into poor code:
-
-define <4 x i32> @f(<4 x i32> %i) nounwind {
- %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
- ret <4 x i32> %A
-}
-
-On targets without SSE4.1, this compiles into:
-
-LCPI1_0: ## <4 x i32>
- .long 10
- .long 10
- .long 10
- .long 10
- .text
- .align 4,0x90
- .globl _f
-_f:
- pshufd $3, %xmm0, %xmm1
- movd %xmm1, %eax
- imull LCPI1_0+12, %eax
- movd %eax, %xmm1
- pshufd $1, %xmm0, %xmm2
- movd %xmm2, %eax
- imull LCPI1_0+4, %eax
- movd %eax, %xmm2
- punpckldq %xmm1, %xmm2
- movd %xmm0, %eax
- imull LCPI1_0, %eax
- movd %eax, %xmm1
- movhlps %xmm0, %xmm0
- movd %xmm0, %eax
- imull LCPI1_0+8, %eax
- movd %eax, %xmm0
- punpckldq %xmm0, %xmm1
- movaps %xmm1, %xmm0
- punpckldq %xmm2, %xmm0
- ret
-
-It would be better to synthesize integer vector multiplication by constants
-using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
-simple cases such as multiplication by powers of two would be better as
-vector shifts than as multiplications.
-
-//===---------------------------------------------------------------------===//
-
-We compile this:
-
-__m128i
-foo2 (char x)
-{
- return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
-}
-
-into:
- movl $1, %eax
- xorps %xmm0, %xmm0
- pinsrw $2, %eax, %xmm0
- movzbl 4(%esp), %eax
- pinsrw $3, %eax, %xmm0
- movl $256, %eax
- pinsrw $7, %eax, %xmm0
- ret
-
-
-gcc-4.2:
- subl $12, %esp
- movzbl 16(%esp), %eax
- movdqa LC0, %xmm0
- pinsrw $3, %eax, %xmm0
- addl $12, %esp
- ret
- .const
- .align 4
-LC0:
- .word 0
- .word 0
- .word 1
- .word 0
- .word 0
- .word 0
- .word 0
- .word 256
-
-With SSE4, it should be
- movdqa .LC0(%rip), %xmm0
- pinsrb $6, %edi, %xmm0
-
-//===---------------------------------------------------------------------===//
-
-We should transform a shuffle of two vectors of constants into a single vector
-of constants. Also, insertelement of a constant into a vector of constants
-should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
-
-We compiled it to something horrible:
-
- .align 4
-LCPI1_1: ## float
- .long 1065353216 ## float 1
- .const
-
- .align 4
-LCPI1_0: ## <4 x float>
- .space 4
- .long 1065353216 ## float 1
- .space 4
- .long 1065353216 ## float 1
- .text
- .align 4,0x90
- .globl _t
-_t:
- xorps %xmm0, %xmm0
- movhps LCPI1_0, %xmm0
- movss LCPI1_1, %xmm1
- movaps %xmm0, %xmm2
- shufps $2, %xmm1, %xmm2
- shufps $132, %xmm2, %xmm0
- movaps %xmm0, 0
-
-//===---------------------------------------------------------------------===//
-rdar://5907648
-
-This function:
-
-float foo(unsigned char x) {
- return x;
-}
-
-compiles to (x86-32):
-
-define float @foo(i8 zeroext %x) nounwind {
- %tmp12 = uitofp i8 %x to float ; <float> [#uses=1]
- ret float %tmp12
-}
-
-compiles to:
-
-_foo:
- subl $4, %esp
- movzbl 8(%esp), %eax
- cvtsi2ss %eax, %xmm0
- movss %xmm0, (%esp)
- flds (%esp)
- addl $4, %esp
- ret
-
-We should be able to use:
- cvtsi2ss 8($esp), %xmm0
-since we know the stack slot is already zext'd.
-
-//===---------------------------------------------------------------------===//
-
-Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
-when code size is critical. movlps is slower than movsd on core2 but it's one
-byte shorter.
-
-//===---------------------------------------------------------------------===//
-
-We should use a dynamic programming based approach to tell when using FPStack
-operations is cheaper than SSE. SciMark montecarlo contains code like this
-for example:
-
-double MonteCarlo_num_flops(int Num_samples) {
- return ((double) Num_samples)* 4.0;
-}
-
-In fpstack mode, this compiles into:
-
-LCPI1_0:
- .long 1082130432 ## float 4.000000e+00
-_MonteCarlo_num_flops:
- subl $4, %esp
- movl 8(%esp), %eax
- movl %eax, (%esp)
- fildl (%esp)
- fmuls LCPI1_0
- addl $4, %esp
- ret
-
-in SSE mode, it compiles into significantly slower code:
-
-_MonteCarlo_num_flops:
- subl $12, %esp
- cvtsi2sd 16(%esp), %xmm0
- mulsd LCPI1_0, %xmm0
- movsd %xmm0, (%esp)
- fldl (%esp)
- addl $12, %esp
- ret
-
-There are also other cases in scimark where using fpstack is better, it is
-cheaper to do fld1 than load from a constant pool for example, so
-"load, add 1.0, store" is better done in the fp stack, etc.
-
-//===---------------------------------------------------------------------===//
-
-These should compile into the same code (PR6214): Perhaps instcombine should
-canonicalize the former into the later?
-
-define float @foo(float %x) nounwind {
- %t = bitcast float %x to i32
- %s = and i32 %t, 2147483647
- %d = bitcast i32 %s to float
- ret float %d
-}
-
-declare float @fabsf(float %n)
-define float @bar(float %x) nounwind {
- %d = call float @fabsf(float %x)
- ret float %d
-}
-
-//===---------------------------------------------------------------------===//
-
-This IR (from PR6194):
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-darwin10.0.0"
-
-%0 = type { double, double }
-%struct.float3 = type { float, float, float }
-
-define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
-entry:
- %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1]
- %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1]
- %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1]
- %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1]
- %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1]
- %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1]
- %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
- store float %tmp12, float* %tmp5
- ret void
-}
-
-Compiles to:
-
-_test: ## @test
- movd %xmm0, %rax
- shrq $32, %rax
- movl %eax, 4(%rdi)
- ret
-
-This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
-doing a shuffle from v[1] to v[0] then a float store.
-
-//===---------------------------------------------------------------------===//
-
-[UNSAFE FP]
-
-void foo(double, double, double);
-void norm(double x, double y, double z) {
- double scale = __builtin_sqrt(x*x + y*y + z*z);
- foo(x/scale, y/scale, z/scale);
-}
-
-We currently generate an sqrtsd and 3 divsd instructions. This is bad, fp div is
-slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
-and emit 3 mulsd in place of the divs. This can be done as a target-independent
-transform.
-
-If we're dealing with floats instead of doubles we could even replace the sqrtss
-and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
-cost of reduced accuracy.
-
-//===---------------------------------------------------------------------===//
+
+typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
+
+static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
+-22725, -12873};
+
+vSInt16 madd(vSInt16 b)
+{
+ return _mm_madd_epi16(a, b);
+}
+
+Generated code (x86-32, linux):
+madd:
+ pushl %ebp
+ movl %esp, %ebp
+ andl $-16, %esp
+ movaps .LCPI1_0, %xmm1
+ pmaddwd %xmm1, %xmm0
+ movl %ebp, %esp
+ popl %ebp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+#include <emmintrin.h>
+__m128 foo2 (float x) {
+ return _mm_set_ps (0, 0, x, 0);
+}
+
+In x86-32 mode, we generate this spiffy code:
+
+_foo2:
+ movss 4(%esp), %xmm0
+ pshufd $81, %xmm0, %xmm0
+ ret
+
+In x86-64 mode, we generate this code, which could be better:
+
+_foo2:
+ xorps %xmm1, %xmm1
+ movss %xmm0, %xmm1
+ pshufd $81, %xmm1, %xmm0
+ ret
+
+In sse4 mode, we could use insertps to make both better.
+
+Here's another testcase that could use insertps [mem]:
+
+#include <xmmintrin.h>
+extern float x2, x3;
+__m128 foo1 (float x1, float x4) {
+ return _mm_set_ps (x2, x1, x3, x4);
+}
+
+gcc mainline compiles it to:
+
+foo1:
+ insertps $0x10, x2(%rip), %xmm0
+ insertps $0x10, x3(%rip), %xmm1
+ movaps %xmm1, %xmm2
+ movlhps %xmm0, %xmm2
+ movaps %xmm2, %xmm0
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We compile vector multiply-by-constant into poor code:
+
+define <4 x i32> @f(<4 x i32> %i) nounwind {
+ %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
+ ret <4 x i32> %A
+}
+
+On targets without SSE4.1, this compiles into:
+
+LCPI1_0: ## <4 x i32>
+ .long 10
+ .long 10
+ .long 10
+ .long 10
+ .text
+ .align 4,0x90
+ .globl _f
+_f:
+ pshufd $3, %xmm0, %xmm1
+ movd %xmm1, %eax
+ imull LCPI1_0+12, %eax
+ movd %eax, %xmm1
+ pshufd $1, %xmm0, %xmm2
+ movd %xmm2, %eax
+ imull LCPI1_0+4, %eax
+ movd %eax, %xmm2
+ punpckldq %xmm1, %xmm2
+ movd %xmm0, %eax
+ imull LCPI1_0, %eax
+ movd %eax, %xmm1
+ movhlps %xmm0, %xmm0
+ movd %xmm0, %eax
+ imull LCPI1_0+8, %eax
+ movd %eax, %xmm0
+ punpckldq %xmm0, %xmm1
+ movaps %xmm1, %xmm0
+ punpckldq %xmm2, %xmm0
+ ret
+
+It would be better to synthesize integer vector multiplication by constants
+using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
+simple cases such as multiplication by powers of two would be better as
+vector shifts than as multiplications.
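+
+For the multiply-by-10 case above, a shift-and-add expansion (10 = 8 + 2) might
+look like this SSE2 sketch (helper name invented):
+
+#include <emmintrin.h>
+
+__m128i mul_by_10(__m128i v) {
+  __m128i by8 = _mm_slli_epi32(v, 3);   /* pslld $3 : v*8 */
+  __m128i by2 = _mm_slli_epi32(v, 1);   /* pslld $1 : v*2 */
+  return _mm_add_epi32(by8, by2);       /* paddd    : v*10 */
+}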
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+__m128i
+foo2 (char x)
+{
+ return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
+}
+
+into:
+ movl $1, %eax
+ xorps %xmm0, %xmm0
+ pinsrw $2, %eax, %xmm0
+ movzbl 4(%esp), %eax
+ pinsrw $3, %eax, %xmm0
+ movl $256, %eax
+ pinsrw $7, %eax, %xmm0
+ ret
+
+
+gcc-4.2:
+ subl $12, %esp
+ movzbl 16(%esp), %eax
+ movdqa LC0, %xmm0
+ pinsrw $3, %eax, %xmm0
+ addl $12, %esp
+ ret
+ .const
+ .align 4
+LC0:
+ .word 0
+ .word 0
+ .word 1
+ .word 0
+ .word 0
+ .word 0
+ .word 0
+ .word 256
+
+With SSE4, it should be
+ movdqa .LC0(%rip), %xmm0
+ pinsrb $6, %edi, %xmm0
+
+//===---------------------------------------------------------------------===//
+
+We should transform a shuffle of two vectors of constants into a single vector
+of constants. Also, insertelement of a constant into a vector of constants
+should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
+
+We compiled it to something horrible:
+
+ .align 4
+LCPI1_1: ## float
+ .long 1065353216 ## float 1
+ .const
+
+ .align 4
+LCPI1_0: ## <4 x float>
+ .space 4
+ .long 1065353216 ## float 1
+ .space 4
+ .long 1065353216 ## float 1
+ .text
+ .align 4,0x90
+ .globl _t
+_t:
+ xorps %xmm0, %xmm0
+ movhps LCPI1_0, %xmm0
+ movss LCPI1_1, %xmm1
+ movaps %xmm0, %xmm2
+ shufps $2, %xmm1, %xmm2
+ shufps $132, %xmm2, %xmm0
+ movaps %xmm0, 0
+
+//===---------------------------------------------------------------------===//
+rdar://5907648
+
+This function:
+
+float foo(unsigned char x) {
+ return x;
+}
+
+compiles to (x86-32):
+
+define float @foo(i8 zeroext %x) nounwind {
+ %tmp12 = uitofp i8 %x to float ; <float> [#uses=1]
+ ret float %tmp12
+}
+
+compiles to:
+
+_foo:
+ subl $4, %esp
+ movzbl 8(%esp), %eax
+ cvtsi2ss %eax, %xmm0
+ movss %xmm0, (%esp)
+ flds (%esp)
+ addl $4, %esp
+ ret
+
+We should be able to use:
+ cvtsi2ss 8($esp), %xmm0
+since we know the stack slot is already zext'd.
+
+//===---------------------------------------------------------------------===//
+
+Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
+when code size is critical. movlps is slower than movsd on core2 but it's one
+byte shorter.
+
+//===---------------------------------------------------------------------===//
+
+We should use a dynamic programming based approach to tell when using FPStack
+operations is cheaper than SSE. SciMark montecarlo contains code like this
+for example:
+
+double MonteCarlo_num_flops(int Num_samples) {
+ return ((double) Num_samples)* 4.0;
+}
+
+In fpstack mode, this compiles into:
+
+LCPI1_0:
+ .long 1082130432 ## float 4.000000e+00
+_MonteCarlo_num_flops:
+ subl $4, %esp
+ movl 8(%esp), %eax
+ movl %eax, (%esp)
+ fildl (%esp)
+ fmuls LCPI1_0
+ addl $4, %esp
+ ret
+
+in SSE mode, it compiles into significantly slower code:
+
+_MonteCarlo_num_flops:
+ subl $12, %esp
+ cvtsi2sd 16(%esp), %xmm0
+ mulsd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+ fldl (%esp)
+ addl $12, %esp
+ ret
+
+There are also other cases in scimark where using fpstack is better, it is
+cheaper to do fld1 than load from a constant pool for example, so
+"load, add 1.0, store" is better done in the fp stack, etc.
+
+//===---------------------------------------------------------------------===//
+
+These should compile into the same code (PR6214): Perhaps instcombine should
+canonicalize the former into the later?
+
+define float @foo(float %x) nounwind {
+ %t = bitcast float %x to i32
+ %s = and i32 %t, 2147483647
+ %d = bitcast i32 %s to float
+ ret float %d
+}
+
+declare float @fabsf(float %n)
+define float @bar(float %x) nounwind {
+ %d = call float @fabsf(float %x)
+ ret float %d
+}
+
+//===---------------------------------------------------------------------===//
+
+This IR (from PR6194):
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin10.0.0"
+
+%0 = type { double, double }
+%struct.float3 = type { float, float, float }
+
+define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
+entry:
+ %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1]
+ %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1]
+ %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1]
+ %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1]
+ %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1]
+ %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1]
+ %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
+ store float %tmp12, float* %tmp5
+ ret void
+}
+
+Compiles to:
+
+_test: ## @test
+ movd %xmm0, %rax
+ shrq $32, %rax
+ movl %eax, 4(%rdi)
+ ret
+
+This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
+doing a shuffle from v[1] to v[0] then a float store.
+
+//===---------------------------------------------------------------------===//
+
+[UNSAFE FP]
+
+void foo(double, double, double);
+void norm(double x, double y, double z) {
+ double scale = __builtin_sqrt(x*x + y*y + z*z);
+ foo(x/scale, y/scale, z/scale);
+}
+
+We currently generate an sqrtsd and 3 divsd instructions. This is bad, fp div is
+slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
+and emit 3 mulsd in place of the divs. This can be done as a target-independent
+transform.
+
+If we're dealing with floats instead of doubles we could even replace the sqrtss
+and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
+cost of reduced accuracy.
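+
+A sketch of the float variant under fast-math assumptions (foo_f and the helper
+are hypothetical): rsqrtss gives roughly 12 bits of precision, and one
+Newton-Raphson step recovers most of the rest.
+
+#include <xmmintrin.h>
+
+void foo_f(float, float, float);
+
+static float approx_rsqrt(float x) {
+  float r = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));  /* rsqrtss */
+  return r * (1.5f - 0.5f * x * r * r);                  /* one N-R refinement step */
+}
+
+void norm_f(float x, float y, float z) {
+  float inv = approx_rsqrt(x * x + y * y + z * z);
+  foo_f(x * inv, y * inv, z * inv);
+}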
+
+//===---------------------------------------------------------------------===//
diff --git a/contrib/libs/llvm12/lib/Target/X86/README-X86-64.txt b/contrib/libs/llvm12/lib/Target/X86/README-X86-64.txt
index a3ea4595ac..d919c697bd 100644
--- a/contrib/libs/llvm12/lib/Target/X86/README-X86-64.txt
+++ b/contrib/libs/llvm12/lib/Target/X86/README-X86-64.txt
@@ -1,184 +1,184 @@
-//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
-
-AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
-multiplication by a constant. How much of it applies to Intel's X86-64
-implementation? There are definite trade-offs to consider: latency vs. register
-pressure vs. code size.
-
-//===---------------------------------------------------------------------===//
-
-Are we better off using branches instead of cmove to implement FP to
-unsigned i64?
-
-_conv:
- ucomiss LC0(%rip), %xmm0
- cvttss2siq %xmm0, %rdx
- jb L3
- subss LC0(%rip), %xmm0
- movabsq $-9223372036854775808, %rax
- cvttss2siq %xmm0, %rdx
- xorq %rax, %rdx
-L3:
- movq %rdx, %rax
- ret
-
-instead of
-
-_conv:
- movss LCPI1_0(%rip), %xmm1
- cvttss2siq %xmm0, %rcx
- movaps %xmm0, %xmm2
- subss %xmm1, %xmm2
- cvttss2siq %xmm2, %rax
- movabsq $-9223372036854775808, %rdx
- xorq %rdx, %rax
- ucomiss %xmm1, %xmm0
- cmovb %rcx, %rax
- ret
-
-Seems like the jb branch has a high likelihood of being taken. It would have
-saved a few instructions.
-
-//===---------------------------------------------------------------------===//
-
-It's not possible to reference AH, BH, CH, and DH registers in an instruction
-requiring a REX prefix. However, divb and mulb both produce results in AH. If isel
-emits a CopyFromReg from AH, it gets turned into a movb whose destination could
-be allocated to r8b - r15b, which would then require a REX prefix.
-
-To get around this, isel emits a CopyFromReg from AX and then right shifts it
-down by 8 and truncates it. It's not pretty but it works. We need some register
-allocation magic to make the hack go away (e.g. putting additional constraints
-on the result of the movb).
-
-//===---------------------------------------------------------------------===//
-
-The x86-64 ABI for hidden-argument struct returns requires that the
-incoming value of %rdi be copied into %rax by the callee upon return.
-
-The idea is that it saves callers from having to remember this value,
-which would often require a callee-saved register. Callees usually
-need to keep this value live for most of their body anyway, so it
-doesn't add a significant burden on them.
-
-We currently implement this in codegen; however, this is suboptimal
-because it means that it would be quite awkward to implement the
-optimization for callers.
-
-A better implementation would be to relax the LLVM IR rules for sret
-arguments to allow a function with an sret argument to have a non-void
-return type, and to have the front-end set up the sret argument value
-as the return value of the function. The front-end could more easily
-emit uses of the returned struct value to be in terms of the function's
-lowered return value, and it would free non-C frontends from a
-complication only required by a C-based ABI.
-
-//===---------------------------------------------------------------------===//
-
-We get a redundant zero extension for code like this:
-
-int mask[1000];
-int foo(unsigned x) {
- if (x < 10)
- x = x * 45;
- else
- x = x * 78;
- return mask[x];
-}
-
-_foo:
-LBB1_0: ## entry
- cmpl $9, %edi
- jbe LBB1_3 ## bb
-LBB1_1: ## bb1
- imull $78, %edi, %eax
-LBB1_2: ## bb2
- movl %eax, %eax <----
- movq _mask@GOTPCREL(%rip), %rcx
- movl (%rcx,%rax,4), %eax
- ret
-LBB1_3: ## bb
- imull $45, %edi, %eax
- jmp LBB1_2 ## bb2
-
-Before regalloc, we have:
-
- %reg1025 = IMUL32rri8 %reg1024, 45, implicit-def %eflags
- JMP mbb<bb2,0x203afb0>
- Successors according to CFG: 0x203afb0 (#3)
-
-bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
- Predecessors according to CFG: 0x203aec0 (#0)
- %reg1026 = IMUL32rri8 %reg1024, 78, implicit-def %eflags
- Successors according to CFG: 0x203afb0 (#3)
-
-bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
- Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
- %reg1027 = PHI %reg1025, mbb<bb,0x203af10>,
- %reg1026, mbb<bb1,0x203af60>
- %reg1029 = MOVZX64rr32 %reg1027
-
-so we'd have to know that IMUL32rri8 leaves the high word zero extended and to
-be able to recognize the zero extend. This could also presumably be implemented
-if we have whole-function selectiondags.
-
-//===---------------------------------------------------------------------===//
-
-Take the following code
-(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):
-extern unsigned long table[];
-unsigned long foo(unsigned char *p) {
- unsigned long tag = *p;
- return table[tag >> 4] + table[tag & 0xf];
-}
-
-Current code generated:
- movzbl (%rdi), %eax
- movq %rax, %rcx
- andq $240, %rcx
- shrq %rcx
- andq $15, %rax
- movq table(,%rax,8), %rax
- addq table(%rcx), %rax
- ret
-
-Issues:
-1. First movq should be movl; saves a byte.
-2. Both andq's should be andl; saves another two bytes. I think this was
- implemented at one point, but subsequently regressed.
-3. shrq should be shrl; saves another byte.
-4. The first andq can be completely eliminated by using a slightly more
- expensive addressing mode.
-
-//===---------------------------------------------------------------------===//
-
-Consider the following (contrived testcase, but contains common factors):
-
-#include <stdarg.h>
-int test(int x, ...) {
- int sum, i;
- va_list l;
- va_start(l, x);
- for (i = 0; i < x; i++)
- sum += va_arg(l, int);
- va_end(l);
- return sum;
-}
-
-Testcase given in C because fixing it will likely involve changing the IR
-generated for it. The primary issue with the result is that it doesn't do any
-of the optimizations which are possible if we know the address of a va_list
-in the current function is never taken:
-1. We shouldn't spill the XMM registers because we only call va_arg with "int".
-2. It would be nice if we could sroa the va_list.
-3. Probably overkill, but it'd be cool if we could peel off the first five
-iterations of the loop.
-
-Other optimizations involving functions which use va_arg on floats which don't
-have the address of a va_list taken:
-1. Conversely to the above, we shouldn't spill general registers if we only
- call va_arg on "double".
-2. If we know nothing more than 64 bits wide is read from the XMM registers,
- we can change the spilling code to reduce the amount of stack used by half.
-
-//===---------------------------------------------------------------------===//
+//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
+
+AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
+multiplication by a constant. How much of it applies to Intel's X86-64
+implementation? There are definite trade-offs to consider: latency vs. register
+pressure vs. code size.
+
+//===---------------------------------------------------------------------===//
+
+Are we better off using branches instead of cmove to implement FP to
+unsigned i64?
+
+_conv:
+ ucomiss LC0(%rip), %xmm0
+ cvttss2siq %xmm0, %rdx
+ jb L3
+ subss LC0(%rip), %xmm0
+ movabsq $-9223372036854775808, %rax
+ cvttss2siq %xmm0, %rdx
+ xorq %rax, %rdx
+L3:
+ movq %rdx, %rax
+ ret
+
+instead of
+
+_conv:
+ movss LCPI1_0(%rip), %xmm1
+ cvttss2siq %xmm0, %rcx
+ movaps %xmm0, %xmm2
+ subss %xmm1, %xmm2
+ cvttss2siq %xmm2, %rax
+ movabsq $-9223372036854775808, %rdx
+ xorq %rdx, %rax
+ ucomiss %xmm1, %xmm0
+ cmovb %rcx, %rax
+ ret
+
+Seems like the jb branch has a high likelihood of being taken. It would have
+saved a few instructions.
+
+//===---------------------------------------------------------------------===//
+
+It's not possible to reference AH, BH, CH, and DH registers in an instruction
+requiring a REX prefix. However, divb and mulb both produce results in AH. If isel
+emits a CopyFromReg from AH, it gets turned into a movb whose destination could
+be allocated to r8b - r15b, which would then require a REX prefix.
+
+To get around this, isel emits a CopyFromReg from AX and then right shifts it
+down by 8 and truncates it. It's not pretty but it works. We need some register
+allocation magic to make the hack go away (e.g. putting additional constraints
+on the result of the movb).
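+
+For reference, the kind of source that exercises this path is simply an 8-bit
+remainder, where divb leaves its result in AH (illustrative only):
+
+unsigned char rem8(unsigned char a, unsigned char b) {
+  return a % b;   /* divb computes the quotient in AL and the remainder in AH */
+}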
+
+//===---------------------------------------------------------------------===//
+
+The x86-64 ABI for hidden-argument struct returns requires that the
+incoming value of %rdi be copied into %rax by the callee upon return.
+
+The idea is that it saves callers from having to remember this value,
+which would often require a callee-saved register. Callees usually
+need to keep this value live for most of their body anyway, so it
+doesn't add a significant burden on them.
+
+We currently implement this in codegen; however, this is suboptimal
+because it means that it would be quite awkward to implement the
+optimization for callers.
+
+A better implementation would be to relax the LLVM IR rules for sret
+arguments to allow a function with an sret argument to have a non-void
+return type, and to have the front-end set up the sret argument value
+as the return value of the function. The front-end could more easily
+emit uses of the returned struct value to be in terms of the function's
+lowered return value, and it would free non-C frontends from a
+complication only required by a C-based ABI.
+
+//===---------------------------------------------------------------------===//
+
+We get a redundant zero extension for code like this:
+
+int mask[1000];
+int foo(unsigned x) {
+ if (x < 10)
+ x = x * 45;
+ else
+ x = x * 78;
+ return mask[x];
+}
+
+_foo:
+LBB1_0: ## entry
+ cmpl $9, %edi
+ jbe LBB1_3 ## bb
+LBB1_1: ## bb1
+ imull $78, %edi, %eax
+LBB1_2: ## bb2
+ movl %eax, %eax <----
+ movq _mask@GOTPCREL(%rip), %rcx
+ movl (%rcx,%rax,4), %eax
+ ret
+LBB1_3: ## bb
+ imull $45, %edi, %eax
+ jmp LBB1_2 ## bb2
+
+Before regalloc, we have:
+
+ %reg1025 = IMUL32rri8 %reg1024, 45, implicit-def %eflags
+ JMP mbb<bb2,0x203afb0>
+ Successors according to CFG: 0x203afb0 (#3)
+
+bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
+ Predecessors according to CFG: 0x203aec0 (#0)
+ %reg1026 = IMUL32rri8 %reg1024, 78, implicit-def %eflags
+ Successors according to CFG: 0x203afb0 (#3)
+
+bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
+ Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
+ %reg1027 = PHI %reg1025, mbb<bb,0x203af10>,
+ %reg1026, mbb<bb1,0x203af60>
+ %reg1029 = MOVZX64rr32 %reg1027
+
+so we'd have to know that IMUL32rri8 leaves the high word zero extended and to
+be able to recognize the zero extend. This could also presumably be implemented
+if we have whole-function selectiondags.
+
+//===---------------------------------------------------------------------===//
+
+Take the following code
+(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):
+extern unsigned long table[];
+unsigned long foo(unsigned char *p) {
+ unsigned long tag = *p;
+ return table[tag >> 4] + table[tag & 0xf];
+}
+
+Current code generated:
+ movzbl (%rdi), %eax
+ movq %rax, %rcx
+ andq $240, %rcx
+ shrq %rcx
+ andq $15, %rax
+ movq table(,%rax,8), %rax
+ addq table(%rcx), %rax
+ ret
+
+Issues:
+1. First movq should be movl; saves a byte.
+2. Both andq's should be andl; saves another two bytes. I think this was
+ implemented at one point, but subsequently regressed.
+3. shrq should be shrl; saves another byte.
+4. The first andq can be completely eliminated by using a slightly more
+ expensive addressing mode.
+
+//===---------------------------------------------------------------------===//
+
+Consider the following (contrived testcase, but contains common factors):
+
+#include <stdarg.h>
+int test(int x, ...) {
+ int sum, i;
+ va_list l;
+ va_start(l, x);
+ for (i = 0; i < x; i++)
+ sum += va_arg(l, int);
+ va_end(l);
+ return sum;
+}
+
+Testcase given in C because fixing it will likely involve changing the IR
+generated for it. The primary issue with the result is that it doesn't do any
+of the optimizations which are possible if we know the address of a va_list
+in the current function is never taken:
+1. We shouldn't spill the XMM registers because we only call va_arg with "int".
+2. It would be nice if we could sroa the va_list.
+3. Probably overkill, but it'd be cool if we could peel off the first five
+iterations of the loop.
+
+Other optimizations involving functions which use va_arg on floats which don't
+have the address of a va_list taken:
+1. Conversely to the above, we shouldn't spill general registers if we only
+ call va_arg on "double".
+2. If we know nothing more than 64 bits wide is read from the XMM registers,
+ we can change the spilling code to reduce the amount of stack used by half.
+
+//===---------------------------------------------------------------------===//
diff --git a/contrib/libs/llvm12/lib/Target/X86/README.txt b/contrib/libs/llvm12/lib/Target/X86/README.txt
index c06a7b1ade..6bc6e74b26 100644
--- a/contrib/libs/llvm12/lib/Target/X86/README.txt
+++ b/contrib/libs/llvm12/lib/Target/X86/README.txt
@@ -1,1794 +1,1794 @@
-//===---------------------------------------------------------------------===//
-// Random ideas for the X86 backend.
-//===---------------------------------------------------------------------===//
-
-Improvements to the multiply -> shift/add algorithm:
-http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
-
-//===---------------------------------------------------------------------===//
-
-Improve code like this (occurs fairly frequently, e.g. in LLVM):
-long long foo(int x) { return 1LL << x; }
-
-http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
-http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
-http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
-
-Another useful one would be ~0ULL >> X and ~0ULL << X.
-
-One better solution for 1LL << x is:
- xorl %eax, %eax
- xorl %edx, %edx
- testb $32, %cl
- sete %al
- setne %dl
- sall %cl, %eax
- sall %cl, %edx
-
-But that requires good 8-bit subreg support.
-
-Also, this might be better. It's an extra shift, but it's one instruction
-shorter, and doesn't stress 8-bit subreg support.
-(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
-but without the unnecessary and.)
- movl %ecx, %eax
- shrl $5, %eax
- movl %eax, %edx
- xorl $1, %edx
- sall %cl, %eax
-  sall %cl, %edx
-
-64-bit shifts (in general) expand to really bad code. Instead of using
-cmovs, we should expand to a conditional branch like GCC produces.
-
-//===---------------------------------------------------------------------===//
-
-Some isel ideas:
-
-1. Dynamic programming based approach when compile time is not an
- issue.
-2. Code duplication (addressing mode) during isel.
-3. Other ideas from "Register-Sensitive Selection, Duplication, and
- Sequencing of Instructions".
-4. Scheduling for reduced register pressure. E.g. "Minimum Register
- Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
- and other related papers.
- http://citeseer.ist.psu.edu/govindarajan01minimum.html
-
-//===---------------------------------------------------------------------===//
-
-Should we promote i16 to i32 to avoid partial register update stalls?
-
-//===---------------------------------------------------------------------===//
-
-Leave any_extend as pseudo instruction and hint to register
-allocator. Delay codegen until post register allocation.
-Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
-the coalescer how to deal with it though.
-
-//===---------------------------------------------------------------------===//
-
-It appears icc uses push for parameter passing. Need to investigate.
-
-//===---------------------------------------------------------------------===//
-
-The instruction selector sometimes misses folding a load into a compare. The
-pattern is written as (cmp reg, (load p)). Because the compare isn't
-commutative, it is not matched with the load on both sides. The dag combiner
-should be made smart enough to canonicalize the load into the RHS of a compare
-when it can invert the result of the compare for free.
-
-//===---------------------------------------------------------------------===//
-
-In many cases, LLVM generates code like this:
-
-_test:
- movl 8(%esp), %eax
- cmpl %eax, 4(%esp)
- setl %al
- movzbl %al, %eax
- ret
-
-On some processors (which ones?), it is more efficient to do this:
-
-_test:
- movl 8(%esp), %ebx
- xor %eax, %eax
- cmpl %ebx, 4(%esp)
- setl %al
- ret
-
-Doing this correctly is tricky though, as the xor clobbers the flags.
-
-//===---------------------------------------------------------------------===//
-
-We should generate bts/btr/etc instructions on targets where they are cheap or
-when codesize is important. e.g., for:
-
-void setbit(int *target, int bit) {
- *target |= (1 << bit);
-}
-void clearbit(int *target, int bit) {
- *target &= ~(1 << bit);
-}
-
-//===---------------------------------------------------------------------===//
-
-Instead of the following for memset char*, 1, 10:
-
- movl $16843009, 4(%edx)
- movl $16843009, (%edx)
- movw $257, 8(%edx)
-
-It might be better to generate
-
- movl $16843009, %eax
- movl %eax, 4(%edx)
- movl %eax, (%edx)
-  movw %ax, 8(%edx)
-
-when we can spare a register. It reduces code size.
-
-//===---------------------------------------------------------------------===//
-
-Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
-get this:
-
-define i32 @test1(i32 %X) {
- %Y = sdiv i32 %X, 8
- ret i32 %Y
-}
-
-_test1:
- movl 4(%esp), %eax
- movl %eax, %ecx
- sarl $31, %ecx
- shrl $29, %ecx
- addl %ecx, %eax
- sarl $3, %eax
- ret
-
-GCC knows several different ways to codegen it, one of which is this:
-
-_test1:
- movl 4(%esp), %eax
- cmpl $-1, %eax
- leal 7(%eax), %ecx
- cmovle %ecx, %eax
- sarl $3, %eax
- ret
-
-which is probably slower, but it's interesting at least :)
-
-//===---------------------------------------------------------------------===//
-
-We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
-We should leave these as libcalls for everything over a much lower threshold,
-since libc is hand tuned for medium and large mem ops (avoiding RFO for large
-stores, TLB preheating, etc)
-
-//===---------------------------------------------------------------------===//
-
-Optimize this into something reasonable:
- x * copysign(1.0, y) * copysign(1.0, z)
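-
-Since copysign(1.0, y) and copysign(1.0, z) contribute only their sign bits, the
-whole expression reduces to XORing those two sign bits onto x; a scalar sketch
-(function name invented):
-
-#include <stdint.h>
-#include <string.h>
-
-double mul_by_signs(double x, double y, double z) {
-  uint64_t xb, yb, zb;
-  memcpy(&xb, &x, sizeof xb);
-  memcpy(&yb, &y, sizeof yb);
-  memcpy(&zb, &z, sizeof zb);
-  xb ^= (yb ^ zb) & 0x8000000000000000ULL;   /* flip x's sign by sign(y)^sign(z) */
-  memcpy(&x, &xb, sizeof x);
-  return x;
-}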
-
-//===---------------------------------------------------------------------===//
-
-Optimize copysign(x, *y) to use an integer load from y.
-
-//===---------------------------------------------------------------------===//
-
-The following tests perform worse with LSR:
-
-lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
-
-//===---------------------------------------------------------------------===//
-
-Adding to the list of cmp / test poor codegen issues:
-
-int test(__m128 *A, __m128 *B) {
- if (_mm_comige_ss(*A, *B))
- return 3;
- else
- return 4;
-}
-
-_test:
- movl 8(%esp), %eax
- movaps (%eax), %xmm0
- movl 4(%esp), %eax
- movaps (%eax), %xmm1
- comiss %xmm0, %xmm1
- setae %al
- movzbl %al, %ecx
- movl $3, %eax
- movl $4, %edx
- cmpl $0, %ecx
- cmove %edx, %eax
- ret
-
-Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
-are a number of issues. 1) We are introducing a setcc between the result of the
-intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
-so an any_extend (which becomes a zero extend) is added.
-
-We probably need some kind of target DAG combine hook to fix this.
-
-//===---------------------------------------------------------------------===//
-
-We generate significantly worse code for this than GCC:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
-http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
-
-There is also one case we do worse on PPC.
-
-//===---------------------------------------------------------------------===//
-
-For this:
-
-int test(int a)
-{
- return a * 3;
-}
-
-We currently emit
- imull $3, 4(%esp), %eax
-
-Perhaps this is what we really should generate? Is imull three or four
-cycles? Note: ICC generates this:
- movl 4(%esp), %eax
- leal (%eax,%eax,2), %eax
-
-The current instruction priority is based on pattern complexity. The former is
-more "complex" because it folds a load so the latter will not be emitted.
-
-Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
-should always try to match LEA first since the LEA matching code does some
-estimate to determine whether the match is profitable.
-
-However, if we care more about code size, then imull is better. It's two bytes
-shorter than movl + leal.
-
-On a Pentium M, both variants have the same characteristics with regard
-to throughput; however, the multiplication has a latency of four cycles, as
-opposed to two cycles for the movl+lea variant.
-
-//===---------------------------------------------------------------------===//
-
-It appears gcc places string data with linkonce linkage in
-.section __TEXT,__const_coal,coalesced instead of
-.section __DATA,__const_coal,coalesced.
-Take a look at darwin.h, there are other Darwin assembler directives that we
-do not make use of.
-
-//===---------------------------------------------------------------------===//
-
-define i32 @foo(i32* %a, i32 %t) {
-entry:
- br label %cond_true
-
-cond_true: ; preds = %cond_true, %entry
- %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3]
- %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1]
- %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1]
- %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
- %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1]
- %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2]
- %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2]
- %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1]
- br i1 %tmp, label %bb12, label %cond_true
-
-bb12: ; preds = %cond_true
- ret i32 %tmp7
-}
-is pessimized by -loop-reduce and -indvars
-
-//===---------------------------------------------------------------------===//
-
-u32 to float conversion improvement:
-
-float uint32_2_float( unsigned u ) {
- float fl = (int) (u & 0xffff);
- float fh = (int) (u >> 16);
- fh *= 0x1.0p16f;
- return fh + fl;
-}
-
-00000000 subl $0x04,%esp
-00000003 movl 0x08(%esp,1),%eax
-00000007 movl %eax,%ecx
-00000009 shrl $0x10,%ecx
-0000000c cvtsi2ss %ecx,%xmm0
-00000010 andl $0x0000ffff,%eax
-00000015 cvtsi2ss %eax,%xmm1
-00000019 mulss 0x00000078,%xmm0
-00000021 addss %xmm1,%xmm0
-00000025 movss %xmm0,(%esp,1)
-0000002a flds (%esp,1)
-0000002d addl $0x04,%esp
-00000030 ret
-
-//===---------------------------------------------------------------------===//
-
-When using the fastcc ABI, align the stack slot of an argument of type double
-on an 8-byte boundary to improve performance.
-
-//===---------------------------------------------------------------------===//
-
-GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
-simplifications for integer "x cmp y ? a : b".
-
-//===---------------------------------------------------------------------===//
-
-Consider the expansion of:
-
-define i32 @test3(i32 %X) {
- %tmp1 = urem i32 %X, 255
- ret i32 %tmp1
-}
-
-Currently it compiles to:
-
-...
- movl $2155905153, %ecx
- movl 8(%esp), %esi
- movl %esi, %eax
- mull %ecx
-...
-
-This could be "reassociated" into:
-
- movl $2155905153, %eax
- movl 8(%esp), %ecx
- mull %ecx
-
-to avoid the copy. In fact, the existing two-address stuff would do this
-except that mul isn't a commutative 2-addr instruction. I guess this has
-to be done at isel time based on the #uses of the mul?
-
-//===---------------------------------------------------------------------===//
-
-Make sure the instruction which starts a loop does not cross a cacheline
-boundary. This requires knowing the exact length of each machine instruction.
-That is somewhat complicated, but doable. Example 256.bzip2:
-
-In the new trace, the hot loop has an instruction which crosses a cacheline
-boundary. In addition to potential cache misses, this can't help decoding as I
-imagine there has to be some kind of complicated decoder reset and realignment
-to grab the bytes from the next cacheline.
-
-532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
-942 942 0x3d03 movl %dh, (1809(%esp, %esi)
-937 937 0x3d0a incl %esi
-3 3 0x3d0b cmpb %bl, %dl
-27 27 0x3d0d jnz 0x000062db <main+11707>
-
-//===---------------------------------------------------------------------===//
-
-In C99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
-
-//===---------------------------------------------------------------------===//
-
-This could be a single 16-bit load.
-
-int f(char *p) {
- if ((p[0] == 1) & (p[1] == 2)) return 1;
- return 0;
-}
-
-//===---------------------------------------------------------------------===//
-
-We should inline lrintf and probably other libc functions.
-
-//===---------------------------------------------------------------------===//
-
-This code:
-
-void test(int X) {
- if (X) abort();
-}
-
-is currently compiled to:
-
-_test:
- subl $12, %esp
- cmpl $0, 16(%esp)
- jne LBB1_1
- addl $12, %esp
- ret
-LBB1_1:
- call L_abort$stub
-
-It would be better to produce:
-
-_test:
- subl $12, %esp
- cmpl $0, 16(%esp)
- jne L_abort$stub
- addl $12, %esp
- ret
-
-This can be applied to any no-return function call that takes no arguments etc.
-Alternatively, the stack save/restore logic could be shrink-wrapped, producing
-something like this:
-
-_test:
- cmpl $0, 4(%esp)
- jne LBB1_1
- ret
-LBB1_1:
- subl $12, %esp
- call L_abort$stub
-
-Both are useful in different situations. Finally, it could be shrink-wrapped
-and tail called, like this:
-
-_test:
- cmpl $0, 4(%esp)
- jne LBB1_1
- ret
-LBB1_1:
- pop %eax # realign stack.
- call L_abort$stub
-
-Though this probably isn't worth it.
-
-//===---------------------------------------------------------------------===//
-
-Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
-a neg instead of a sub instruction. Consider:
-
-int test(char X) { return 7-X; }
-
-we currently produce:
-_test:
- movl $7, %eax
- movsbl 4(%esp), %ecx
- subl %ecx, %eax
- ret
-
-We would use one fewer register if codegen'd as:
-
- movsbl 4(%esp), %eax
- neg %eax
- add $7, %eax
- ret
-
-Note that this isn't beneficial if the load can be folded into the sub. In
-this case, we want a sub:
-
-int test(int X) { return 7-X; }
-_test:
- movl $7, %eax
- subl 4(%esp), %eax
- ret
-
-//===---------------------------------------------------------------------===//
-
-Leaf functions that require one 4-byte spill slot have a prolog like this:
-
-_foo:
- pushl %esi
- subl $4, %esp
-...
-and an epilog like this:
- addl $4, %esp
- popl %esi
- ret
-
-It would be smaller, and potentially faster, to push eax on entry and to
-pop into a dummy register instead of using addl/subl of esp. Just don't pop
-into any return registers :)
-
-//===---------------------------------------------------------------------===//
-
-The X86 backend should fold (branch (or (setcc, setcc))) into multiple
-branches. We generate really poor code for:
-
-double testf(double a) {
- return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
-}
-
-For example, the entry BB is:
-
-_testf:
- subl $20, %esp
- pxor %xmm0, %xmm0
- movsd 24(%esp), %xmm1
- ucomisd %xmm0, %xmm1
- setnp %al
- sete %cl
- testb %cl, %al
- jne LBB1_5 # UnifiedReturnBlock
-LBB1_1: # cond_true
-
-
-it would be better to replace the last four instructions with:
-
- jp LBB1_1
- je LBB1_5
-LBB1_1:
-
-We also codegen the inner ?: into a diamond:
-
- cvtss2sd LCPI1_0(%rip), %xmm2
- cvtss2sd LCPI1_1(%rip), %xmm3
- ucomisd %xmm1, %xmm0
- ja LBB1_3 # cond_true
-LBB1_2: # cond_true
- movapd %xmm3, %xmm2
-LBB1_3: # cond_true
- movapd %xmm2, %xmm0
- ret
-
-We should sink the load into xmm3 into the LBB1_2 block. This should
-be pretty easy, and will nuke all the copies.
-
-//===---------------------------------------------------------------------===//
-
-This:
- #include <algorithm>
- inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
- { return std::make_pair(a + b, a + b < a); }
- bool no_overflow(unsigned a, unsigned b)
- { return !full_add(a, b).second; }
-
-Should compile to:
- addl %esi, %edi
- setae %al
- movzbl %al, %eax
- ret
-
-on x86-64, instead of the rather stupid-looking:
- addl %esi, %edi
- setb %al
- xorb $1, %al
- movzbl %al, %eax
- ret
-
-
-//===---------------------------------------------------------------------===//
-
-The following code:
-
-bb114.preheader: ; preds = %cond_next94
- %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
- %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
- %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
- %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
- %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
- %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
- %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
- %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
- %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
- %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
- %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
- br label %bb114
-
-produces:
-
-LBB3_5: # bb114.preheader
- movswl -68(%ebp), %eax
- movl $32, %ecx
- movl %ecx, -80(%ebp)
- subl %eax, -80(%ebp)
- movswl -52(%ebp), %eax
- movl %ecx, -84(%ebp)
- subl %eax, -84(%ebp)
- movswl -70(%ebp), %eax
- movl %ecx, -88(%ebp)
- subl %eax, -88(%ebp)
- movswl -50(%ebp), %eax
- subl %eax, %ecx
- movl %ecx, -76(%ebp)
- movswl -42(%ebp), %eax
- movl %eax, -92(%ebp)
- movswl -66(%ebp), %eax
- movl %eax, -96(%ebp)
- movw $0, -98(%ebp)
-
-This appears to be bad because the RA is not folding the store to the stack
-slot into the movl. The above instructions could be:
- movl $32, -80(%ebp)
-...
- movl $32, -84(%ebp)
-...
-This seems like a cross between remat and spill folding.
-
-This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
-change, so we could simply subtract %eax from %ecx first and then use %ecx (or
-vice-versa).
-
-//===---------------------------------------------------------------------===//
-
-This code:
-
- %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1]
- br i1 %tmp659, label %cond_true662, label %cond_next715
-
-produces this:
-
- testw %cx, %cx
- movswl %cx, %esi
- jns LBB4_109 # cond_next715
-
-Shark tells us that using %cx in the testw instruction is sub-optimal. It
-suggests using the 32-bit register (which is what ICC uses).
-
-//===---------------------------------------------------------------------===//
-
-We compile this:
-
-void compare (long long foo) {
- if (foo < 4294967297LL)
- abort();
-}
-
-to:
-
-compare:
- subl $4, %esp
- cmpl $0, 8(%esp)
- setne %al
- movzbw %al, %ax
- cmpl $1, 12(%esp)
- setg %cl
- movzbw %cl, %cx
- cmove %ax, %cx
- testb $1, %cl
- jne .LBB1_2 # UnifiedReturnBlock
-.LBB1_1: # ifthen
- call abort
-.LBB1_2: # UnifiedReturnBlock
- addl $4, %esp
- ret
-
-(also really horrible code on ppc). This is due to the expand code for 64-bit
-compares. GCC produces multiple branches, which is much nicer:
-
-compare:
- subl $12, %esp
- movl 20(%esp), %edx
- movl 16(%esp), %eax
- decl %edx
- jle .L7
-.L5:
- addl $12, %esp
- ret
- .p2align 4,,7
-.L7:
- jl .L4
- cmpl $0, %eax
- .p2align 4,,8
- ja .L5
-.L4:
- .p2align 4,,9
- call abort
-
-//===---------------------------------------------------------------------===//
-
-Tail call optimization improvements: Tail call optimization currently
-pushes all arguments on the top of the stack (their normal place for
-non-tail call optimized calls) that source from the caller's arguments
-or that source from a virtual register (also possibly sourcing from the
-caller's arguments).
-This is done to prevent overwriting of parameters (see example
-below) that might be used later.
-
-example:
-
-int callee(int32, int64);
-int caller(int32 arg1, int32 arg2) {
- int64 local = arg2 * 2;
- return callee(arg2, (int64)local);
-}
-
-[arg1] [!arg2 no longer valid since we moved local onto it]
-[arg2] -> [(int64)
-[RETADDR] local ]
-
-Moving arg1 onto the stack slot of the callee function would overwrite
-arg2 of the caller.
-
-Possible optimizations:
-
-
- - Analyse the actual parameters of the callee to see which would
- overwrite a caller parameter which is used by the callee and only
- push them onto the top of the stack.
-
- int callee (int32 arg1, int32 arg2);
- int caller (int32 arg1, int32 arg2) {
- return callee(arg1,arg2);
- }
-
- Here we don't need to write any variables to the top of the stack
- since they don't overwrite each other.
-
- int callee (int32 arg1, int32 arg2);
- int caller (int32 arg1, int32 arg2) {
- return callee(arg2,arg1);
- }
-
- Here we need to push the arguments because they overwrite each
- other.
-
-//===---------------------------------------------------------------------===//
-
-main ()
-{
- int i = 0;
- unsigned long int z = 0;
-
- do {
- z -= 0x00004000;
- i++;
- if (i > 0x00040000)
- abort ();
- } while (z > 0);
- exit (0);
-}
-
-gcc compiles this to:
-
-_main:
- subl $28, %esp
- xorl %eax, %eax
- jmp L2
-L3:
- cmpl $262144, %eax
- je L10
-L2:
- addl $1, %eax
- cmpl $262145, %eax
- jne L3
- call L_abort$stub
-L10:
- movl $0, (%esp)
- call L_exit$stub
-
-llvm:
-
-_main:
- subl $12, %esp
- movl $1, %eax
- movl $16384, %ecx
-LBB1_1: # bb
- cmpl $262145, %eax
- jge LBB1_4 # cond_true
-LBB1_2: # cond_next
- incl %eax
- addl $4294950912, %ecx
- cmpl $16384, %ecx
- jne LBB1_1 # bb
-LBB1_3: # bb11
- xorl %eax, %eax
- addl $12, %esp
- ret
-LBB1_4: # cond_true
- call L_abort$stub
-
-1. LSR should rewrite the first cmp with induction variable %ecx.
-2. DAG combiner should fold
- leal 1(%eax), %edx
- cmpl $262145, %edx
- =>
- cmpl $262144, %eax
-
-//===---------------------------------------------------------------------===//
-
-define i64 @test(double %X) {
- %Y = fptosi double %X to i64
- ret i64 %Y
-}
-
-compiles to:
-
-_test:
- subl $20, %esp
- movsd 24(%esp), %xmm0
- movsd %xmm0, 8(%esp)
- fldl 8(%esp)
- fisttpll (%esp)
- movl 4(%esp), %edx
- movl (%esp), %eax
- addl $20, %esp
- #FP_REG_KILL
- ret
-
-This should just fldl directly from the input stack slot.
-
-//===---------------------------------------------------------------------===//
-
-This code:
-int foo (int x) { return (x & 65535) | 255; }
-
-Should compile into:
-
-_foo:
- movzwl 4(%esp), %eax
- orl $255, %eax
- ret
-
-instead of:
-_foo:
- movl $65280, %eax
- andl 4(%esp), %eax
- orl $255, %eax
- ret
-
-//===---------------------------------------------------------------------===//
-
-We're codegen'ing multiply of long longs inefficiently:
-
-unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
- return arg1 * arg2;
-}
-
-We compile to (fomit-frame-pointer):
-
-_LLM:
- pushl %esi
- movl 8(%esp), %ecx
- movl 16(%esp), %esi
- movl %esi, %eax
- mull %ecx
- imull 12(%esp), %esi
- addl %edx, %esi
- imull 20(%esp), %ecx
- movl %esi, %edx
- addl %ecx, %edx
- popl %esi
- ret
-
-This looks like a scheduling deficiency and lack of remat of the load from
-the argument area. ICC apparently produces:
-
- movl 8(%esp), %ecx
- imull 12(%esp), %ecx
- movl 16(%esp), %eax
- imull 4(%esp), %eax
- addl %eax, %ecx
- movl 4(%esp), %eax
- mull 12(%esp)
- addl %ecx, %edx
- ret
-
-Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
-
-//===---------------------------------------------------------------------===//
-
-We can fold a store into "zeroing a reg". Instead of:
-
-xorl %eax, %eax
-movl %eax, 124(%esp)
-
-we should get:
-
-movl $0, 124(%esp)
-
-if the flags of the xor are dead.
-
-Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
-be folded into: shl [mem], 1
-
-//===---------------------------------------------------------------------===//
-
-In SSE mode, we turn abs and neg into a load from the constant pool plus an
-xor or an and instruction, for example:
-
- xorpd LCPI1_0, %xmm2
-
-However, if xmm2 gets spilled, we end up with really ugly code like this:
-
- movsd (%esp), %xmm0
- xorpd LCPI1_0, %xmm0
- movsd %xmm0, (%esp)
-
-Since we 'know' that this is a 'neg', we can actually "fold" the spill into
-the neg/abs instruction, turning it into an *integer* operation, like this:
-
- xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31)
-
-you could also use xorb, but xorl is less likely to lead to a partial register
-stall. Here is a contrived testcase:
-
-double a, b, c;
-void test(double *P) {
- double X = *P;
- a = X;
- bar();
- X = -X;
- b = X;
- bar();
- c = X;
-}
-
-//===---------------------------------------------------------------------===//
-
-The code generated on x86 for checking for signed overflow on a multiply, done
-the obvious way, is much longer than it needs to be.
-
-int x(int a, int b) {
- long long prod = (long long)a*b;
- return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
-}
-
-See PR2053 for more details.
-
-//===---------------------------------------------------------------------===//
-
-We should investigate using cdq/cltd (effect: edx = sar eax, 31)
-more aggressively; it should cost the same as a move+shift on any modern
-processor, but it's a lot shorter. Downside is that it puts more
-pressure on register allocation because it has fixed operands.
-
-Example:
-int abs(int x) {return x < 0 ? -x : x;}
-
-gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
-abs:
- movl 4(%esp), %eax
- cltd
- xorl %edx, %eax
- subl %edx, %eax
- ret
-
-//===---------------------------------------------------------------------===//
-
-Take the following code (from
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
-
-extern unsigned char first_one[65536];
-int FirstOnet(unsigned long long arg1)
-{
- if (arg1 >> 48)
- return (first_one[arg1 >> 48]);
- return 0;
-}
-
-
-The following code is currently generated:
-FirstOnet:
- movl 8(%esp), %eax
- cmpl $65536, %eax
- movl 4(%esp), %ecx
- jb .LBB1_2 # UnifiedReturnBlock
-.LBB1_1: # ifthen
- shrl $16, %eax
- movzbl first_one(%eax), %eax
- ret
-.LBB1_2: # UnifiedReturnBlock
- xorl %eax, %eax
- ret
-
-We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
-lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
-
-//===---------------------------------------------------------------------===//
-
-We compile this function:
-
-define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
-entry:
- %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1]
- br i1 %tmp2, label %bb7, label %bb
-
-bb: ; preds = %entry
- %tmp6 = add i32 %b, %a ; <i32> [#uses=1]
- ret i32 %tmp6
-
-bb7: ; preds = %entry
- %tmp10 = sub i32 %a, %c ; <i32> [#uses=1]
- ret i32 %tmp10
-}
-
-to:
-
-foo: # @foo
-# %bb.0: # %entry
- movl 4(%esp), %ecx
- cmpb $0, 16(%esp)
- je .LBB0_2
-# %bb.1: # %bb
- movl 8(%esp), %eax
- addl %ecx, %eax
- ret
-.LBB0_2: # %bb7
- movl 12(%esp), %edx
- movl %ecx, %eax
- subl %edx, %eax
- ret
-
-There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
-couple more movls by putting 4(%esp) into %eax instead of %ecx.
-
-//===---------------------------------------------------------------------===//
-
-See rdar://4653682.
-
-From flops:
-
-LBB1_15: # bb310
- cvtss2sd LCPI1_0, %xmm1
- addsd %xmm1, %xmm0
- movsd 176(%esp), %xmm2
- mulsd %xmm0, %xmm2
- movapd %xmm2, %xmm3
- mulsd %xmm3, %xmm3
- movapd %xmm3, %xmm4
- mulsd LCPI1_23, %xmm4
- addsd LCPI1_24, %xmm4
- mulsd %xmm3, %xmm4
- addsd LCPI1_25, %xmm4
- mulsd %xmm3, %xmm4
- addsd LCPI1_26, %xmm4
- mulsd %xmm3, %xmm4
- addsd LCPI1_27, %xmm4
- mulsd %xmm3, %xmm4
- addsd LCPI1_28, %xmm4
- mulsd %xmm3, %xmm4
- addsd %xmm1, %xmm4
- mulsd %xmm2, %xmm4
- movsd 152(%esp), %xmm1
- addsd %xmm4, %xmm1
- movsd %xmm1, 152(%esp)
- incl %eax
- cmpl %eax, %esi
- jge LBB1_15 # bb310
-LBB1_16: # bb358.loopexit
- movsd 152(%esp), %xmm0
- addsd %xmm0, %xmm0
- addsd LCPI1_22, %xmm0
- movsd %xmm0, 152(%esp)
-
-Rather than spilling the result of the last addsd in the loop, we should have
-inserted a copy to split the interval (one for the duration of the loop, one
-extending to the fall through). The register pressure in the loop isn't high
-enough to warrant the spill.
-
-Also check why xmm7 is not used at all in the function.
-
-//===---------------------------------------------------------------------===//
-
-Take the following:
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
-target triple = "i386-apple-darwin8"
-@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
-define fastcc void @abort_gzip() noreturn nounwind {
-entry:
- %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1]
- br i1 %tmp.b.i, label %bb.i, label %bb4.i
-bb.i: ; preds = %entry
- tail call void @exit( i32 1 ) noreturn nounwind
- unreachable
-bb4.i: ; preds = %entry
- store i1 true, i1* @in_exit.4870.b
- tail call void @exit( i32 1 ) noreturn nounwind
- unreachable
-}
-declare void @exit(i32) noreturn nounwind
-
-This compiles into:
-_abort_gzip: ## @abort_gzip
-## %bb.0: ## %entry
- subl $12, %esp
- movb _in_exit.4870.b, %al
- cmpb $1, %al
- jne LBB0_2
-
-We somehow miss folding the movb into the cmpb.
-
-//===---------------------------------------------------------------------===//
-
-We compile:
-
-int test(int x, int y) {
- return x-y-1;
-}
-
-into (-m64):
-
-_test:
- decl %edi
- movl %edi, %eax
- subl %esi, %eax
- ret
-
-it would be better to codegen as: x+~y (notl+addl)
-
-//===---------------------------------------------------------------------===//
-
-This code:
-
-int foo(const char *str,...)
-{
- __builtin_va_list a; int x;
- __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
- return x;
-}
-
-gets compiled into this on x86-64:
- subq $200, %rsp
- movaps %xmm7, 160(%rsp)
- movaps %xmm6, 144(%rsp)
- movaps %xmm5, 128(%rsp)
- movaps %xmm4, 112(%rsp)
- movaps %xmm3, 96(%rsp)
- movaps %xmm2, 80(%rsp)
- movaps %xmm1, 64(%rsp)
- movaps %xmm0, 48(%rsp)
- movq %r9, 40(%rsp)
- movq %r8, 32(%rsp)
- movq %rcx, 24(%rsp)
- movq %rdx, 16(%rsp)
- movq %rsi, 8(%rsp)
- leaq (%rsp), %rax
- movq %rax, 192(%rsp)
- leaq 208(%rsp), %rax
- movq %rax, 184(%rsp)
- movl $48, 180(%rsp)
- movl $8, 176(%rsp)
- movl 176(%rsp), %eax
- cmpl $47, %eax
- jbe .LBB1_3 # bb
-.LBB1_1: # bb3
- movq 184(%rsp), %rcx
- leaq 8(%rcx), %rax
- movq %rax, 184(%rsp)
-.LBB1_2: # bb4
- movl (%rcx), %eax
- addq $200, %rsp
- ret
-.LBB1_3: # bb
- movl %eax, %ecx
- addl $8, %eax
- addq 192(%rsp), %rcx
- movl %eax, 176(%rsp)
- jmp .LBB1_2 # bb4
-
-gcc 4.3 generates:
- subq $96, %rsp
-.LCFI0:
- leaq 104(%rsp), %rax
- movq %rsi, -80(%rsp)
- movl $8, -120(%rsp)
- movq %rax, -112(%rsp)
- leaq -88(%rsp), %rax
- movq %rax, -104(%rsp)
- movl $8, %eax
- cmpl $48, %eax
- jb .L6
- movq -112(%rsp), %rdx
- movl (%rdx), %eax
- addq $96, %rsp
- ret
- .p2align 4,,10
- .p2align 3
-.L6:
- mov %eax, %edx
- addq -104(%rsp), %rdx
- addl $8, %eax
- movl %eax, -120(%rsp)
- movl (%rdx), %eax
- addq $96, %rsp
- ret
-
-and it gets compiled into this on x86:
- pushl %ebp
- movl %esp, %ebp
- subl $4, %esp
- leal 12(%ebp), %eax
- movl %eax, -4(%ebp)
- leal 16(%ebp), %eax
- movl %eax, -4(%ebp)
- movl 12(%ebp), %eax
- addl $4, %esp
- popl %ebp
- ret
-
-gcc 4.3 generates:
- pushl %ebp
- movl %esp, %ebp
- movl 12(%ebp), %eax
- popl %ebp
- ret
-
-//===---------------------------------------------------------------------===//
-
-Teach tblgen not to check bitconvert source type in some cases. This allows us
-to consolidate the following patterns in X86InstrMMX.td:
-
-def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
- (iPTR 0))))),
- (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
-def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
- (iPTR 0))))),
- (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
-def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
- (iPTR 0))))),
- (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
-
-There are other cases in various td files.
-
-//===---------------------------------------------------------------------===//
-
-Take something like the following on x86-32:
-unsigned a(unsigned long long x, unsigned y) {return x % y;}
-
-We currently generate a libcall, but we really shouldn't: the expansion is
-shorter and likely faster than the libcall. The expected code is something
-like the following:
-
- movl 12(%ebp), %eax
- movl 16(%ebp), %ecx
- xorl %edx, %edx
- divl %ecx
- movl 8(%ebp), %eax
- divl %ecx
- movl %edx, %eax
- ret
-
-A similar code sequence works for division.
-
-//===---------------------------------------------------------------------===//
-
-We currently compile this:
-
-define i32 @func1(i32 %v1, i32 %v2) nounwind {
-entry:
- %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
- %sum = extractvalue {i32, i1} %t, 0
- %obit = extractvalue {i32, i1} %t, 1
- br i1 %obit, label %overflow, label %normal
-normal:
- ret i32 %sum
-overflow:
- call void @llvm.trap()
- unreachable
-}
-declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
-declare void @llvm.trap()
-
-to:
-
-_func1:
- movl 4(%esp), %eax
- addl 8(%esp), %eax
- jo LBB1_2 ## overflow
-LBB1_1: ## normal
- ret
-LBB1_2: ## overflow
- ud2
-
-it would be nice to produce "into" someday.
-
-//===---------------------------------------------------------------------===//
-
-Test instructions can be eliminated by using EFLAGS values from arithmetic
-instructions. This is currently not done for mul, and, or, xor, neg, shl,
-sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
-for read-modify-write instructions. It is also currently not done if the
-OF or CF flags are needed.
-
-The shift operators have the complication that when the shift count is
-zero, EFLAGS is not set, so they can only subsume a test instruction if
-the shift count is known to be non-zero. Also, using the EFLAGS value
-from a shift is apparently very slow on some x86 implementations.
-
-In read-modify-write instructions, the root node in the isel match is
-the store, and isel has no way for the use of the EFLAGS result of the
-arithmetic to be remapped to the new node.
-
-Add and subtract instructions set OF on signed overflow and CF on unsigned
-overflow, while test instructions always clear OF and CF. In order to
-replace a test with an add or subtract in a situation where OF or CF is
-needed, codegen must be able to prove that the operation cannot see
-signed or unsigned overflow, respectively.
-
-//===---------------------------------------------------------------------===//
-
-memcpy/memmove do not lower to SSE copies when possible. A silly example is:
-define <16 x float> @foo(<16 x float> %A) nounwind {
- %tmp = alloca <16 x float>, align 16
- %tmp2 = alloca <16 x float>, align 16
- store <16 x float> %A, <16 x float>* %tmp
- %s = bitcast <16 x float>* %tmp to i8*
- %s2 = bitcast <16 x float>* %tmp2 to i8*
- call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
- %R = load <16 x float>* %tmp2
- ret <16 x float> %R
-}
-
-declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
-
-which compiles to:
-
-_foo:
- subl $140, %esp
- movaps %xmm3, 112(%esp)
- movaps %xmm2, 96(%esp)
- movaps %xmm1, 80(%esp)
- movaps %xmm0, 64(%esp)
- movl 60(%esp), %eax
- movl %eax, 124(%esp)
- movl 56(%esp), %eax
- movl %eax, 120(%esp)
- movl 52(%esp), %eax
- <many many more 32-bit copies>
- movaps (%esp), %xmm0
- movaps 16(%esp), %xmm1
- movaps 32(%esp), %xmm2
- movaps 48(%esp), %xmm3
- addl $140, %esp
- ret
-
-On Nehalem, it may even be cheaper to just use movups when unaligned than to
-fall back to lower-granularity chunks.
-
-//===---------------------------------------------------------------------===//
-
-Implement processor-specific optimizations for parity with GCC on these
-processors. GCC does two optimizations:
-
-1. ix86_pad_returns inserts a noop before ret instructions that are immediately
-   preceded by a conditional branch or are the target of a jump.
-2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
- code contains more than 3 branches.
-
-The first one is done for all AMDs, Core2, and "Generic"
-The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
- Core 2, and "Generic"
-
-//===---------------------------------------------------------------------===//
-Testcase:
-int x(int a) { return (a&0xf0)>>4; }
-
-Current output:
- movl 4(%esp), %eax
- shrl $4, %eax
- andl $15, %eax
- ret
-
-Ideal output:
- movzbl 4(%esp), %eax
- shrl $4, %eax
- ret
-
-//===---------------------------------------------------------------------===//
-
-Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
-properly.
-
-When the return value is not used (i.e. we only care about the value in
-memory), x86 does not have to use add to implement these. Instead, it can use
-add, sub, inc, dec instructions with the "lock" prefix.
-
-This is currently implemented using a bit of an instruction selection trick. The
-issue is that the target-independent pattern produces one output and a chain, and
-we want to map it into one that just outputs a chain. The current trick is to select
-it into a MERGE_VALUES with the first definition being an implicit_def. The
-proper solution is to add new ISD opcodes for the no-output variant. DAG
-combiner can then transform the node before it gets to target node selection.
-
-Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
-fact these instructions are identical to the non-lock versions. We need a way to
-add target specific information to target nodes and have this information
-carried over to machine instructions. Asm printer (or JIT) can use this
-information to add the "lock" prefix.
-
-//===---------------------------------------------------------------------===//
-
-struct B {
- unsigned char y0 : 1;
-};
-
-int bar(struct B* a) { return a->y0; }
-
-define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
- %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
- %2 = load i8* %1, align 1
- %3 = and i8 %2, 1
- %4 = zext i8 %3 to i32
- ret i32 %4
-}
-
-bar: # @bar
-# %bb.0:
- movb (%rdi), %al
- andb $1, %al
- movzbl %al, %eax
- ret
-
-Missed optimization: should be movl+andl.
-
-//===---------------------------------------------------------------------===//
-
-The x86-64 ABI says:
-
-Booleans, when stored in a memory object, are stored as single byte objects the
-value of which is always 0 (false) or 1 (true).
-
-We are not using this fact:
-
-int bar(_Bool *a) { return *a; }
-
-define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
- %1 = load i8* %a, align 1, !tbaa !0
- %tmp = and i8 %1, 1
- %2 = zext i8 %tmp to i32
- ret i32 %2
-}
-
-bar:
- movb (%rdi), %al
- andb $1, %al
- movzbl %al, %eax
- ret
-
-GCC produces
-
-bar:
- movzbl (%rdi), %eax
- ret
-
-//===---------------------------------------------------------------------===//
-
-Take the following C code:
-int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
-
-We generate the following IR with clang:
-define i32 @f(i32 %a, i32 %b) nounwind readnone {
-entry:
- %tmp = xor i32 %b, %a ; <i32> [#uses=1]
- %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1]
- %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1]
- %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
- ret i32 %conv5
-}
-
-And the following x86 code:
- xorl %esi, %edi
- testb $-1, %dil
- sete %al
- movzbl %al, %eax
- ret
-
-A cmpb instead of the xorl+testb would be one instruction shorter.
-
-//===---------------------------------------------------------------------===//
-
-Given the following C code:
-int f(int a, int b) { return (signed char)a == (signed char)b; }
-
-We generate the following IR with clang:
-define i32 @f(i32 %a, i32 %b) nounwind readnone {
-entry:
- %sext = shl i32 %a, 24 ; <i32> [#uses=1]
- %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1]
- %sext6 = shl i32 %b, 24 ; <i32> [#uses=1]
- %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1]
- %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1]
- %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
- ret i32 %conv5
-}
-
-And the following x86 code:
- movsbl %sil, %eax
- movsbl %dil, %ecx
- cmpl %eax, %ecx
- sete %al
- movzbl %al, %eax
- ret
-
-
-It should be possible to eliminate the sign extensions.
-
-//===---------------------------------------------------------------------===//
-
-LLVM misses a load+store narrowing opportunity in this code:
-
-%struct.bf = type { i64, i16, i16, i32 }
-
-@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2]
-
-define void @t1() nounwind ssp {
-entry:
- %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
- %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
- %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2]
- %3 = load i32* %2, align 1 ; <i32> [#uses=1]
- %4 = and i32 %3, -65537 ; <i32> [#uses=1]
- store i32 %4, i32* %2, align 1
- %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
- %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
- %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2]
- %8 = load i32* %7, align 1 ; <i32> [#uses=1]
- %9 = and i32 %8, -131073 ; <i32> [#uses=1]
- store i32 %9, i32* %7, align 1
- ret void
-}
-
-LLVM currently emits this:
-
- movq bfi(%rip), %rax
- andl $-65537, 8(%rax)
- movq bfi(%rip), %rax
- andl $-131073, 8(%rax)
- ret
-
-It could narrow the loads and stores to emit this:
-
- movq bfi(%rip), %rax
- andb $-2, 10(%rax)
- movq bfi(%rip), %rax
- andb $-3, 10(%rax)
- ret
-
-The trouble is that there is a TokenFactor between the store and the
-load, making it non-trivial to determine if there's anything between
-the load and the store which would prohibit narrowing.
-
-//===---------------------------------------------------------------------===//
-
-This code:
-void foo(unsigned x) {
- if (x == 0) bar();
- else if (x == 1) qux();
-}
-
-currently compiles into:
-_foo:
- movl 4(%esp), %eax
- cmpl $1, %eax
- je LBB0_3
- testl %eax, %eax
- jne LBB0_4
-
-the testl could be removed:
-_foo:
- movl 4(%esp), %eax
- cmpl $1, %eax
- je LBB0_3
- jb LBB0_4
-
-0 is the only unsigned number < 1.
-
-//===---------------------------------------------------------------------===//
-
-This code:
-
-%0 = type { i32, i1 }
-
-define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
-entry:
- %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
- %cmp = extractvalue %0 %uadd, 1
- %inc = zext i1 %cmp to i32
- %add = add i32 %x, %sum
- %z.0 = add i32 %add, %inc
- ret i32 %z.0
-}
-
-declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
-
-compiles to:
-
-_add32carry: ## @add32carry
- addl %esi, %edi
- sbbl %ecx, %ecx
- movl %edi, %eax
- subl %ecx, %eax
- ret
-
-But it could be:
-
-_add32carry:
- leal (%rsi,%rdi), %eax
- cmpl %esi, %eax
- adcl $0, %eax
- ret
-
-//===---------------------------------------------------------------------===//
-
-The hot loop of 256.bzip2 contains code that looks a bit like this:
-
-int foo(char *P, char *Q, int x, int y) {
- if (P[0] != Q[0])
- return P[0] < Q[0];
- if (P[1] != Q[1])
- return P[1] < Q[1];
- if (P[2] != Q[2])
- return P[2] < Q[2];
- return P[3] < Q[3];
-}
-
-In the real code, we get a lot more wrong than this. However, even in this
-code we generate:
-
-_foo: ## @foo
-## %bb.0: ## %entry
- movb (%rsi), %al
- movb (%rdi), %cl
- cmpb %al, %cl
- je LBB0_2
-LBB0_1: ## %if.then
- cmpb %al, %cl
- jmp LBB0_5
-LBB0_2: ## %if.end
- movb 1(%rsi), %al
- movb 1(%rdi), %cl
- cmpb %al, %cl
- jne LBB0_1
-## %bb.3: ## %if.end38
- movb 2(%rsi), %al
- movb 2(%rdi), %cl
- cmpb %al, %cl
- jne LBB0_1
-## %bb.4: ## %if.end60
- movb 3(%rdi), %al
- cmpb 3(%rsi), %al
-LBB0_5: ## %if.end60
- setl %al
- movzbl %al, %eax
- ret
-
-Note that we generate jumps to LBB0_1 which does a redundant compare. The
-redundant compare also forces the register values to be live, which prevents
-folding one of the loads into the compare. In contrast, GCC 4.2 produces:
-
-_foo:
- movzbl (%rsi), %eax
- cmpb %al, (%rdi)
- jne L10
-L12:
- movzbl 1(%rsi), %eax
- cmpb %al, 1(%rdi)
- jne L10
- movzbl 2(%rsi), %eax
- cmpb %al, 2(%rdi)
- jne L10
- movzbl 3(%rdi), %eax
- cmpb 3(%rsi), %al
-L10:
- setl %al
- movzbl %al, %eax
- ret
-
-which is "perfect".
-
-//===---------------------------------------------------------------------===//
-
-For the branch in the following code:
-int a();
-int b(int x, int y) {
- if (x & (1<<(y&7)))
- return a();
- return y;
-}
-
-We currently generate:
- movb %sil, %al
- andb $7, %al
- movzbl %al, %eax
- btl %eax, %edi
- jae .LBB0_2
-
-movl+andl would be shorter than the movb+andb+movzbl sequence.
-
-//===---------------------------------------------------------------------===//
-
-For the following:
-struct u1 {
- float x, y;
-};
-float foo(struct u1 u) {
- return u.x + u.y;
-}
-
-We currently generate:
- movdqa %xmm0, %xmm1
- pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0]
- addss %xmm1, %xmm0
- ret
-
-We could save an instruction here by commuting the addss.
-
-//===---------------------------------------------------------------------===//
-
-This (from PR9661):
-
-float clamp_float(float a) {
- if (a > 1.0f)
- return 1.0f;
- else if (a < 0.0f)
- return 0.0f;
- else
- return a;
-}
-
-Could compile to:
-
-clamp_float: # @clamp_float
- movss .LCPI0_0(%rip), %xmm1
- minss %xmm1, %xmm0
- pxor %xmm1, %xmm1
- maxss %xmm1, %xmm0
- ret
-
-with -ffast-math.
-
-//===---------------------------------------------------------------------===//
-
-This function (from PR9803):
-
-int clamp2(int a) {
- if (a > 5)
- a = 5;
- if (a < 0)
- return 0;
- return a;
-}
-
-Compiles to:
-
-_clamp2: ## @clamp2
- pushq %rbp
- movq %rsp, %rbp
- cmpl $5, %edi
- movl $5, %ecx
- cmovlel %edi, %ecx
- testl %ecx, %ecx
- movl $0, %eax
- cmovnsl %ecx, %eax
- popq %rbp
- ret
-
-The move of 0 could be scheduled above the test so that it can be materialized
-as xor reg,reg.
-
-//===---------------------------------------------------------------------===//
-
-GCC PR48986. We currently compile this:
-
-void bar(void);
-void yyy(int* p) {
- if (__sync_fetch_and_add(p, -1) == 1)
- bar();
-}
-
-into:
- movl $-1, %eax
- lock
- xaddl %eax, (%rdi)
- cmpl $1, %eax
- je LBB0_2
-
-Instead we could generate:
-
- lock
-	decl (%rdi)
- je LBB0_2
-
-The trick is to match "fetch_and_add(X, -C) == C".
-
-//===---------------------------------------------------------------------===//
-
-unsigned t(unsigned a, unsigned b) {
- return a <= b ? 5 : -5;
-}
-
-We generate:
- movl $5, %ecx
- cmpl %esi, %edi
- movl $-5, %eax
- cmovbel %ecx, %eax
-
-GCC:
- cmpl %edi, %esi
- sbbl %eax, %eax
- andl $-10, %eax
- addl $5, %eax
-
-//===---------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend.
+//===---------------------------------------------------------------------===//
+
+Improvements to the multiply -> shift/add algorithm:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
+
+//===---------------------------------------------------------------------===//
+
+Improve code like this (occurs fairly frequently, e.g. in LLVM):
+long long foo(int x) { return 1LL << x; }
+
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
+
+Another useful one would be ~0ULL >> X and ~0ULL << X.
+
+One better solution for 1LL << x is:
+ xorl %eax, %eax
+ xorl %edx, %edx
+ testb $32, %cl
+ sete %al
+ setne %dl
+ sall %cl, %eax
+ sall %cl, %edx
+
+But that requires good 8-bit subreg support.
+
+Also, this might be better. It's an extra shift, but it's one instruction
+shorter, and doesn't stress 8-bit subreg support.
+(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
+but without the unnecessary and.)
+ movl %ecx, %eax
+ shrl $5, %eax
+ movl %eax, %edx
+ xorl $1, %edx
+ sall %cl, %eax
+	sall %cl, %edx
+
+64-bit shifts (in general) expand to really bad code. Instead of using
+cmovs, we should expand to a conditional branch like GCC produces.
+
+//===---------------------------------------------------------------------===//
+
+Some isel ideas:
+
+1. Dynamic programming based approach when compile time is not an
+ issue.
+2. Code duplication (addressing mode) during isel.
+3. Other ideas from "Register-Sensitive Selection, Duplication, and
+ Sequencing of Instructions".
+4. Scheduling for reduced register pressure. E.g. "Minimum Register
+ Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
+ and other related papers.
+ http://citeseer.ist.psu.edu/govindarajan01minimum.html
+
+//===---------------------------------------------------------------------===//
+
+Should we promote i16 to i32 to avoid partial register update stalls?
+
+//===---------------------------------------------------------------------===//
+
+Leave any_extend as pseudo instruction and hint to register
+allocator. Delay codegen until post register allocation.
+Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
+the coalescer how to deal with it though.
+
+//===---------------------------------------------------------------------===//
+
+It appears icc uses push for parameter passing. Need to investigate.
+
+//===---------------------------------------------------------------------===//
+
+The instruction selector sometimes misses folding a load into a compare. The
+pattern is written as (cmp reg, (load p)). Because the compare isn't
+commutative, it is not matched with the load on both sides. The dag combiner
+should be made smart enough to canonicalize the load into the RHS of a compare
+when it can invert the result of the compare for free.
+
+//===---------------------------------------------------------------------===//
+
+In many cases, LLVM generates code like this:
+
+_test:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ setl %al
+ movzbl %al, %eax
+ ret
+
+on some processors (which ones?), it is more efficient to do this:
+
+_test:
+ movl 8(%esp), %ebx
+ xor %eax, %eax
+ cmpl %ebx, 4(%esp)
+ setl %al
+ ret
+
+Doing this correctly is tricky though, as the xor clobbers the flags.
+
+//===---------------------------------------------------------------------===//
+
+We should generate bts/btr/etc instructions on targets where they are cheap or
+when code size is important, e.g., for:
+
+void setbit(int *target, int bit) {
+ *target |= (1 << bit);
+}
+void clearbit(int *target, int bit) {
+ *target &= ~(1 << bit);
+}
+
+//===---------------------------------------------------------------------===//
+
+Instead of the following for memset(char*, 1, 10):
+
+ movl $16843009, 4(%edx)
+ movl $16843009, (%edx)
+ movw $257, 8(%edx)
+
+It might be better to generate
+
+ movl $16843009, %eax
+ movl %eax, 4(%edx)
+ movl %eax, (%edx)
+	movw %ax, 8(%edx)
+
+when we can spare a register. It reduces code size.
+
+//===---------------------------------------------------------------------===//
+
+Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
+get this:
+
+define i32 @test1(i32 %X) {
+ %Y = sdiv i32 %X, 8
+ ret i32 %Y
+}
+
+_test1:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ sarl $31, %ecx
+ shrl $29, %ecx
+ addl %ecx, %eax
+ sarl $3, %eax
+ ret
+
+GCC knows several different ways to codegen it, one of which is this:
+
+_test1:
+ movl 4(%esp), %eax
+ cmpl $-1, %eax
+ leal 7(%eax), %ecx
+ cmovle %ecx, %eax
+ sarl $3, %eax
+ ret
+
+which is probably slower, but it's interesting at least :)
+
+//===---------------------------------------------------------------------===//
+
+We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
+We should leave these as libcalls for everything over a much lower threshold,
+since libc is hand tuned for medium and large mem ops (avoiding RFO for large
+stores, TLB preheating, etc).
+
+//===---------------------------------------------------------------------===//
+
+Optimize this into something reasonable:
+ x * copysign(1.0, y) * copysign(1.0, z)
+
+//===---------------------------------------------------------------------===//
+
+Optimize copysign(x, *y) to use an integer load from y.
+
+//===---------------------------------------------------------------------===//
+
+The following tests perform worse with LSR:
+
+lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
+
+//===---------------------------------------------------------------------===//
+
+Adding to the list of cmp / test poor codegen issues:
+
+int test(__m128 *A, __m128 *B) {
+ if (_mm_comige_ss(*A, *B))
+ return 3;
+ else
+ return 4;
+}
+
+_test:
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ movl 4(%esp), %eax
+ movaps (%eax), %xmm1
+ comiss %xmm0, %xmm1
+ setae %al
+ movzbl %al, %ecx
+ movl $3, %eax
+ movl $4, %edx
+ cmpl $0, %ecx
+ cmove %edx, %eax
+ ret
+
+Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
+are a number of issues. 1) We are introducing a setcc between the result of the
+intrinsic call and the select. 2) The intrinsic is expected to produce an i32
+value, so an any_extend (which becomes a zero extend) is added.
+
+We probably need some kind of target DAG combine hook to fix this.
+
+//===---------------------------------------------------------------------===//
+
+We generate significantly worse code for this than GCC:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
+http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
+
+There is also one case we do worse on PPC.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+int test(int a)
+{
+ return a * 3;
+}
+
+We currently emit
+ imull $3, 4(%esp), %eax
+
+Perhaps this is what we really should generate instead. Is imull three or four
+cycles? Note: ICC generates this:
+ movl 4(%esp), %eax
+ leal (%eax,%eax,2), %eax
+
+The current instruction priority is based on pattern complexity. The former is
+more "complex" because it folds a load so the latter will not be emitted.
+
+Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
+should always try to match LEA first since the LEA matching code does some
+estimate to determine whether the match is profitable.
+
+However, if we care more about code size, then imull is better. It's two bytes
+shorter than movl + leal.
+
+On a Pentium M, both variants have the same characteristics with regard
+to throughput; however, the multiplication has a latency of four cycles, as
+opposed to two cycles for the movl+lea variant.
+
+//===---------------------------------------------------------------------===//
+
+It appears gcc places string data with linkonce linkage in
+.section __TEXT,__const_coal,coalesced instead of
+.section __DATA,__const_coal,coalesced.
+Take a look at darwin.h; there are other Darwin assembler directives that we
+do not make use of.
+
+//===---------------------------------------------------------------------===//
+
+define i32 @foo(i32* %a, i32 %t) {
+entry:
+ br label %cond_true
+
+cond_true: ; preds = %cond_true, %entry
+ %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3]
+ %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1]
+ %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1]
+ %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
+ %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1]
+ %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2]
+ %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2]
+ %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1]
+ br i1 %tmp, label %bb12, label %cond_true
+
+bb12: ; preds = %cond_true
+ ret i32 %tmp7
+}
+is pessimized by -loop-reduce and -indvars
+
+//===---------------------------------------------------------------------===//
+
+u32 to float conversion improvement:
+
+float uint32_2_float( unsigned u ) {
+ float fl = (int) (u & 0xffff);
+ float fh = (int) (u >> 16);
+ fh *= 0x1.0p16f;
+ return fh + fl;
+}
+
+00000000 subl $0x04,%esp
+00000003 movl 0x08(%esp,1),%eax
+00000007 movl %eax,%ecx
+00000009 shrl $0x10,%ecx
+0000000c cvtsi2ss %ecx,%xmm0
+00000010 andl $0x0000ffff,%eax
+00000015 cvtsi2ss %eax,%xmm1
+00000019 mulss 0x00000078,%xmm0
+00000021 addss %xmm1,%xmm0
+00000025 movss %xmm0,(%esp,1)
+0000002a flds (%esp,1)
+0000002d addl $0x04,%esp
+00000030 ret
+
+//===---------------------------------------------------------------------===//
+
+When using the fastcc ABI, align the stack slot of an argument of type double
+on an 8-byte boundary to improve performance.
+
+//===---------------------------------------------------------------------===//
+
+GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
+simplifications for integer "x cmp y ? a : b".
+
+//===---------------------------------------------------------------------===//
+
+Consider the expansion of:
+
+define i32 @test3(i32 %X) {
+ %tmp1 = urem i32 %X, 255
+ ret i32 %tmp1
+}
+
+Currently it compiles to:
+
+...
+ movl $2155905153, %ecx
+ movl 8(%esp), %esi
+ movl %esi, %eax
+ mull %ecx
+...
+
+This could be "reassociated" into:
+
+ movl $2155905153, %eax
+ movl 8(%esp), %ecx
+ mull %ecx
+
+to avoid the copy. In fact, the existing two-address stuff would do this
+except that mul isn't a commutative 2-addr instruction. I guess this has
+to be done at isel time based on the #uses of the mul?
+
+//===---------------------------------------------------------------------===//
+
+Make sure the instruction which starts a loop does not cross a cacheline
+boundary. This requires knowing the exact length of each machine instruction.
+That is somewhat complicated, but doable. Example 256.bzip2:
+
+In the new trace, the hot loop has an instruction which crosses a cacheline
+boundary. In addition to potential cache misses, this can't help decoding as I
+imagine there has to be some kind of complicated decoder reset and realignment
+to grab the bytes from the next cacheline.
+
+532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
+942 942 0x3d03 movl %dh, (1809(%esp, %esi)
+937 937 0x3d0a incl %esi
+3 3 0x3d0b cmpb %bl, %dl
+27 27 0x3d0d jnz 0x000062db <main+11707>
+
+//===---------------------------------------------------------------------===//
+
+In C99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
+
+//===---------------------------------------------------------------------===//
+
+This could be a single 16-bit load.
+
+int f(char *p) {
+ if ((p[0] == 1) & (p[1] == 2)) return 1;
+ return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+We should inline lrintf and probably other libc functions.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void test(int X) {
+ if (X) abort();
+}
+
+is currently compiled to:
+
+_test:
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne LBB1_1
+ addl $12, %esp
+ ret
+LBB1_1:
+ call L_abort$stub
+
+It would be better to produce:
+
+_test:
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne L_abort$stub
+ addl $12, %esp
+ ret
+
+This can be applied to any no-return function call that takes no arguments etc.
+Alternatively, the stack save/restore logic could be shrink-wrapped, producing
+something like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
+ ret
+LBB1_1:
+ subl $12, %esp
+ call L_abort$stub
+
+Both are useful in different situations. Finally, it could be shrink-wrapped
+and tail called, like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
+ ret
+LBB1_1:
+ pop %eax # realign stack.
+ call L_abort$stub
+
+Though this probably isn't worth it.
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction. Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+ movl $7, %eax
+ movsbl 4(%esp), %ecx
+ subl %ecx, %eax
+ ret
+
+We would use one fewer register if codegen'd as:
+
+ movsbl 4(%esp), %eax
+ neg %eax
+ add $7, %eax
+ ret
+
+Note that this isn't beneficial if the load can be folded into the sub. In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+ movl $7, %eax
+ subl 4(%esp), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Leaf functions that require one 4-byte spill slot have a prolog like this:
+
+_foo:
+ pushl %esi
+ subl $4, %esp
+...
+and an epilog like this:
+ addl $4, %esp
+ popl %esi
+ ret
+
+It would be smaller, and potentially faster, to push eax on entry and to
+pop into a dummy register instead of using addl/subl of esp. Just don't pop
+into any return registers :)
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should fold (branch (or (setcc, setcc))) into multiple
+branches. We generate really poor code for:
+
+double testf(double a) {
+ return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
+}
+
+For example, the entry BB is:
+
+_testf:
+ subl $20, %esp
+ pxor %xmm0, %xmm0
+ movsd 24(%esp), %xmm1
+ ucomisd %xmm0, %xmm1
+ setnp %al
+ sete %cl
+ testb %cl, %al
+ jne LBB1_5 # UnifiedReturnBlock
+LBB1_1: # cond_true
+
+
+it would be better to replace the last four instructions with:
+
+ jp LBB1_1
+ je LBB1_5
+LBB1_1:
+
+We also codegen the inner ?: into a diamond:
+
+ cvtss2sd LCPI1_0(%rip), %xmm2
+ cvtss2sd LCPI1_1(%rip), %xmm3
+ ucomisd %xmm1, %xmm0
+ ja LBB1_3 # cond_true
+LBB1_2: # cond_true
+ movapd %xmm3, %xmm2
+LBB1_3: # cond_true
+ movapd %xmm2, %xmm0
+ ret
+
+We should sink the load into xmm3 into the LBB1_2 block. This should
+be pretty easy, and will nuke all the copies.
+
+//===---------------------------------------------------------------------===//
+
+This:
+ #include <algorithm>
+ inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+ { return std::make_pair(a + b, a + b < a); }
+ bool no_overflow(unsigned a, unsigned b)
+ { return !full_add(a, b).second; }
+
+Should compile to:
+ addl %esi, %edi
+ setae %al
+ movzbl %al, %eax
+ ret
+
+on x86-64, instead of the rather stupid-looking:
+ addl %esi, %edi
+ setb %al
+ xorb $1, %al
+ movzbl %al, %eax
+ ret
+
+
+//===---------------------------------------------------------------------===//
+
+The following code:
+
+bb114.preheader: ; preds = %cond_next94
+ %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
+ %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
+ %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
+ %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
+ %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
+ %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
+ %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
+ %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
+ %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
+ %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
+ %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
+ br label %bb114
+
+produces:
+
+LBB3_5: # bb114.preheader
+ movswl -68(%ebp), %eax
+ movl $32, %ecx
+ movl %ecx, -80(%ebp)
+ subl %eax, -80(%ebp)
+ movswl -52(%ebp), %eax
+ movl %ecx, -84(%ebp)
+ subl %eax, -84(%ebp)
+ movswl -70(%ebp), %eax
+ movl %ecx, -88(%ebp)
+ subl %eax, -88(%ebp)
+ movswl -50(%ebp), %eax
+ subl %eax, %ecx
+ movl %ecx, -76(%ebp)
+ movswl -42(%ebp), %eax
+ movl %eax, -92(%ebp)
+ movswl -66(%ebp), %eax
+ movl %eax, -96(%ebp)
+ movw $0, -98(%ebp)
+
+This appears to be bad because the RA is not folding the store to the stack
+slot into the movl. The above instructions could be:
+ movl $32, -80(%ebp)
+...
+ movl $32, -84(%ebp)
+...
+This seems like a cross between remat and spill folding.
+
+This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
+change, so we could simply subtract %eax from %ecx first and then use %ecx (or
+vice-versa).
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+ %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1]
+ br i1 %tmp659, label %cond_true662, label %cond_next715
+
+produces this:
+
+ testw %cx, %cx
+ movswl %cx, %esi
+ jns LBB4_109 # cond_next715
+
+Shark tells us that using %cx in the testw instruction is sub-optimal. It
+suggests using the 32-bit register (which is what ICC uses).
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+void compare (long long foo) {
+ if (foo < 4294967297LL)
+ abort();
+}
+
+to:
+
+compare:
+ subl $4, %esp
+ cmpl $0, 8(%esp)
+ setne %al
+ movzbw %al, %ax
+ cmpl $1, 12(%esp)
+ setg %cl
+ movzbw %cl, %cx
+ cmove %ax, %cx
+ testb $1, %cl
+ jne .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+ call abort
+.LBB1_2: # UnifiedReturnBlock
+ addl $4, %esp
+ ret
+
+(also really horrible code on ppc). This is due to the expand code for 64-bit
+compares. GCC produces multiple branches, which is much nicer:
+
+compare:
+ subl $12, %esp
+ movl 20(%esp), %edx
+ movl 16(%esp), %eax
+ decl %edx
+ jle .L7
+.L5:
+ addl $12, %esp
+ ret
+ .p2align 4,,7
+.L7:
+ jl .L4
+ cmpl $0, %eax
+ .p2align 4,,8
+ ja .L5
+.L4:
+ .p2align 4,,9
+ call abort
+
+//===---------------------------------------------------------------------===//
+
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments on the top of the stack (their normal place for
+non-tail call optimized calls) that source from the caller's arguments
+or that source from a virtual register (also possibly sourcing from the
+caller's arguments).
+This is done to prevent overwriting of parameters (see example
+below) that might be used later.
+
+example:
+
+int callee(int32, int64);
+int caller(int32 arg1, int32 arg2) {
+ int64 local = arg2 * 2;
+ return callee(arg2, (int64)local);
+}
+
+[arg1] [!arg2 no longer valid since we moved local onto it]
+[arg2] -> [(int64)
+[RETADDR] local ]
+
+Moving arg1 onto the stack slot of the callee function would overwrite
+arg2 of the caller.
+
+Possible optimizations:
+
+
+ - Analyse the actual parameters of the callee to see which would
+ overwrite a caller parameter which is used by the callee and only
+ push them onto the top of the stack.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg1,arg2);
+ }
+
+ Here we don't need to write any variables to the top of the stack
+ since they don't overwrite each other.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg2,arg1);
+ }
+
+ Here we need to push the arguments because they overwrite each
+ other.
+
+//===---------------------------------------------------------------------===//
+
+main ()
+{
+ int i = 0;
+ unsigned long int z = 0;
+
+ do {
+ z -= 0x00004000;
+ i++;
+ if (i > 0x00040000)
+ abort ();
+ } while (z > 0);
+ exit (0);
+}
+
+gcc compiles this to:
+
+_main:
+ subl $28, %esp
+ xorl %eax, %eax
+ jmp L2
+L3:
+ cmpl $262144, %eax
+ je L10
+L2:
+ addl $1, %eax
+ cmpl $262145, %eax
+ jne L3
+ call L_abort$stub
+L10:
+ movl $0, (%esp)
+ call L_exit$stub
+
+llvm:
+
+_main:
+ subl $12, %esp
+ movl $1, %eax
+ movl $16384, %ecx
+LBB1_1: # bb
+ cmpl $262145, %eax
+ jge LBB1_4 # cond_true
+LBB1_2: # cond_next
+ incl %eax
+ addl $4294950912, %ecx
+ cmpl $16384, %ecx
+ jne LBB1_1 # bb
+LBB1_3: # bb11
+ xorl %eax, %eax
+ addl $12, %esp
+ ret
+LBB1_4: # cond_true
+ call L_abort$stub
+
+1. LSR should rewrite the first cmp with induction variable %ecx.
+2. DAG combiner should fold
+ leal 1(%eax), %edx
+ cmpl $262145, %edx
+ =>
+ cmpl $262144, %eax
+
+//===---------------------------------------------------------------------===//
+
+define i64 @test(double %X) {
+ %Y = fptosi double %X to i64
+ ret i64 %Y
+}
+
+compiles to:
+
+_test:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl 4(%esp), %edx
+ movl (%esp), %eax
+ addl $20, %esp
+ #FP_REG_KILL
+ ret
+
+This should just fldl directly from the input stack slot.
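+
+Something along these lines, for instance (a sketch; the stack offsets assume
+the smaller 12-byte frame):
+
+_test:
+ subl $12, %esp
+ fldl 16(%esp)
+ fisttpll (%esp)
+ movl 4(%esp), %edx
+ movl (%esp), %eax
+ addl $12, %esp
+ ret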
+
+//===---------------------------------------------------------------------===//
+
+This code:
+int foo (int x) { return (x & 65535) | 255; }
+
+Should compile into:
+
+_foo:
+ movzwl 4(%esp), %eax
+ orl $255, %eax
+ ret
+
+instead of:
+_foo:
+ movl $65280, %eax
+ andl 4(%esp), %eax
+ orl $255, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We're codegen'ing multiply of long longs inefficiently:
+
+unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
+ return arg1 * arg2;
+}
+
+We compile to (-fomit-frame-pointer):
+
+_LLM:
+ pushl %esi
+ movl 8(%esp), %ecx
+ movl 16(%esp), %esi
+ movl %esi, %eax
+ mull %ecx
+ imull 12(%esp), %esi
+ addl %edx, %esi
+ imull 20(%esp), %ecx
+ movl %esi, %edx
+ addl %ecx, %edx
+ popl %esi
+ ret
+
+This looks like a scheduling deficiency and lack of remat of the load from
+the argument area. ICC apparently produces:
+
+ movl 8(%esp), %ecx
+ imull 12(%esp), %ecx
+ movl 16(%esp), %eax
+ imull 4(%esp), %eax
+ addl %eax, %ecx
+ movl 4(%esp), %eax
+ mull 12(%esp)
+ addl %ecx, %edx
+ ret
+
+Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
+
+//===---------------------------------------------------------------------===//
+
+We can fold a store into "zeroing a reg". Instead of:
+
+xorl %eax, %eax
+movl %eax, 124(%esp)
+
+we should get:
+
+movl $0, 124(%esp)
+
+if the flags of the xor are dead.
+
+Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
+be folded into: shl [mem], 1
+
+//===---------------------------------------------------------------------===//
+
+In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
+or and instruction, for example:
+
+ xorpd LCPI1_0, %xmm2
+
+However, if xmm2 gets spilled, we end up with really ugly code like this:
+
+ movsd (%esp), %xmm0
+ xorpd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+
+Since we 'know' that this is a 'neg', we can actually "fold" the spill into
+the neg/abs instruction, turning it into an *integer* operation, like this:
+
+ xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31)
+
+you could also use xorb, but xorl is less likely to lead to a partial register
+stall. Here is a contrived testcase:
+
+double a, b, c;
+void test(double *P) {
+ double X = *P;
+ a = X;
+ bar();
+ X = -X;
+ b = X;
+ bar();
+ c = X;
+}
+
+//===---------------------------------------------------------------------===//
+
+The code generated on x86 for checking for signed overflow of a multiply,
+done the obvious way, is much longer than it needs to be.
+
+int x(int a, int b) {
+ long long prod = (long long)a*b;
+ return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
+}
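+
+On x86-64, for example, a 32-bit imul already reports this condition in OF, so
+something like the following should suffice (a sketch; a in %edi, b in %esi per
+the SysV ABI):
+
+x:
+ movl %edi, %eax
+ imull %esi, %eax # OF is set iff the signed result does not fit in 32 bits
+ seto %al
+ movzbl %al, %eax
+ ret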
+
+See PR2053 for more details.
+
+//===---------------------------------------------------------------------===//
+
+We should investigate using cdq/cltd (effect: edx = sar eax, 31)
+more aggressively; it should cost the same as a move+shift on any modern
+processor, but it's a lot shorter. Downside is that it puts more
+pressure on register allocation because it has fixed operands.
+
+Example:
+int abs(int x) {return x < 0 ? -x : x;}
+
+gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
+abs:
+ movl 4(%esp), %eax
+ cltd
+ xorl %edx, %eax
+ subl %edx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Take the following code (from
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
+
+extern unsigned char first_one[65536];
+int FirstOnet(unsigned long long arg1)
+{
+ if (arg1 >> 48)
+ return (first_one[arg1 >> 48]);
+ return 0;
+}
+
+
+The following code is currently generated:
+FirstOnet:
+ movl 8(%esp), %eax
+ cmpl $65536, %eax
+ movl 4(%esp), %ecx
+ jb .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+ shrl $16, %eax
+ movzbl first_one(%eax), %eax
+ ret
+.LBB1_2: # UnifiedReturnBlock
+ xorl %eax, %eax
+ ret
+
+We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
+lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
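+
+With those two changes the function might look like this (a sketch):
+
+FirstOnet:
+ movzwl 10(%esp), %eax
+ testl %eax, %eax
+ je .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+ movzbl first_one(%eax), %eax
+ ret
+.LBB1_2: # UnifiedReturnBlock
+ xorl %eax, %eax
+ ret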
+
+//===---------------------------------------------------------------------===//
+
+We compile this function:
+
+define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
+entry:
+ %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1]
+ br i1 %tmp2, label %bb7, label %bb
+
+bb: ; preds = %entry
+ %tmp6 = add i32 %b, %a ; <i32> [#uses=1]
+ ret i32 %tmp6
+
+bb7: ; preds = %entry
+ %tmp10 = sub i32 %a, %c ; <i32> [#uses=1]
+ ret i32 %tmp10
+}
+
+to:
+
+foo: # @foo
+# %bb.0: # %entry
+ movl 4(%esp), %ecx
+ cmpb $0, 16(%esp)
+ je .LBB0_2
+# %bb.1: # %bb
+ movl 8(%esp), %eax
+ addl %ecx, %eax
+ ret
+.LBB0_2: # %bb7
+ movl 12(%esp), %edx
+ movl %ecx, %eax
+ subl %edx, %eax
+ ret
+
+There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
+couple more movls by putting 4(%esp) into %eax instead of %ecx.
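+
+For instance (a sketch):
+
+foo: # @foo
+# %bb.0: # %entry
+ movl 4(%esp), %eax
+ cmpb $0, 16(%esp)
+ je .LBB0_2
+# %bb.1: # %bb
+ addl 8(%esp), %eax
+ ret
+.LBB0_2: # %bb7
+ subl 12(%esp), %eax
+ ret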
+
+//===---------------------------------------------------------------------===//
+
+See rdar://4653682.
+
+From flops:
+
+LBB1_15: # bb310
+ cvtss2sd LCPI1_0, %xmm1
+ addsd %xmm1, %xmm0
+ movsd 176(%esp), %xmm2
+ mulsd %xmm0, %xmm2
+ movapd %xmm2, %xmm3
+ mulsd %xmm3, %xmm3
+ movapd %xmm3, %xmm4
+ mulsd LCPI1_23, %xmm4
+ addsd LCPI1_24, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_25, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_26, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_27, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_28, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd %xmm1, %xmm4
+ mulsd %xmm2, %xmm4
+ movsd 152(%esp), %xmm1
+ addsd %xmm4, %xmm1
+ movsd %xmm1, 152(%esp)
+ incl %eax
+ cmpl %eax, %esi
+ jge LBB1_15 # bb310
+LBB1_16: # bb358.loopexit
+ movsd 152(%esp), %xmm0
+ addsd %xmm0, %xmm0
+ addsd LCPI1_22, %xmm0
+ movsd %xmm0, 152(%esp)
+
+Rather than spilling the result of the last addsd in the loop, we should have
+inserted a copy to split the interval (one for the duration of the loop, one
+extending to the fall-through). The register pressure in the loop isn't high
+enough to warrant the spill.
+
+Also check why xmm7 is not used at all in the function.
+
+//===---------------------------------------------------------------------===//
+
+Take the following:
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
+target triple = "i386-apple-darwin8"
+@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
+define fastcc void @abort_gzip() noreturn nounwind {
+entry:
+ %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1]
+ br i1 %tmp.b.i, label %bb.i, label %bb4.i
+bb.i: ; preds = %entry
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+bb4.i: ; preds = %entry
+ store i1 true, i1* @in_exit.4870.b
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+}
+declare void @exit(i32) noreturn nounwind
+
+This compiles into:
+_abort_gzip: ## @abort_gzip
+## %bb.0: ## %entry
+ subl $12, %esp
+ movb _in_exit.4870.b, %al
+ cmpb $1, %al
+ jne LBB0_2
+
+We somehow miss folding the movb into the cmpb.
+
+//===---------------------------------------------------------------------===//
+
+We compile:
+
+int test(int x, int y) {
+ return x-y-1;
+}
+
+into (-m64):
+
+_test:
+ decl %edi
+ movl %edi, %eax
+ subl %esi, %eax
+ ret
+
+it would be better to codegen as: x+~y (notl+addl)
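+
+For example (a sketch; x in %edi, y in %esi per the SysV x86-64 ABI):
+
+_test:
+ movl %esi, %eax
+ notl %eax
+ addl %edi, %eax
+ ret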
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+int foo(const char *str,...)
+{
+ __builtin_va_list a; int x;
+ __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
+ return x;
+}
+
+gets compiled into this on x86-64:
+ subq $200, %rsp
+ movaps %xmm7, 160(%rsp)
+ movaps %xmm6, 144(%rsp)
+ movaps %xmm5, 128(%rsp)
+ movaps %xmm4, 112(%rsp)
+ movaps %xmm3, 96(%rsp)
+ movaps %xmm2, 80(%rsp)
+ movaps %xmm1, 64(%rsp)
+ movaps %xmm0, 48(%rsp)
+ movq %r9, 40(%rsp)
+ movq %r8, 32(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 8(%rsp)
+ leaq (%rsp), %rax
+ movq %rax, 192(%rsp)
+ leaq 208(%rsp), %rax
+ movq %rax, 184(%rsp)
+ movl $48, 180(%rsp)
+ movl $8, 176(%rsp)
+ movl 176(%rsp), %eax
+ cmpl $47, %eax
+ jbe .LBB1_3 # bb
+.LBB1_1: # bb3
+ movq 184(%rsp), %rcx
+ leaq 8(%rcx), %rax
+ movq %rax, 184(%rsp)
+.LBB1_2: # bb4
+ movl (%rcx), %eax
+ addq $200, %rsp
+ ret
+.LBB1_3: # bb
+ movl %eax, %ecx
+ addl $8, %eax
+ addq 192(%rsp), %rcx
+ movl %eax, 176(%rsp)
+ jmp .LBB1_2 # bb4
+
+gcc 4.3 generates:
+ subq $96, %rsp
+.LCFI0:
+ leaq 104(%rsp), %rax
+ movq %rsi, -80(%rsp)
+ movl $8, -120(%rsp)
+ movq %rax, -112(%rsp)
+ leaq -88(%rsp), %rax
+ movq %rax, -104(%rsp)
+ movl $8, %eax
+ cmpl $48, %eax
+ jb .L6
+ movq -112(%rsp), %rdx
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+ .p2align 4,,10
+ .p2align 3
+.L6:
+ mov %eax, %edx
+ addq -104(%rsp), %rdx
+ addl $8, %eax
+ movl %eax, -120(%rsp)
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+
+and it gets compiled into this on x86:
+ pushl %ebp
+ movl %esp, %ebp
+ subl $4, %esp
+ leal 12(%ebp), %eax
+ movl %eax, -4(%ebp)
+ leal 16(%ebp), %eax
+ movl %eax, -4(%ebp)
+ movl 12(%ebp), %eax
+ addl $4, %esp
+ popl %ebp
+ ret
+
+gcc 4.3 generates:
+ pushl %ebp
+ movl %esp, %ebp
+ movl 12(%ebp), %eax
+ popl %ebp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Teach tblgen not to check bitconvert source type in some cases. This allows us
+to consolidate the following patterns in X86InstrMMX.td:
+
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+There are other cases in various td files.
+
+//===---------------------------------------------------------------------===//
+
+Take something like the following on x86-32:
+unsigned a(unsigned long long x, unsigned y) {return x % y;}
+
+We currently generate a libcall, but we really shouldn't: the expansion is
+shorter and likely faster than the libcall. The expected code is something
+like the following:
+
+ movl 12(%ebp), %eax
+ movl 16(%ebp), %ecx
+ xorl %edx, %edx
+ divl %ecx
+ movl 8(%ebp), %eax
+ divl %ecx
+ movl %edx, %eax
+ ret
+
+A similar code sequence works for division.
+
+//===---------------------------------------------------------------------===//
+
+We currently compile this:
+
+define i32 @func1(i32 %v1, i32 %v2) nounwind {
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %sum = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %normal
+normal:
+ ret i32 %sum
+overflow:
+ call void @llvm.trap()
+ unreachable
+}
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare void @llvm.trap()
+
+to:
+
+_func1:
+ movl 4(%esp), %eax
+ addl 8(%esp), %eax
+ jo LBB1_2 ## overflow
+LBB1_1: ## normal
+ ret
+LBB1_2: ## overflow
+ ud2
+
+It would be nice to produce "into" someday.
+
+//===---------------------------------------------------------------------===//
+
+Test instructions can be eliminated by using EFLAGS values from arithmetic
+instructions. This is currently not done for mul, and, or, xor, neg, shl,
+sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
+for read-modify-write instructions. It is also currently not done if the
+OF or CF flags are needed.
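+
+A contrived sketch of the basic idea, using an and:
+
+ andl %esi, %edi # sets ZF according to the result
+ testl %edi, %edi # redundant; the ZF from the andl could be reused
+ je .LBB0_2
+
+could become:
+
+ andl %esi, %edi
+ je .LBB0_2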
+
+The shift operators have the complication that when the shift count is
+zero, EFLAGS is not set, so they can only subsume a test instruction if
+the shift count is known to be non-zero. Also, using the EFLAGS value
+from a shift is apparently very slow on some x86 implementations.
+
+In read-modify-write instructions, the root node in the isel match is
+the store, and isel has no way for the use of the EFLAGS result of the
+arithmetic to be remapped to the new node.
+
+Add and subtract instructions set OF on signed overflow and CF on unsigned
+overflow, while test instructions always clear OF and CF. In order to
+replace a test with an add or subtract in a situation where OF or CF is
+needed, codegen must be able to prove that the operation cannot see
+signed or unsigned overflow, respectively.
+
+//===---------------------------------------------------------------------===//
+
+memcpy/memmove do not lower to SSE copies when possible. A silly example is:
+define <16 x float> @foo(<16 x float> %A) nounwind {
+ %tmp = alloca <16 x float>, align 16
+ %tmp2 = alloca <16 x float>, align 16
+ store <16 x float> %A, <16 x float>* %tmp
+ %s = bitcast <16 x float>* %tmp to i8*
+ %s2 = bitcast <16 x float>* %tmp2 to i8*
+ call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
+ %R = load <16 x float>* %tmp2
+ ret <16 x float> %R
+}
+
+declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+which compiles to:
+
+_foo:
+ subl $140, %esp
+ movaps %xmm3, 112(%esp)
+ movaps %xmm2, 96(%esp)
+ movaps %xmm1, 80(%esp)
+ movaps %xmm0, 64(%esp)
+ movl 60(%esp), %eax
+ movl %eax, 124(%esp)
+ movl 56(%esp), %eax
+ movl %eax, 120(%esp)
+ movl 52(%esp), %eax
+ <many many more 32-bit copies>
+ movaps (%esp), %xmm0
+ movaps 16(%esp), %xmm1
+ movaps 32(%esp), %xmm2
+ movaps 48(%esp), %xmm3
+ addl $140, %esp
+ ret
+
+On Nehalem, it may even be cheaper to just use movups when unaligned than to
+fall back to lower-granularity chunks.
+
+//===---------------------------------------------------------------------===//
+
+Implement processor-specific optimizations for parity with GCC on these
+processors. GCC does two optimizations:
+
+1. ix86_pad_returns inserts a noop before ret instructions if immediately
+ preceded by a conditional branch or is the target of a jump.
+2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
+ code contains more than 3 branches.
+
+The first one is done for all AMDs, Core 2, and "Generic".
+The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
+  Core 2, and "Generic".
+
+//===---------------------------------------------------------------------===//
+Testcase:
+int x(int a) { return (a&0xf0)>>4; }
+
+Current output:
+ movl 4(%esp), %eax
+ shrl $4, %eax
+ andl $15, %eax
+ ret
+
+Ideal output:
+ movzbl 4(%esp), %eax
+ shrl $4, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
+properly.
+
+When the return value is not used (i.e., we only care about the value in
+memory), x86 does not have to use xadd to implement these. Instead, it can use
+add, sub, inc, and dec instructions with the "lock" prefix.
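+
+For example, when the result of __sync_add_and_fetch(p, 1) is unused, a single
+locked inc should be enough (a sketch; p in %rdi on x86-64):
+
+ # void f(int *p) { __sync_add_and_fetch(p, 1); } -- return value ignored
+ lock
+ incl (%rdi)
+ ret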
+
+This is currently implemented with a bit of an instruction selection trick. The
+issue is that the target-independent pattern produces one output and a chain,
+and we want to map it into one that just outputs a chain. The current trick is
+to select it into a MERGE_VALUES with the first definition being an
+implicit_def. The proper solution is to add new ISD opcodes for the no-output
+variant. DAG combiner can then transform the node before it gets to target node
+selection.
+
+Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
+fact these instructions are identical to the non-lock versions. We need a way to
+add target specific information to target nodes and have this information
+carried over to machine instructions. Asm printer (or JIT) can use this
+information to add the "lock" prefix.
+
+//===---------------------------------------------------------------------===//
+
+struct B {
+ unsigned char y0 : 1;
+};
+
+int bar(struct B* a) { return a->y0; }
+
+define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
+ %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
+ %2 = load i8* %1, align 1
+ %3 = and i8 %2, 1
+ %4 = zext i8 %3 to i32
+ ret i32 %4
+}
+
+bar: # @bar
+# %bb.0:
+ movb (%rdi), %al
+ andb $1, %al
+ movzbl %al, %eax
+ ret
+
+Missed optimization: should be movl+andl.
+
+//===---------------------------------------------------------------------===//
+
+The x86-64 ABI says:
+
+Booleans, when stored in a memory object, are stored as single byte objects the
+value of which is always 0 (false) or 1 (true).
+
+We are not using this fact:
+
+int bar(_Bool *a) { return *a; }
+
+define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
+ %1 = load i8* %a, align 1, !tbaa !0
+ %tmp = and i8 %1, 1
+ %2 = zext i8 %tmp to i32
+ ret i32 %2
+}
+
+bar:
+ movb (%rdi), %al
+ andb $1, %al
+ movzbl %al, %eax
+ ret
+
+GCC produces
+
+bar:
+ movzbl (%rdi), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Take the following C code:
+int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %tmp = xor i32 %b, %a ; <i32> [#uses=1]
+ %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1]
+ %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1]
+ %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
+ ret i32 %conv5
+}
+
+And the following x86 code:
+ xorl %esi, %edi
+ testb $-1, %dil
+ sete %al
+ movzbl %al, %eax
+ ret
+
+A cmpb instead of the xorl+testb would be one instruction shorter.
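+
+Something like this, for instance (a sketch; a in %edi, b in %esi):
+
+ cmpb %sil, %dil
+ sete %al
+ movzbl %al, %eax
+ ret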
+
+//===---------------------------------------------------------------------===//
+
+Given the following C code:
+int f(int a, int b) { return (signed char)a == (signed char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %sext = shl i32 %a, 24 ; <i32> [#uses=1]
+ %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1]
+ %sext6 = shl i32 %b, 24 ; <i32> [#uses=1]
+ %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1]
+ %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1]
+ %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
+ ret i32 %conv5
+}
+
+And the following x86 code:
+ movsbl %sil, %eax
+ movsbl %dil, %ecx
+ cmpl %eax, %ecx
+ sete %al
+ movzbl %al, %eax
+ ret
+
+
+It should be possible to eliminate the sign extensions.
+
+//===---------------------------------------------------------------------===//
+
+LLVM misses a load+store narrowing opportunity in this code:
+
+%struct.bf = type { i64, i16, i16, i32 }
+
+@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2]
+
+define void @t1() nounwind ssp {
+entry:
+ %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
+ %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
+ %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2]
+ %3 = load i32* %2, align 1 ; <i32> [#uses=1]
+ %4 = and i32 %3, -65537 ; <i32> [#uses=1]
+ store i32 %4, i32* %2, align 1
+ %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
+ %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
+ %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2]
+ %8 = load i32* %7, align 1 ; <i32> [#uses=1]
+ %9 = and i32 %8, -131073 ; <i32> [#uses=1]
+ store i32 %9, i32* %7, align 1
+ ret void
+}
+
+LLVM currently emits this:
+
+ movq bfi(%rip), %rax
+ andl $-65537, 8(%rax)
+ movq bfi(%rip), %rax
+ andl $-131073, 8(%rax)
+ ret
+
+It could narrow the loads and stores to emit this:
+
+ movq bfi(%rip), %rax
+ andb $-2, 10(%rax)
+ movq bfi(%rip), %rax
+ andb $-3, 10(%rax)
+ ret
+
+The trouble is that there is a TokenFactor between the store and the
+load, making it non-trivial to determine if there's anything between
+the load and the store which would prohibit narrowing.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+void foo(unsigned x) {
+ if (x == 0) bar();
+ else if (x == 1) qux();
+}
+
+currently compiles into:
+_foo:
+ movl 4(%esp), %eax
+ cmpl $1, %eax
+ je LBB0_3
+ testl %eax, %eax
+ jne LBB0_4
+
+the testl could be removed:
+_foo:
+ movl 4(%esp), %eax
+ cmpl $1, %eax
+ je LBB0_3
+ jb LBB0_4
+
+0 is the only unsigned number < 1.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+%0 = type { i32, i1 }
+
+define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
+entry:
+ %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
+ %cmp = extractvalue %0 %uadd, 1
+ %inc = zext i1 %cmp to i32
+ %add = add i32 %x, %sum
+ %z.0 = add i32 %add, %inc
+ ret i32 %z.0
+}
+
+declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+
+compiles to:
+
+_add32carry: ## @add32carry
+ addl %esi, %edi
+ sbbl %ecx, %ecx
+ movl %edi, %eax
+ subl %ecx, %eax
+ ret
+
+But it could be:
+
+_add32carry:
+ leal (%rsi,%rdi), %eax
+ cmpl %esi, %eax
+ adcl $0, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+The hot loop of 256.bzip2 contains code that looks a bit like this:
+
+int foo(char *P, char *Q, int x, int y) {
+ if (P[0] != Q[0])
+ return P[0] < Q[0];
+ if (P[1] != Q[1])
+ return P[1] < Q[1];
+ if (P[2] != Q[2])
+ return P[2] < Q[2];
+ return P[3] < Q[3];
+}
+
+In the real code, we get a lot more wrong than this. However, even in this
+code we generate:
+
+_foo: ## @foo
+## %bb.0: ## %entry
+ movb (%rsi), %al
+ movb (%rdi), %cl
+ cmpb %al, %cl
+ je LBB0_2
+LBB0_1: ## %if.then
+ cmpb %al, %cl
+ jmp LBB0_5
+LBB0_2: ## %if.end
+ movb 1(%rsi), %al
+ movb 1(%rdi), %cl
+ cmpb %al, %cl
+ jne LBB0_1
+## %bb.3: ## %if.end38
+ movb 2(%rsi), %al
+ movb 2(%rdi), %cl
+ cmpb %al, %cl
+ jne LBB0_1
+## %bb.4: ## %if.end60
+ movb 3(%rdi), %al
+ cmpb 3(%rsi), %al
+LBB0_5: ## %if.end60
+ setl %al
+ movzbl %al, %eax
+ ret
+
+Note that we generate jumps to LBB0_1 which does a redundant compare. The
+redundant compare also forces the register values to be live, which prevents
+folding one of the loads into the compare. In contrast, GCC 4.2 produces:
+
+_foo:
+ movzbl (%rsi), %eax
+ cmpb %al, (%rdi)
+ jne L10
+L12:
+ movzbl 1(%rsi), %eax
+ cmpb %al, 1(%rdi)
+ jne L10
+ movzbl 2(%rsi), %eax
+ cmpb %al, 2(%rdi)
+ jne L10
+ movzbl 3(%rdi), %eax
+ cmpb 3(%rsi), %al
+L10:
+ setl %al
+ movzbl %al, %eax
+ ret
+
+which is "perfect".
+
+//===---------------------------------------------------------------------===//
+
+For the branch in the following code:
+int a();
+int b(int x, int y) {
+ if (x & (1<<(y&7)))
+ return a();
+ return y;
+}
+
+We currently generate:
+ movb %sil, %al
+ andb $7, %al
+ movzbl %al, %eax
+ btl %eax, %edi
+ jae .LBB0_2
+
+movl+andl would be shorter than the movb+andb+movzbl sequence.
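+
+That is, something like (a sketch):
+
+ movl %esi, %eax
+ andl $7, %eax
+ btl %eax, %edi
+ jae .LBB0_2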
+
+//===---------------------------------------------------------------------===//
+
+For the following:
+struct u1 {
+ float x, y;
+};
+float foo(struct u1 u) {
+ return u.x + u.y;
+}
+
+We currently generate:
+ movdqa %xmm0, %xmm1
+ pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0]
+ addss %xmm1, %xmm0
+ ret
+
+We could save an instruction here by commuting the addss.
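+
+For example (a sketch):
+
+ pshufd $1, %xmm0, %xmm1 # xmm1 = u.y in the low element
+ addss %xmm1, %xmm0 # xmm0 = u.x + u.y; no copy of %xmm0 needed
+ ret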
+
+//===---------------------------------------------------------------------===//
+
+This (from PR9661):
+
+float clamp_float(float a) {
+ if (a > 1.0f)
+ return 1.0f;
+ else if (a < 0.0f)
+ return 0.0f;
+ else
+ return a;
+}
+
+Could compile to:
+
+clamp_float: # @clamp_float
+ movss .LCPI0_0(%rip), %xmm1
+ minss %xmm1, %xmm0
+ pxor %xmm1, %xmm1
+ maxss %xmm1, %xmm0
+ ret
+
+with -ffast-math.
+
+//===---------------------------------------------------------------------===//
+
+This function (from PR9803):
+
+int clamp2(int a) {
+ if (a > 5)
+ a = 5;
+ if (a < 0)
+ return 0;
+ return a;
+}
+
+Compiles to:
+
+_clamp2: ## @clamp2
+ pushq %rbp
+ movq %rsp, %rbp
+ cmpl $5, %edi
+ movl $5, %ecx
+ cmovlel %edi, %ecx
+ testl %ecx, %ecx
+ movl $0, %eax
+ cmovnsl %ecx, %eax
+ popq %rbp
+ ret
+
+The move of 0 could be scheduled above the test so that it becomes xor reg,reg.
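+
+For instance (a sketch; frame setup omitted):
+
+ xorl %eax, %eax # materialize the 0 before the flag-setting instructions
+ cmpl $5, %edi
+ movl $5, %ecx
+ cmovlel %edi, %ecx
+ testl %ecx, %ecx
+ cmovnsl %ecx, %eax
+ ret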
+
+//===---------------------------------------------------------------------===//
+
+GCC PR48986. We currently compile this:
+
+void bar(void);
+void yyy(int* p) {
+ if (__sync_fetch_and_add(p, -1) == 1)
+ bar();
+}
+
+into:
+ movl $-1, %eax
+ lock
+ xaddl %eax, (%rdi)
+ cmpl $1, %eax
+ je LBB0_2
+
+Instead we could generate:
+
+ lock
+ dec %rdi
+ je LBB0_2
+
+The trick is to match "fetch_and_add(X, -C) == C".
+
+//===---------------------------------------------------------------------===//
+
+unsigned t(unsigned a, unsigned b) {
+ return a <= b ? 5 : -5;
+}
+
+We generate:
+ movl $5, %ecx
+ cmpl %esi, %edi
+ movl $-5, %eax
+ cmovbel %ecx, %eax
+
+GCC instead uses an sbb mask (the sbb produces 0 or -1 from the borrow, which
+the andl/addl then map to 5 or -5):
+ cmpl %edi, %esi
+ sbbl %eax, %eax
+ andl $-10, %eax
+ addl $5, %eax
+
+//===---------------------------------------------------------------------===//
diff --git a/contrib/libs/llvm12/lib/Target/X86/TargetInfo/.yandex_meta/licenses.list.txt b/contrib/libs/llvm12/lib/Target/X86/TargetInfo/.yandex_meta/licenses.list.txt
index c62d353021..a4433625d4 100644
--- a/contrib/libs/llvm12/lib/Target/X86/TargetInfo/.yandex_meta/licenses.list.txt
+++ b/contrib/libs/llvm12/lib/Target/X86/TargetInfo/.yandex_meta/licenses.list.txt
@@ -1,7 +1,7 @@
-====================Apache-2.0 WITH LLVM-exception====================
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-
-
-====================Apache-2.0 WITH LLVM-exception====================
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+====================Apache-2.0 WITH LLVM-exception====================
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+
+
+====================Apache-2.0 WITH LLVM-exception====================
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make b/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make
index 2f30db941e..9048b1b373 100644
--- a/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make
+++ b/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make
@@ -2,15 +2,15 @@
LIBRARY()
-OWNER(
- orivej
- g:cpp-contrib
-)
-
-LICENSE(Apache-2.0 WITH LLVM-exception)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-
+OWNER(
+ orivej
+ g:cpp-contrib
+)
+
+LICENSE(Apache-2.0 WITH LLVM-exception)
+
+LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
+
PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/lib/Support
diff --git a/contrib/libs/llvm12/lib/Target/X86/ya.make b/contrib/libs/llvm12/lib/Target/X86/ya.make
index 1df03a55e7..eda842b955 100644
--- a/contrib/libs/llvm12/lib/Target/X86/ya.make
+++ b/contrib/libs/llvm12/lib/Target/X86/ya.make
@@ -2,18 +2,18 @@
LIBRARY()
-OWNER(
- orivej
- g:cpp-contrib
-)
-
-LICENSE(
- Apache-2.0 WITH LLVM-exception AND
- NCSA
-)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
+OWNER(
+ orivej
+ g:cpp-contrib
+)
+LICENSE(
+ Apache-2.0 WITH LLVM-exception AND
+ NCSA
+)
+
+LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
+
PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include