author    | heretic <heretic@yandex-team.ru>             | 2022-02-10 16:45:43 +0300
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:43 +0300
commit    | 397cbe258b9e064f49c4ca575279f02f39fef76e (patch)
tree      | a0b0eb3cca6a14e4e8ea715393637672fa651284 /contrib/libs/llvm12/lib/Target/X86
parent    | 43f5a35593ebc9f6bcea619bb170394ea7ae468e (diff)
download  | ydb-397cbe258b9e064f49c4ca575279f02f39fef76e.tar.gz
Restoring authorship annotation for <heretic@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Target/X86')
14 files changed, 3275 insertions, 3275 deletions
diff --git a/contrib/libs/llvm12/lib/Target/X86/.yandex_meta/licenses.list.txt b/contrib/libs/llvm12/lib/Target/X86/.yandex_meta/licenses.list.txt index f08f43f1d8..92fbe1c084 100644 --- a/contrib/libs/llvm12/lib/Target/X86/.yandex_meta/licenses.list.txt +++ b/contrib/libs/llvm12/lib/Target/X86/.yandex_meta/licenses.list.txt @@ -1,309 +1,309 @@ -====================Apache-2.0 WITH LLVM-exception==================== -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. - - -====================Apache-2.0 WITH LLVM-exception==================== -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - - -====================COPYRIGHT==================== - Trampoline->setComdat(C); - BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline); - IRBuilder<> Builder(EntryBB); - - -====================File: LICENSE.TXT==================== -============================================================================== -The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: -============================================================================== - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - ----- LLVM Exceptions to the Apache 2.0 License ---- - -As an exception, if, as a result of your compiling your source code, portions -of this Software are embedded into an Object form of such source code, you -may redistribute such embedded portions in such Object form without complying -with the conditions of Sections 4(a), 4(b) and 4(d) of the License. - -In addition, if you combine or link compiled forms of this Software with -software that is licensed under the GPLv2 ("Combined Software") and if a -court of competent jurisdiction determines that the patent provision (Section -3), the indemnity provision (Section 9) or other Section of the License -conflicts with the conditions of the GPLv2, you may retroactively and -prospectively choose to deem waived or otherwise exclude such Section(s) of -the License, but only in their entirety and only with respect to the Combined -Software. 
- -============================================================================== -Software from third parties included in the LLVM Project: -============================================================================== -The LLVM Project contains third party software which is under different license -terms. All such code will be identified clearly using at least one of two -mechanisms: -1) It will be in a separate directory tree with its own `LICENSE.txt` or - `LICENSE` file at the top containing the specific license and restrictions - which apply to that software, or -2) It will contain specific license and restriction terms at the top of every - file. - -============================================================================== -Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy): -============================================================================== -University of Illinois/NCSA -Open Source License - -Copyright (c) 2003-2019 University of Illinois at Urbana-Champaign. -All rights reserved. - -Developed by: - - LLVM Team - - University of Illinois at Urbana-Champaign - - http://llvm.org - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal with -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimers. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimers in the - documentation and/or other materials provided with the distribution. - - * Neither the names of the LLVM Team, University of Illinois at - Urbana-Champaign, nor the names of its contributors may be used to - endorse or promote products derived from this Software without specific - prior written permission. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE -SOFTWARE. - - - -====================File: include/llvm/Support/LICENSE.TXT==================== -LLVM System Interface Library -------------------------------------------------------------------------------- -The LLVM System Interface Library is licensed under the Illinois Open Source -License and has the following additional copyright: - -Copyright (C) 2004 eXtensible Systems, Inc. - - -====================NCSA==================== -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +====================Apache-2.0 WITH LLVM-exception==================== +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+ + +====================Apache-2.0 WITH LLVM-exception==================== +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + + +====================COPYRIGHT==================== + Trampoline->setComdat(C); + BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline); + IRBuilder<> Builder(EntryBB); + + +====================File: LICENSE.TXT==================== +============================================================================== +The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: +============================================================================== + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +---- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. + +============================================================================== +Software from third parties included in the LLVM Project: +============================================================================== +The LLVM Project contains third party software which is under different license +terms. All such code will be identified clearly using at least one of two +mechanisms: +1) It will be in a separate directory tree with its own `LICENSE.txt` or + `LICENSE` file at the top containing the specific license and restrictions + which apply to that software, or +2) It will contain specific license and restriction terms at the top of every + file. + +============================================================================== +Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy): +============================================================================== +University of Illinois/NCSA +Open Source License + +Copyright (c) 2003-2019 University of Illinois at Urbana-Champaign. +All rights reserved. 
+ +Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. + + + +====================File: include/llvm/Support/LICENSE.TXT==================== +LLVM System Interface Library +------------------------------------------------------------------------------- +The LLVM System Interface Library is licensed under the Illinois Open Source +License and has the following additional copyright: + +Copyright (C) 2004 eXtensible Systems, Inc. + + +====================NCSA==================== +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. diff --git a/contrib/libs/llvm12/lib/Target/X86/AsmParser/.yandex_meta/licenses.list.txt b/contrib/libs/llvm12/lib/Target/X86/AsmParser/.yandex_meta/licenses.list.txt index c62d353021..a4433625d4 100644 --- a/contrib/libs/llvm12/lib/Target/X86/AsmParser/.yandex_meta/licenses.list.txt +++ b/contrib/libs/llvm12/lib/Target/X86/AsmParser/.yandex_meta/licenses.list.txt @@ -1,7 +1,7 @@ -====================Apache-2.0 WITH LLVM-exception==================== -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. - - -====================Apache-2.0 WITH LLVM-exception==================== -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +====================Apache-2.0 WITH LLVM-exception==================== +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+ + +====================Apache-2.0 WITH LLVM-exception==================== +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception diff --git a/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make b/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make index f88283f4e5..c30cd4cf65 100644 --- a/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make +++ b/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make @@ -2,15 +2,15 @@ LIBRARY() -OWNER( - orivej - g:cpp-contrib -) - -LICENSE(Apache-2.0 WITH LLVM-exception) - -LICENSE_TEXTS(.yandex_meta/licenses.list.txt) - +OWNER( + orivej + g:cpp-contrib +) + +LICENSE(Apache-2.0 WITH LLVM-exception) + +LICENSE_TEXTS(.yandex_meta/licenses.list.txt) + PEERDIR( contrib/libs/llvm12 contrib/libs/llvm12/include diff --git a/contrib/libs/llvm12/lib/Target/X86/Disassembler/.yandex_meta/licenses.list.txt b/contrib/libs/llvm12/lib/Target/X86/Disassembler/.yandex_meta/licenses.list.txt index c62d353021..a4433625d4 100644 --- a/contrib/libs/llvm12/lib/Target/X86/Disassembler/.yandex_meta/licenses.list.txt +++ b/contrib/libs/llvm12/lib/Target/X86/Disassembler/.yandex_meta/licenses.list.txt @@ -1,7 +1,7 @@ -====================Apache-2.0 WITH LLVM-exception==================== -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. - - -====================Apache-2.0 WITH LLVM-exception==================== -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +====================Apache-2.0 WITH LLVM-exception==================== +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. + + +====================Apache-2.0 WITH LLVM-exception==================== +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception diff --git a/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make b/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make index b55833692f..d1c75366c1 100644 --- a/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make +++ b/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make @@ -2,15 +2,15 @@ LIBRARY() -OWNER( - orivej - g:cpp-contrib -) - -LICENSE(Apache-2.0 WITH LLVM-exception) - -LICENSE_TEXTS(.yandex_meta/licenses.list.txt) - +OWNER( + orivej + g:cpp-contrib +) + +LICENSE(Apache-2.0 WITH LLVM-exception) + +LICENSE_TEXTS(.yandex_meta/licenses.list.txt) + PEERDIR( contrib/libs/llvm12 contrib/libs/llvm12/include diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/.yandex_meta/licenses.list.txt b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/.yandex_meta/licenses.list.txt index c62d353021..a4433625d4 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/.yandex_meta/licenses.list.txt +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/.yandex_meta/licenses.list.txt @@ -1,7 +1,7 @@ -====================Apache-2.0 WITH LLVM-exception==================== -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. - - -====================Apache-2.0 WITH LLVM-exception==================== -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +====================Apache-2.0 WITH LLVM-exception==================== +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+ + +====================Apache-2.0 WITH LLVM-exception==================== +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make index 8da0d02f5b..565dda72f5 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make @@ -2,15 +2,15 @@ LIBRARY() -OWNER( - orivej - g:cpp-contrib -) - -LICENSE(Apache-2.0 WITH LLVM-exception) - -LICENSE_TEXTS(.yandex_meta/licenses.list.txt) - +OWNER( + orivej + g:cpp-contrib +) + +LICENSE(Apache-2.0 WITH LLVM-exception) + +LICENSE_TEXTS(.yandex_meta/licenses.list.txt) + PEERDIR( contrib/libs/llvm12 contrib/libs/llvm12/include diff --git a/contrib/libs/llvm12/lib/Target/X86/README-FPStack.txt b/contrib/libs/llvm12/lib/Target/X86/README-FPStack.txt index 39efd2dbcf..aab9759b35 100644 --- a/contrib/libs/llvm12/lib/Target/X86/README-FPStack.txt +++ b/contrib/libs/llvm12/lib/Target/X86/README-FPStack.txt @@ -1,85 +1,85 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend: FP stack related stuff -//===---------------------------------------------------------------------===// - -//===---------------------------------------------------------------------===// - -Some targets (e.g. athlons) prefer freep to fstp ST(0): -http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html - -//===---------------------------------------------------------------------===// - -This should use fiadd on chips where it is profitable: -double foo(double P, int *I) { return P+*I; } - -We have fiadd patterns now but the followings have the same cost and -complexity. We need a way to specify the later is more profitable. - -def FpADD32m : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP:$dst, (fadd RFP:$src1, - (extloadf64f32 addr:$src2)))]>; - // ST(0) = ST(0) + [mem32] - -def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP:$dst, (fadd RFP:$src1, - (X86fild addr:$src2, i32)))]>; - // ST(0) = ST(0) + [mem32int] - -//===---------------------------------------------------------------------===// - -The FP stackifier should handle simple permutates to reduce number of shuffle -instructions, e.g. turning: - -fld P -> fld Q -fld Q fld P -fxch - -or: - -fxch -> fucomi -fucomi jl X -jg X - -Ideas: -http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html - - -//===---------------------------------------------------------------------===// - -Add a target specific hook to DAG combiner to handle SINT_TO_FP and -FP_TO_SINT when the source operand is already in memory. - -//===---------------------------------------------------------------------===// - -Open code rint,floor,ceil,trunc: -http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html -http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html - -Opencode the sincos[f] libcall. - -//===---------------------------------------------------------------------===// - -None of the FPStack instructions are handled in -X86RegisterInfo::foldMemoryOperand, which prevents the spiller from -folding spill code into the instructions. 
- -//===---------------------------------------------------------------------===// - -Currently the x86 codegen isn't very good at mixing SSE and FPStack -code: - -unsigned int foo(double x) { return x; } - -foo: - subl $20, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, 8(%esp) - fldl 8(%esp) - fisttpll (%esp) - movl (%esp), %eax - addl $20, %esp - ret - -This just requires being smarter when custom expanding fptoui. - -//===---------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// +// Random ideas for the X86 backend: FP stack related stuff +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// + +Some targets (e.g. athlons) prefer freep to fstp ST(0): +http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html + +//===---------------------------------------------------------------------===// + +This should use fiadd on chips where it is profitable: +double foo(double P, int *I) { return P+*I; } + +We have fiadd patterns now but the followings have the same cost and +complexity. We need a way to specify the later is more profitable. + +def FpADD32m : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW, + [(set RFP:$dst, (fadd RFP:$src1, + (extloadf64f32 addr:$src2)))]>; + // ST(0) = ST(0) + [mem32] + +def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW, + [(set RFP:$dst, (fadd RFP:$src1, + (X86fild addr:$src2, i32)))]>; + // ST(0) = ST(0) + [mem32int] + +//===---------------------------------------------------------------------===// + +The FP stackifier should handle simple permutates to reduce number of shuffle +instructions, e.g. turning: + +fld P -> fld Q +fld Q fld P +fxch + +or: + +fxch -> fucomi +fucomi jl X +jg X + +Ideas: +http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html + + +//===---------------------------------------------------------------------===// + +Add a target specific hook to DAG combiner to handle SINT_TO_FP and +FP_TO_SINT when the source operand is already in memory. + +//===---------------------------------------------------------------------===// + +Open code rint,floor,ceil,trunc: +http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html +http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html + +Opencode the sincos[f] libcall. + +//===---------------------------------------------------------------------===// + +None of the FPStack instructions are handled in +X86RegisterInfo::foldMemoryOperand, which prevents the spiller from +folding spill code into the instructions. + +//===---------------------------------------------------------------------===// + +Currently the x86 codegen isn't very good at mixing SSE and FPStack +code: + +unsigned int foo(double x) { return x; } + +foo: + subl $20, %esp + movsd 24(%esp), %xmm0 + movsd %xmm0, 8(%esp) + fldl 8(%esp) + fisttpll (%esp) + movl (%esp), %eax + addl $20, %esp + ret + +This just requires being smarter when custom expanding fptoui. 
+ +//===---------------------------------------------------------------------===// diff --git a/contrib/libs/llvm12/lib/Target/X86/README-SSE.txt b/contrib/libs/llvm12/lib/Target/X86/README-SSE.txt index d52840e5c4..40f526b478 100644 --- a/contrib/libs/llvm12/lib/Target/X86/README-SSE.txt +++ b/contrib/libs/llvm12/lib/Target/X86/README-SSE.txt @@ -1,829 +1,829 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend: SSE-specific stuff. -//===---------------------------------------------------------------------===// - -//===---------------------------------------------------------------------===// - -SSE Variable shift can be custom lowered to something like this, which uses a -small table + unaligned load + shuffle instead of going through memory. - -__m128i_shift_right: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - -... -__m128i shift_right(__m128i value, unsigned long offset) { - return _mm_shuffle_epi8(value, - _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset))); -} - -//===---------------------------------------------------------------------===// - -SSE has instructions for doing operations on complex numbers, we should pattern -match them. For example, this should turn into a horizontal add: - -typedef float __attribute__((vector_size(16))) v4f32; -float f32(v4f32 A) { - return A[0]+A[1]+A[2]+A[3]; -} - -Instead we get this: - -_f32: ## @f32 - pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0] - addss %xmm0, %xmm1 - pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0] - movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1] - movaps %xmm0, %xmm3 - addss %xmm1, %xmm3 - movdqa %xmm2, %xmm0 - addss %xmm3, %xmm0 - ret - -Also, there are cases where some simple local SLP would improve codegen a bit. -compiling this: - -_Complex float f32(_Complex float A, _Complex float B) { - return A+B; -} - -into: - -_f32: ## @f32 - movdqa %xmm0, %xmm2 - addss %xmm1, %xmm2 - pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0] - pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0] - addss %xmm1, %xmm3 - movaps %xmm2, %xmm0 - unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] - ret - -seems silly when it could just be one addps. - - -//===---------------------------------------------------------------------===// - -Expand libm rounding functions inline: Significant speedups possible. -http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html - -//===---------------------------------------------------------------------===// - -When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and -other fast SSE modes. - -//===---------------------------------------------------------------------===// - -Think about doing i64 math in SSE regs on x86-32. - -//===---------------------------------------------------------------------===// - -This testcase should have no SSE instructions in it, and only one load from -a constant pool: - -double %test3(bool %B) { - %C = select bool %B, double 123.412, double 523.01123123 - ret double %C -} - -Currently, the select is being lowered, which prevents the dag combiner from -turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)' - -The pattern isel got this one right. - -//===---------------------------------------------------------------------===// - -Lower memcpy / memset to a series of SSE 128 bit move instructions when it's -feasible. 
- -//===---------------------------------------------------------------------===// - -Codegen: - if (copysign(1.0, x) == copysign(1.0, y)) -into: - if (x^y & mask) -when using SSE. - -//===---------------------------------------------------------------------===// - -Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half -of a v4sf value. - -//===---------------------------------------------------------------------===// - -Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}. -Perhaps use pxor / xorp* to clear a XMM register first? - -//===---------------------------------------------------------------------===// - -External test Nurbs exposed some problems. Look for -__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc -emits: - - movaps (%edx), %xmm2 #59.21 - movaps (%edx), %xmm5 #60.21 - movaps (%edx), %xmm4 #61.21 - movaps (%edx), %xmm3 #62.21 - movl 40(%ecx), %ebp #69.49 - shufps $0, %xmm2, %xmm5 #60.21 - movl 100(%esp), %ebx #69.20 - movl (%ebx), %edi #69.20 - imull %ebp, %edi #69.49 - addl (%eax), %edi #70.33 - shufps $85, %xmm2, %xmm4 #61.21 - shufps $170, %xmm2, %xmm3 #62.21 - shufps $255, %xmm2, %xmm2 #63.21 - lea (%ebp,%ebp,2), %ebx #69.49 - negl %ebx #69.49 - lea -3(%edi,%ebx), %ebx #70.33 - shll $4, %ebx #68.37 - addl 32(%ecx), %ebx #68.37 - testb $15, %bl #91.13 - jne L_B1.24 # Prob 5% #91.13 - -This is the llvm code after instruction scheduling: - -cond_next140 (0xa910740, LLVM BB @0xa90beb0): - %reg1078 = MOV32ri -3 - %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0 - %reg1037 = MOV32rm %reg1024, 1, %noreg, 40 - %reg1080 = IMUL32rr %reg1079, %reg1037 - %reg1081 = MOV32rm %reg1058, 1, %noreg, 0 - %reg1038 = LEA32r %reg1081, 1, %reg1080, -3 - %reg1036 = MOV32rm %reg1024, 1, %noreg, 32 - %reg1082 = SHL32ri %reg1038, 4 - %reg1039 = ADD32rr %reg1036, %reg1082 - %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0 - %reg1034 = SHUFPSrr %reg1083, %reg1083, 170 - %reg1032 = SHUFPSrr %reg1083, %reg1083, 0 - %reg1035 = SHUFPSrr %reg1083, %reg1083, 255 - %reg1033 = SHUFPSrr %reg1083, %reg1083, 85 - %reg1040 = MOV32rr %reg1039 - %reg1084 = AND32ri8 %reg1039, 15 - CMP32ri8 %reg1084, 0 - JE mbb<cond_next204,0xa914d30> - -Still ok. After register allocation: - -cond_next140 (0xa910740, LLVM BB @0xa90beb0): - %eax = MOV32ri -3 - %edx = MOV32rm %stack.3, 1, %noreg, 0 - ADD32rm %eax<def&use>, %edx, 1, %noreg, 0 - %edx = MOV32rm %stack.7, 1, %noreg, 0 - %edx = MOV32rm %edx, 1, %noreg, 40 - IMUL32rr %eax<def&use>, %edx - %esi = MOV32rm %stack.5, 1, %noreg, 0 - %esi = MOV32rm %esi, 1, %noreg, 0 - MOV32mr %stack.4, 1, %noreg, 0, %esi - %eax = LEA32r %esi, 1, %eax, -3 - %esi = MOV32rm %stack.7, 1, %noreg, 0 - %esi = MOV32rm %esi, 1, %noreg, 32 - %edi = MOV32rr %eax - SHL32ri %edi<def&use>, 4 - ADD32rr %edi<def&use>, %esi - %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0 - %xmm1 = MOVAPSrr %xmm0 - SHUFPSrr %xmm1<def&use>, %xmm1, 170 - %xmm2 = MOVAPSrr %xmm0 - SHUFPSrr %xmm2<def&use>, %xmm2, 0 - %xmm3 = MOVAPSrr %xmm0 - SHUFPSrr %xmm3<def&use>, %xmm3, 255 - SHUFPSrr %xmm0<def&use>, %xmm0, 85 - %ebx = MOV32rr %edi - AND32ri8 %ebx<def&use>, 15 - CMP32ri8 %ebx, 0 - JE mbb<cond_next204,0xa914d30> - -This looks really bad. The problem is shufps is a destructive opcode. Since it -appears as operand two in more than one shufps ops. It resulted in a number of -copies. Note icc also suffers from the same problem. Either the instruction -selector should select pshufd or The register allocator can made the two-address -to three-address transformation. 
- -It also exposes some other problems. See MOV32ri -3 and the spills. - -//===---------------------------------------------------------------------===// - -Consider: - -__m128 test(float a) { - return _mm_set_ps(0.0, 0.0, 0.0, a*a); -} - -This compiles into: - -movss 4(%esp), %xmm1 -mulss %xmm1, %xmm1 -xorps %xmm0, %xmm0 -movss %xmm1, %xmm0 -ret - -Because mulss doesn't modify the top 3 elements, the top elements of -xmm1 are already zero'd. We could compile this to: - -movss 4(%esp), %xmm0 -mulss %xmm0, %xmm0 -ret - -//===---------------------------------------------------------------------===// - -Here's a sick and twisted idea. Consider code like this: - -__m128 test(__m128 a) { - float b = *(float*)&A; - ... - return _mm_set_ps(0.0, 0.0, 0.0, b); -} - -This might compile to this code: - -movaps c(%esp), %xmm1 -xorps %xmm0, %xmm0 -movss %xmm1, %xmm0 -ret - -Now consider if the ... code caused xmm1 to get spilled. This might produce -this code: - -movaps c(%esp), %xmm1 -movaps %xmm1, c2(%esp) -... - -xorps %xmm0, %xmm0 -movaps c2(%esp), %xmm1 -movss %xmm1, %xmm0 -ret - -However, since the reload is only used by these instructions, we could -"fold" it into the uses, producing something like this: - -movaps c(%esp), %xmm1 -movaps %xmm1, c2(%esp) -... - -movss c2(%esp), %xmm0 -ret - -... saving two instructions. - -The basic idea is that a reload from a spill slot, can, if only one 4-byte -chunk is used, bring in 3 zeros the one element instead of 4 elements. -This can be used to simplify a variety of shuffle operations, where the -elements are fixed zeros. - -//===---------------------------------------------------------------------===// - -This code generates ugly code, probably due to costs being off or something: - -define void @test(float* %P, <4 x float>* %P2 ) { - %xFloat0.688 = load float* %P - %tmp = load <4 x float>* %P2 - %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3 - store <4 x float> %inFloat3.713, <4 x float>* %P2 - ret void -} - -Generates: - -_test: - movl 8(%esp), %eax - movaps (%eax), %xmm0 - pxor %xmm1, %xmm1 - movaps %xmm0, %xmm2 - shufps $50, %xmm1, %xmm2 - shufps $132, %xmm2, %xmm0 - movaps %xmm0, (%eax) - ret - -Would it be better to generate: - -_test: - movl 8(%esp), %ecx - movaps (%ecx), %xmm0 - xor %eax, %eax - pinsrw $6, %eax, %xmm0 - pinsrw $7, %eax, %xmm0 - movaps %xmm0, (%ecx) - ret - -? - -//===---------------------------------------------------------------------===// - -Some useful information in the Apple Altivec / SSE Migration Guide: - -http://developer.apple.com/documentation/Performance/Conceptual/ -Accelerate_sse_migration/index.html - -e.g. SSE select using and, andnot, or. Various SSE compare translations. - -//===---------------------------------------------------------------------===// - -Add hooks to commute some CMPP operations. - -//===---------------------------------------------------------------------===// - -Apply the same transformation that merged four float into a single 128-bit load -to loads from constant pool. - -//===---------------------------------------------------------------------===// - -Floating point max / min are commutable when -enable-unsafe-fp-path is -specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other -nodes which are selected to max / min instructions that are marked commutable. 
- -//===---------------------------------------------------------------------===// - -We should materialize vector constants like "all ones" and "signbit" with -code like: - - cmpeqps xmm1, xmm1 ; xmm1 = all-ones - -and: - cmpeqps xmm1, xmm1 ; xmm1 = all-ones - psrlq xmm1, 31 ; xmm1 = all 100000000000... - -instead of using a load from the constant pool. The later is important for -ABS/NEG/copysign etc. - -//===---------------------------------------------------------------------===// - -These functions: - -#include <xmmintrin.h> -__m128i a; -void x(unsigned short n) { - a = _mm_slli_epi32 (a, n); -} -void y(unsigned n) { - a = _mm_slli_epi32 (a, n); -} - -compile to ( -O3 -static -fomit-frame-pointer): -_x: - movzwl 4(%esp), %eax - movd %eax, %xmm0 - movaps _a, %xmm1 - pslld %xmm0, %xmm1 - movaps %xmm1, _a - ret -_y: - movd 4(%esp), %xmm0 - movaps _a, %xmm1 - pslld %xmm0, %xmm1 - movaps %xmm1, _a - ret - -"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems -like movd would be sufficient in both cases as the value is already zero -extended in the 32-bit stack slot IIRC. For signed short, it should also be -save, as a really-signed value would be undefined for pslld. - - -//===---------------------------------------------------------------------===// - -#include <math.h> -int t1(double d) { return signbit(d); } - -This currently compiles to: - subl $12, %esp - movsd 16(%esp), %xmm0 - movsd %xmm0, (%esp) - movl 4(%esp), %eax - shrl $31, %eax - addl $12, %esp - ret - -We should use movmskp{s|d} instead. - -//===---------------------------------------------------------------------===// - -CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single -(aligned) vector load. This functionality has a couple of problems. - -1. The code to infer alignment from loads of globals is in the X86 backend, - not the dag combiner. This is because dagcombine2 needs to be able to see - through the X86ISD::Wrapper node, which DAGCombine can't really do. -2. The code for turning 4 x load into a single vector load is target - independent and should be moved to the dag combiner. -3. The code for turning 4 x load into a vector load can only handle a direct - load from a global or a direct load from the stack. It should be generalized - to handle any load from P, P+4, P+8, P+12, where P can be anything. -4. The alignment inference code cannot handle loads from globals in non-static - mode because it doesn't look through the extra dyld stub load. If you try - vec_align.ll without -relocation-model=static, you'll see what I mean. - -//===---------------------------------------------------------------------===// - -We should lower store(fneg(load p), q) into an integer load+xor+store, which -eliminates a constant pool load. For example, consider: - -define i64 @ccosf(float %z.0, float %z.1) nounwind readonly { -entry: - %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1] - %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly - ret i64 %tmp20 -} -declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly - -This currently compiles to: - -LCPI1_0: # <4 x float> - .long 2147483648 # float -0 - .long 2147483648 # float -0 - .long 2147483648 # float -0 - .long 2147483648 # float -0 -_ccosf: - subl $12, %esp - movss 16(%esp), %xmm0 - movss %xmm0, 4(%esp) - movss 20(%esp), %xmm0 - xorps LCPI1_0, %xmm0 - movss %xmm0, (%esp) - call L_ccoshf$stub - addl $12, %esp - ret - -Note the load into xmm0, then xor (to negate), then store. 
In PIC mode, -this code computes the pic base and does two loads to do the constant pool -load, so the improvement is much bigger. - -The tricky part about this xform is that the argument load/store isn't exposed -until post-legalize, and at that point, the fneg has been custom expanded into -an X86 fxor. This means that we need to handle this case in the x86 backend -instead of in target independent code. - -//===---------------------------------------------------------------------===// - -Non-SSE4 insert into 16 x i8 is atrociously bad. - -//===---------------------------------------------------------------------===// - -<2 x i64> extract is substantially worse than <2 x f64>, even if the destination -is memory. - -//===---------------------------------------------------------------------===// - -INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert -any number of 0.0 simultaneously. Currently we only use it for simple -insertions. - -See comments in LowerINSERT_VECTOR_ELT_SSE4. - -//===---------------------------------------------------------------------===// - -On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not -Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are -legal, it'll just take a few extra patterns written in the .td file. - -Note: this is not a code quality issue; the custom lowered code happens to be -right, but we shouldn't have to custom lower anything. This is probably related -to <2 x i64> ops being so bad. - -//===---------------------------------------------------------------------===// - -LLVM currently generates stack realignment code, when it is not necessary -needed. The problem is that we need to know about stack alignment too early, -before RA runs. - -At that point we don't know, whether there will be vector spill, or not. -Stack realignment logic is overly conservative here, but otherwise we can -produce unaligned loads/stores. - -Fixing this will require some huge RA changes. - -Testcase: -#include <emmintrin.h> - -typedef short vSInt16 __attribute__ ((__vector_size__ (16))); - -static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873, -- 22725, - 12873};; - -vSInt16 madd(vSInt16 b) -{ - return _mm_madd_epi16(a, b); -} - -Generated code (x86-32, linux): -madd: - pushl %ebp - movl %esp, %ebp - andl $-16, %esp - movaps .LCPI1_0, %xmm1 - pmaddwd %xmm1, %xmm0 - movl %ebp, %esp - popl %ebp - ret - -//===---------------------------------------------------------------------===// - -Consider: +//===---------------------------------------------------------------------===// +// Random ideas for the X86 backend: SSE-specific stuff. +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// + +SSE Variable shift can be custom lowered to something like this, which uses a +small table + unaligned load + shuffle instead of going through memory. + +__m128i_shift_right: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + +... +__m128i shift_right(__m128i value, unsigned long offset) { + return _mm_shuffle_epi8(value, + _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset))); +} + +//===---------------------------------------------------------------------===// + +SSE has instructions for doing operations on complex numbers, we should pattern +match them. 
For example, this should turn into a horizontal add: + +typedef float __attribute__((vector_size(16))) v4f32; +float f32(v4f32 A) { + return A[0]+A[1]+A[2]+A[3]; +} + +Instead we get this: + +_f32: ## @f32 + pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0] + addss %xmm0, %xmm1 + pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0] + movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1] + movaps %xmm0, %xmm3 + addss %xmm1, %xmm3 + movdqa %xmm2, %xmm0 + addss %xmm3, %xmm0 + ret + +Also, there are cases where some simple local SLP would improve codegen a bit. +compiling this: + +_Complex float f32(_Complex float A, _Complex float B) { + return A+B; +} + +into: + +_f32: ## @f32 + movdqa %xmm0, %xmm2 + addss %xmm1, %xmm2 + pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0] + pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0] + addss %xmm1, %xmm3 + movaps %xmm2, %xmm0 + unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] + ret + +seems silly when it could just be one addps. + + +//===---------------------------------------------------------------------===// + +Expand libm rounding functions inline: Significant speedups possible. +http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html + +//===---------------------------------------------------------------------===// + +When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and +other fast SSE modes. + +//===---------------------------------------------------------------------===// + +Think about doing i64 math in SSE regs on x86-32. + +//===---------------------------------------------------------------------===// + +This testcase should have no SSE instructions in it, and only one load from +a constant pool: + +double %test3(bool %B) { + %C = select bool %B, double 123.412, double 523.01123123 + ret double %C +} + +Currently, the select is being lowered, which prevents the dag combiner from +turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)' + +The pattern isel got this one right. + +//===---------------------------------------------------------------------===// + +Lower memcpy / memset to a series of SSE 128 bit move instructions when it's +feasible. + +//===---------------------------------------------------------------------===// + +Codegen: + if (copysign(1.0, x) == copysign(1.0, y)) +into: + if (x^y & mask) +when using SSE. + +//===---------------------------------------------------------------------===// + +Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half +of a v4sf value. + +//===---------------------------------------------------------------------===// + +Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}. +Perhaps use pxor / xorp* to clear a XMM register first? + +//===---------------------------------------------------------------------===// + +External test Nurbs exposed some problems. Look for +__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. 
This is what icc +emits: + + movaps (%edx), %xmm2 #59.21 + movaps (%edx), %xmm5 #60.21 + movaps (%edx), %xmm4 #61.21 + movaps (%edx), %xmm3 #62.21 + movl 40(%ecx), %ebp #69.49 + shufps $0, %xmm2, %xmm5 #60.21 + movl 100(%esp), %ebx #69.20 + movl (%ebx), %edi #69.20 + imull %ebp, %edi #69.49 + addl (%eax), %edi #70.33 + shufps $85, %xmm2, %xmm4 #61.21 + shufps $170, %xmm2, %xmm3 #62.21 + shufps $255, %xmm2, %xmm2 #63.21 + lea (%ebp,%ebp,2), %ebx #69.49 + negl %ebx #69.49 + lea -3(%edi,%ebx), %ebx #70.33 + shll $4, %ebx #68.37 + addl 32(%ecx), %ebx #68.37 + testb $15, %bl #91.13 + jne L_B1.24 # Prob 5% #91.13 + +This is the llvm code after instruction scheduling: + +cond_next140 (0xa910740, LLVM BB @0xa90beb0): + %reg1078 = MOV32ri -3 + %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0 + %reg1037 = MOV32rm %reg1024, 1, %noreg, 40 + %reg1080 = IMUL32rr %reg1079, %reg1037 + %reg1081 = MOV32rm %reg1058, 1, %noreg, 0 + %reg1038 = LEA32r %reg1081, 1, %reg1080, -3 + %reg1036 = MOV32rm %reg1024, 1, %noreg, 32 + %reg1082 = SHL32ri %reg1038, 4 + %reg1039 = ADD32rr %reg1036, %reg1082 + %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0 + %reg1034 = SHUFPSrr %reg1083, %reg1083, 170 + %reg1032 = SHUFPSrr %reg1083, %reg1083, 0 + %reg1035 = SHUFPSrr %reg1083, %reg1083, 255 + %reg1033 = SHUFPSrr %reg1083, %reg1083, 85 + %reg1040 = MOV32rr %reg1039 + %reg1084 = AND32ri8 %reg1039, 15 + CMP32ri8 %reg1084, 0 + JE mbb<cond_next204,0xa914d30> + +Still ok. After register allocation: + +cond_next140 (0xa910740, LLVM BB @0xa90beb0): + %eax = MOV32ri -3 + %edx = MOV32rm %stack.3, 1, %noreg, 0 + ADD32rm %eax<def&use>, %edx, 1, %noreg, 0 + %edx = MOV32rm %stack.7, 1, %noreg, 0 + %edx = MOV32rm %edx, 1, %noreg, 40 + IMUL32rr %eax<def&use>, %edx + %esi = MOV32rm %stack.5, 1, %noreg, 0 + %esi = MOV32rm %esi, 1, %noreg, 0 + MOV32mr %stack.4, 1, %noreg, 0, %esi + %eax = LEA32r %esi, 1, %eax, -3 + %esi = MOV32rm %stack.7, 1, %noreg, 0 + %esi = MOV32rm %esi, 1, %noreg, 32 + %edi = MOV32rr %eax + SHL32ri %edi<def&use>, 4 + ADD32rr %edi<def&use>, %esi + %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0 + %xmm1 = MOVAPSrr %xmm0 + SHUFPSrr %xmm1<def&use>, %xmm1, 170 + %xmm2 = MOVAPSrr %xmm0 + SHUFPSrr %xmm2<def&use>, %xmm2, 0 + %xmm3 = MOVAPSrr %xmm0 + SHUFPSrr %xmm3<def&use>, %xmm3, 255 + SHUFPSrr %xmm0<def&use>, %xmm0, 85 + %ebx = MOV32rr %edi + AND32ri8 %ebx<def&use>, 15 + CMP32ri8 %ebx, 0 + JE mbb<cond_next204,0xa914d30> + +This looks really bad. The problem is shufps is a destructive opcode. Since it +appears as operand two in more than one shufps ops. It resulted in a number of +copies. Note icc also suffers from the same problem. Either the instruction +selector should select pshufd or The register allocator can made the two-address +to three-address transformation. + +It also exposes some other problems. See MOV32ri -3 and the spills. + +//===---------------------------------------------------------------------===// + +Consider: + +__m128 test(float a) { + return _mm_set_ps(0.0, 0.0, 0.0, a*a); +} + +This compiles into: + +movss 4(%esp), %xmm1 +mulss %xmm1, %xmm1 +xorps %xmm0, %xmm0 +movss %xmm1, %xmm0 +ret + +Because mulss doesn't modify the top 3 elements, the top elements of +xmm1 are already zero'd. We could compile this to: + +movss 4(%esp), %xmm0 +mulss %xmm0, %xmm0 +ret + +//===---------------------------------------------------------------------===// + +Here's a sick and twisted idea. Consider code like this: + +__m128 test(__m128 a) { + float b = *(float*)&A; + ... 
+ return _mm_set_ps(0.0, 0.0, 0.0, b); +} + +This might compile to this code: + +movaps c(%esp), %xmm1 +xorps %xmm0, %xmm0 +movss %xmm1, %xmm0 +ret + +Now consider if the ... code caused xmm1 to get spilled. This might produce +this code: + +movaps c(%esp), %xmm1 +movaps %xmm1, c2(%esp) +... + +xorps %xmm0, %xmm0 +movaps c2(%esp), %xmm1 +movss %xmm1, %xmm0 +ret + +However, since the reload is only used by these instructions, we could +"fold" it into the uses, producing something like this: + +movaps c(%esp), %xmm1 +movaps %xmm1, c2(%esp) +... + +movss c2(%esp), %xmm0 +ret + +... saving two instructions. + +The basic idea is that a reload from a spill slot, can, if only one 4-byte +chunk is used, bring in 3 zeros the one element instead of 4 elements. +This can be used to simplify a variety of shuffle operations, where the +elements are fixed zeros. + +//===---------------------------------------------------------------------===// + +This code generates ugly code, probably due to costs being off or something: + +define void @test(float* %P, <4 x float>* %P2 ) { + %xFloat0.688 = load float* %P + %tmp = load <4 x float>* %P2 + %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3 + store <4 x float> %inFloat3.713, <4 x float>* %P2 + ret void +} + +Generates: + +_test: + movl 8(%esp), %eax + movaps (%eax), %xmm0 + pxor %xmm1, %xmm1 + movaps %xmm0, %xmm2 + shufps $50, %xmm1, %xmm2 + shufps $132, %xmm2, %xmm0 + movaps %xmm0, (%eax) + ret + +Would it be better to generate: + +_test: + movl 8(%esp), %ecx + movaps (%ecx), %xmm0 + xor %eax, %eax + pinsrw $6, %eax, %xmm0 + pinsrw $7, %eax, %xmm0 + movaps %xmm0, (%ecx) + ret + +? + +//===---------------------------------------------------------------------===// + +Some useful information in the Apple Altivec / SSE Migration Guide: + +http://developer.apple.com/documentation/Performance/Conceptual/ +Accelerate_sse_migration/index.html + +e.g. SSE select using and, andnot, or. Various SSE compare translations. + +//===---------------------------------------------------------------------===// + +Add hooks to commute some CMPP operations. + +//===---------------------------------------------------------------------===// + +Apply the same transformation that merged four float into a single 128-bit load +to loads from constant pool. + +//===---------------------------------------------------------------------===// + +Floating point max / min are commutable when -enable-unsafe-fp-path is +specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other +nodes which are selected to max / min instructions that are marked commutable. + +//===---------------------------------------------------------------------===// + +We should materialize vector constants like "all ones" and "signbit" with +code like: + + cmpeqps xmm1, xmm1 ; xmm1 = all-ones + +and: + cmpeqps xmm1, xmm1 ; xmm1 = all-ones + psrlq xmm1, 31 ; xmm1 = all 100000000000... + +instead of using a load from the constant pool. The later is important for +ABS/NEG/copysign etc. 
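
A rough C-level sketch of the register-only recipes (helper names are made up,
and a compiler may well constant-fold these back into constant-pool loads; the
point is the instruction sequence we would like codegen to pick):

#include <xmmintrin.h>
#include <emmintrin.h>

static __m128 all_ones_ps(void) {
  __m128 z = _mm_setzero_ps();
  return _mm_cmpeq_ps(z, z);                 /* cmpeqps xmm, xmm -> all ones */
}

static __m128 signbit_mask_ps(void) {
  __m128i ones = _mm_castps_si128(all_ones_ps());
  return _mm_castsi128_ps(_mm_slli_epi32(ones, 31));  /* 0x80000000 per lane */
}

static __m128 fabs_ps(__m128 v) {            /* ABS without touching memory  */
  return _mm_andnot_ps(signbit_mask_ps(), v);
}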
+ +//===---------------------------------------------------------------------===// + +These functions: + +#include <xmmintrin.h> +__m128i a; +void x(unsigned short n) { + a = _mm_slli_epi32 (a, n); +} +void y(unsigned n) { + a = _mm_slli_epi32 (a, n); +} + +compile to ( -O3 -static -fomit-frame-pointer): +_x: + movzwl 4(%esp), %eax + movd %eax, %xmm0 + movaps _a, %xmm1 + pslld %xmm0, %xmm1 + movaps %xmm1, _a + ret +_y: + movd 4(%esp), %xmm0 + movaps _a, %xmm1 + pslld %xmm0, %xmm1 + movaps %xmm1, _a + ret + +"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems +like movd would be sufficient in both cases as the value is already zero +extended in the 32-bit stack slot IIRC. For signed short, it should also be +save, as a really-signed value would be undefined for pslld. + + +//===---------------------------------------------------------------------===// + +#include <math.h> +int t1(double d) { return signbit(d); } + +This currently compiles to: + subl $12, %esp + movsd 16(%esp), %xmm0 + movsd %xmm0, (%esp) + movl 4(%esp), %eax + shrl $31, %eax + addl $12, %esp + ret + +We should use movmskp{s|d} instead. + +//===---------------------------------------------------------------------===// + +CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single +(aligned) vector load. This functionality has a couple of problems. + +1. The code to infer alignment from loads of globals is in the X86 backend, + not the dag combiner. This is because dagcombine2 needs to be able to see + through the X86ISD::Wrapper node, which DAGCombine can't really do. +2. The code for turning 4 x load into a single vector load is target + independent and should be moved to the dag combiner. +3. The code for turning 4 x load into a vector load can only handle a direct + load from a global or a direct load from the stack. It should be generalized + to handle any load from P, P+4, P+8, P+12, where P can be anything. +4. The alignment inference code cannot handle loads from globals in non-static + mode because it doesn't look through the extra dyld stub load. If you try + vec_align.ll without -relocation-model=static, you'll see what I mean. + +//===---------------------------------------------------------------------===// + +We should lower store(fneg(load p), q) into an integer load+xor+store, which +eliminates a constant pool load. For example, consider: + +define i64 @ccosf(float %z.0, float %z.1) nounwind readonly { +entry: + %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1] + %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly + ret i64 %tmp20 +} +declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly + +This currently compiles to: + +LCPI1_0: # <4 x float> + .long 2147483648 # float -0 + .long 2147483648 # float -0 + .long 2147483648 # float -0 + .long 2147483648 # float -0 +_ccosf: + subl $12, %esp + movss 16(%esp), %xmm0 + movss %xmm0, 4(%esp) + movss 20(%esp), %xmm0 + xorps LCPI1_0, %xmm0 + movss %xmm0, (%esp) + call L_ccoshf$stub + addl $12, %esp + ret + +Note the load into xmm0, then xor (to negate), then store. In PIC mode, +this code computes the pic base and does two loads to do the constant pool +load, so the improvement is much bigger. + +The tricky part about this xform is that the argument load/store isn't exposed +until post-legalize, and at that point, the fneg has been custom expanded into +an X86 fxor. This means that we need to handle this case in the x86 backend +instead of in target independent code. 
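
At the C level the transform is just a sign-bit flip done with integer ops; a
minimal sketch (memcpy stands in for the bitcast, the function name is made up):

#include <stdint.h>
#include <string.h>

void store_fneg(const float *p, float *q) {
  uint32_t bits;
  memcpy(&bits, p, sizeof bits);   /* integer load                    */
  bits ^= 0x80000000u;             /* flip the sign bit (the fneg)    */
  memcpy(q, &bits, sizeof bits);   /* integer store, no xorps / LCPI  */
}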
+ +//===---------------------------------------------------------------------===// + +Non-SSE4 insert into 16 x i8 is atrociously bad. + +//===---------------------------------------------------------------------===// + +<2 x i64> extract is substantially worse than <2 x f64>, even if the destination +is memory. + +//===---------------------------------------------------------------------===// + +INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert +any number of 0.0 simultaneously. Currently we only use it for simple +insertions. + +See comments in LowerINSERT_VECTOR_ELT_SSE4. + +//===---------------------------------------------------------------------===// + +On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not +Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are +legal, it'll just take a few extra patterns written in the .td file. + +Note: this is not a code quality issue; the custom lowered code happens to be +right, but we shouldn't have to custom lower anything. This is probably related +to <2 x i64> ops being so bad. + +//===---------------------------------------------------------------------===// + +LLVM currently generates stack realignment code, when it is not necessary +needed. The problem is that we need to know about stack alignment too early, +before RA runs. + +At that point we don't know, whether there will be vector spill, or not. +Stack realignment logic is overly conservative here, but otherwise we can +produce unaligned loads/stores. + +Fixing this will require some huge RA changes. + +Testcase: #include <emmintrin.h> -__m128 foo2 (float x) { - return _mm_set_ps (0, 0, x, 0); -} - -In x86-32 mode, we generate this spiffy code: - -_foo2: - movss 4(%esp), %xmm0 - pshufd $81, %xmm0, %xmm0 - ret - -in x86-64 mode, we generate this code, which could be better: - -_foo2: - xorps %xmm1, %xmm1 - movss %xmm0, %xmm1 - pshufd $81, %xmm1, %xmm0 - ret - -In sse4 mode, we could use insertps to make both better. - -Here's another testcase that could use insertps [mem]: - -#include <xmmintrin.h> -extern float x2, x3; -__m128 foo1 (float x1, float x4) { - return _mm_set_ps (x2, x1, x3, x4); -} - -gcc mainline compiles it to: - -foo1: - insertps $0x10, x2(%rip), %xmm0 - insertps $0x10, x3(%rip), %xmm1 - movaps %xmm1, %xmm2 - movlhps %xmm0, %xmm2 - movaps %xmm2, %xmm0 - ret - -//===---------------------------------------------------------------------===// - -We compile vector multiply-by-constant into poor code: - -define <4 x i32> @f(<4 x i32> %i) nounwind { - %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 > - ret <4 x i32> %A -} - -On targets without SSE4.1, this compiles into: - -LCPI1_0: ## <4 x i32> - .long 10 - .long 10 - .long 10 - .long 10 - .text - .align 4,0x90 - .globl _f -_f: - pshufd $3, %xmm0, %xmm1 - movd %xmm1, %eax - imull LCPI1_0+12, %eax - movd %eax, %xmm1 - pshufd $1, %xmm0, %xmm2 - movd %xmm2, %eax - imull LCPI1_0+4, %eax - movd %eax, %xmm2 - punpckldq %xmm1, %xmm2 - movd %xmm0, %eax - imull LCPI1_0, %eax - movd %eax, %xmm1 - movhlps %xmm0, %xmm0 - movd %xmm0, %eax - imull LCPI1_0+8, %eax - movd %eax, %xmm0 - punpckldq %xmm0, %xmm1 - movaps %xmm1, %xmm0 - punpckldq %xmm2, %xmm0 - ret - -It would be better to synthesize integer vector multiplication by constants -using shifts and adds, pslld and paddd here. And even on targets with SSE4.1, -simple cases such as multiplication by powers of two would be better as -vector shifts than as multiplications. 
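
For the multiply-by-10 case above, the expansion we would like is roughly this
(an SSE2 intrinsics sketch; the helper name is made up):

#include <emmintrin.h>

static __m128i mul10_epi32(__m128i x) {
  __m128i x8 = _mm_slli_epi32(x, 3);   /* x * 8             */
  __m128i x2 = _mm_slli_epi32(x, 1);   /* x * 2             */
  return _mm_add_epi32(x8, x2);        /* pslld/pslld/paddd */
}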
- -//===---------------------------------------------------------------------===// - -We compile this: - -__m128i -foo2 (char x) -{ - return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0); -} - -into: - movl $1, %eax - xorps %xmm0, %xmm0 - pinsrw $2, %eax, %xmm0 - movzbl 4(%esp), %eax - pinsrw $3, %eax, %xmm0 - movl $256, %eax - pinsrw $7, %eax, %xmm0 - ret - - -gcc-4.2: - subl $12, %esp - movzbl 16(%esp), %eax - movdqa LC0, %xmm0 - pinsrw $3, %eax, %xmm0 - addl $12, %esp - ret - .const - .align 4 -LC0: - .word 0 - .word 0 - .word 1 - .word 0 - .word 0 - .word 0 - .word 0 - .word 256 - -With SSE4, it should be - movdqa .LC0(%rip), %xmm0 - pinsrb $6, %edi, %xmm0 - -//===---------------------------------------------------------------------===// - -We should transform a shuffle of two vectors of constants into a single vector -of constants. Also, insertelement of a constant into a vector of constants -should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll. - -We compiled it to something horrible: - - .align 4 -LCPI1_1: ## float - .long 1065353216 ## float 1 - .const - - .align 4 -LCPI1_0: ## <4 x float> - .space 4 - .long 1065353216 ## float 1 - .space 4 - .long 1065353216 ## float 1 - .text - .align 4,0x90 - .globl _t -_t: - xorps %xmm0, %xmm0 - movhps LCPI1_0, %xmm0 - movss LCPI1_1, %xmm1 - movaps %xmm0, %xmm2 - shufps $2, %xmm1, %xmm2 - shufps $132, %xmm2, %xmm0 - movaps %xmm0, 0 - -//===---------------------------------------------------------------------===// -rdar://5907648 - -This function: - -float foo(unsigned char x) { - return x; -} - -compiles to (x86-32): - -define float @foo(i8 zeroext %x) nounwind { - %tmp12 = uitofp i8 %x to float ; <float> [#uses=1] - ret float %tmp12 -} - -compiles to: - -_foo: - subl $4, %esp - movzbl 8(%esp), %eax - cvtsi2ss %eax, %xmm0 - movss %xmm0, (%esp) - flds (%esp) - addl $4, %esp - ret - -We should be able to use: - cvtsi2ss 8($esp), %xmm0 -since we know the stack slot is already zext'd. - -//===---------------------------------------------------------------------===// - -Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64)) -when code size is critical. movlps is slower than movsd on core2 but it's one -byte shorter. - -//===---------------------------------------------------------------------===// - -We should use a dynamic programming based approach to tell when using FPStack -operations is cheaper than SSE. SciMark montecarlo contains code like this -for example: - -double MonteCarlo_num_flops(int Num_samples) { - return ((double) Num_samples)* 4.0; -} - -In fpstack mode, this compiles into: - -LCPI1_0: - .long 1082130432 ## float 4.000000e+00 -_MonteCarlo_num_flops: - subl $4, %esp - movl 8(%esp), %eax - movl %eax, (%esp) - fildl (%esp) - fmuls LCPI1_0 - addl $4, %esp - ret - -in SSE mode, it compiles into significantly slower code: - -_MonteCarlo_num_flops: - subl $12, %esp - cvtsi2sd 16(%esp), %xmm0 - mulsd LCPI1_0, %xmm0 - movsd %xmm0, (%esp) - fldl (%esp) - addl $12, %esp - ret - -There are also other cases in scimark where using fpstack is better, it is -cheaper to do fld1 than load from a constant pool for example, so -"load, add 1.0, store" is better done in the fp stack, etc. - -//===---------------------------------------------------------------------===// - -These should compile into the same code (PR6214): Perhaps instcombine should -canonicalize the former into the later? 
- -define float @foo(float %x) nounwind { - %t = bitcast float %x to i32 - %s = and i32 %t, 2147483647 - %d = bitcast i32 %s to float - ret float %d -} - -declare float @fabsf(float %n) -define float @bar(float %x) nounwind { - %d = call float @fabsf(float %x) - ret float %d -} - -//===---------------------------------------------------------------------===// - -This IR (from PR6194): - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-apple-darwin10.0.0" - -%0 = type { double, double } -%struct.float3 = type { float, float, float } - -define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp { -entry: - %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1] - %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1] - %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1] - %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1] - %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1] - %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1] - %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1] - store float %tmp12, float* %tmp5 - ret void -} - -Compiles to: - -_test: ## @test - movd %xmm0, %rax - shrq $32, %rax - movl %eax, 4(%rdi) - ret - -This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and -doing a shuffle from v[1] to v[0] then a float store. - -//===---------------------------------------------------------------------===// - -[UNSAFE FP] - -void foo(double, double, double); -void norm(double x, double y, double z) { - double scale = __builtin_sqrt(x*x + y*y + z*z); - foo(x/scale, y/scale, z/scale); -} - -We currently generate an sqrtsd and 3 divsd instructions. This is bad, fp div is -slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first -and emit 3 mulsd in place of the divs. This can be done as a target-independent -transform. - -If we're dealing with floats instead of doubles we could even replace the sqrtss -and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the -cost of reduced accuracy. - -//===---------------------------------------------------------------------===// + +typedef short vSInt16 __attribute__ ((__vector_size__ (16))); + +static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873, +- 22725, - 12873};; + +vSInt16 madd(vSInt16 b) +{ + return _mm_madd_epi16(a, b); +} + +Generated code (x86-32, linux): +madd: + pushl %ebp + movl %esp, %ebp + andl $-16, %esp + movaps .LCPI1_0, %xmm1 + pmaddwd %xmm1, %xmm0 + movl %ebp, %esp + popl %ebp + ret + +//===---------------------------------------------------------------------===// + +Consider: +#include <emmintrin.h> +__m128 foo2 (float x) { + return _mm_set_ps (0, 0, x, 0); +} + +In x86-32 mode, we generate this spiffy code: + +_foo2: + movss 4(%esp), %xmm0 + pshufd $81, %xmm0, %xmm0 + ret + +in x86-64 mode, we generate this code, which could be better: + +_foo2: + xorps %xmm1, %xmm1 + movss %xmm0, %xmm1 + pshufd $81, %xmm1, %xmm0 + ret + +In sse4 mode, we could use insertps to make both better. 
+ +Here's another testcase that could use insertps [mem]: + +#include <xmmintrin.h> +extern float x2, x3; +__m128 foo1 (float x1, float x4) { + return _mm_set_ps (x2, x1, x3, x4); +} + +gcc mainline compiles it to: + +foo1: + insertps $0x10, x2(%rip), %xmm0 + insertps $0x10, x3(%rip), %xmm1 + movaps %xmm1, %xmm2 + movlhps %xmm0, %xmm2 + movaps %xmm2, %xmm0 + ret + +//===---------------------------------------------------------------------===// + +We compile vector multiply-by-constant into poor code: + +define <4 x i32> @f(<4 x i32> %i) nounwind { + %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 > + ret <4 x i32> %A +} + +On targets without SSE4.1, this compiles into: + +LCPI1_0: ## <4 x i32> + .long 10 + .long 10 + .long 10 + .long 10 + .text + .align 4,0x90 + .globl _f +_f: + pshufd $3, %xmm0, %xmm1 + movd %xmm1, %eax + imull LCPI1_0+12, %eax + movd %eax, %xmm1 + pshufd $1, %xmm0, %xmm2 + movd %xmm2, %eax + imull LCPI1_0+4, %eax + movd %eax, %xmm2 + punpckldq %xmm1, %xmm2 + movd %xmm0, %eax + imull LCPI1_0, %eax + movd %eax, %xmm1 + movhlps %xmm0, %xmm0 + movd %xmm0, %eax + imull LCPI1_0+8, %eax + movd %eax, %xmm0 + punpckldq %xmm0, %xmm1 + movaps %xmm1, %xmm0 + punpckldq %xmm2, %xmm0 + ret + +It would be better to synthesize integer vector multiplication by constants +using shifts and adds, pslld and paddd here. And even on targets with SSE4.1, +simple cases such as multiplication by powers of two would be better as +vector shifts than as multiplications. + +//===---------------------------------------------------------------------===// + +We compile this: + +__m128i +foo2 (char x) +{ + return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0); +} + +into: + movl $1, %eax + xorps %xmm0, %xmm0 + pinsrw $2, %eax, %xmm0 + movzbl 4(%esp), %eax + pinsrw $3, %eax, %xmm0 + movl $256, %eax + pinsrw $7, %eax, %xmm0 + ret + + +gcc-4.2: + subl $12, %esp + movzbl 16(%esp), %eax + movdqa LC0, %xmm0 + pinsrw $3, %eax, %xmm0 + addl $12, %esp + ret + .const + .align 4 +LC0: + .word 0 + .word 0 + .word 1 + .word 0 + .word 0 + .word 0 + .word 0 + .word 256 + +With SSE4, it should be + movdqa .LC0(%rip), %xmm0 + pinsrb $6, %edi, %xmm0 + +//===---------------------------------------------------------------------===// + +We should transform a shuffle of two vectors of constants into a single vector +of constants. Also, insertelement of a constant into a vector of constants +should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll. + +We compiled it to something horrible: + + .align 4 +LCPI1_1: ## float + .long 1065353216 ## float 1 + .const + + .align 4 +LCPI1_0: ## <4 x float> + .space 4 + .long 1065353216 ## float 1 + .space 4 + .long 1065353216 ## float 1 + .text + .align 4,0x90 + .globl _t +_t: + xorps %xmm0, %xmm0 + movhps LCPI1_0, %xmm0 + movss LCPI1_1, %xmm1 + movaps %xmm0, %xmm2 + shufps $2, %xmm1, %xmm2 + shufps $132, %xmm2, %xmm0 + movaps %xmm0, 0 + +//===---------------------------------------------------------------------===// +rdar://5907648 + +This function: + +float foo(unsigned char x) { + return x; +} + +compiles to (x86-32): + +define float @foo(i8 zeroext %x) nounwind { + %tmp12 = uitofp i8 %x to float ; <float> [#uses=1] + ret float %tmp12 +} + +compiles to: + +_foo: + subl $4, %esp + movzbl 8(%esp), %eax + cvtsi2ss %eax, %xmm0 + movss %xmm0, (%esp) + flds (%esp) + addl $4, %esp + ret + +We should be able to use: + cvtsi2ss 8($esp), %xmm0 +since we know the stack slot is already zext'd. 
+ +//===---------------------------------------------------------------------===// + +Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64)) +when code size is critical. movlps is slower than movsd on core2 but it's one +byte shorter. + +//===---------------------------------------------------------------------===// + +We should use a dynamic programming based approach to tell when using FPStack +operations is cheaper than SSE. SciMark montecarlo contains code like this +for example: + +double MonteCarlo_num_flops(int Num_samples) { + return ((double) Num_samples)* 4.0; +} + +In fpstack mode, this compiles into: + +LCPI1_0: + .long 1082130432 ## float 4.000000e+00 +_MonteCarlo_num_flops: + subl $4, %esp + movl 8(%esp), %eax + movl %eax, (%esp) + fildl (%esp) + fmuls LCPI1_0 + addl $4, %esp + ret + +in SSE mode, it compiles into significantly slower code: + +_MonteCarlo_num_flops: + subl $12, %esp + cvtsi2sd 16(%esp), %xmm0 + mulsd LCPI1_0, %xmm0 + movsd %xmm0, (%esp) + fldl (%esp) + addl $12, %esp + ret + +There are also other cases in scimark where using fpstack is better, it is +cheaper to do fld1 than load from a constant pool for example, so +"load, add 1.0, store" is better done in the fp stack, etc. + +//===---------------------------------------------------------------------===// + +These should compile into the same code (PR6214): Perhaps instcombine should +canonicalize the former into the later? + +define float @foo(float %x) nounwind { + %t = bitcast float %x to i32 + %s = and i32 %t, 2147483647 + %d = bitcast i32 %s to float + ret float %d +} + +declare float @fabsf(float %n) +define float @bar(float %x) nounwind { + %d = call float @fabsf(float %x) + ret float %d +} + +//===---------------------------------------------------------------------===// + +This IR (from PR6194): + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-darwin10.0.0" + +%0 = type { double, double } +%struct.float3 = type { float, float, float } + +define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp { +entry: + %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1] + %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1] + %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1] + %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1] + %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1] + %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1] + %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1] + store float %tmp12, float* %tmp5 + ret void +} + +Compiles to: + +_test: ## @test + movd %xmm0, %rax + shrq $32, %rax + movl %eax, 4(%rdi) + ret + +This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and +doing a shuffle from v[1] to v[0] then a float store. + +//===---------------------------------------------------------------------===// + +[UNSAFE FP] + +void foo(double, double, double); +void norm(double x, double y, double z) { + double scale = __builtin_sqrt(x*x + y*y + z*z); + foo(x/scale, y/scale, z/scale); +} + +We currently generate an sqrtsd and 3 divsd instructions. This is bad, fp div is +slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first +and emit 3 mulsd in place of the divs. This can be done as a target-independent +transform. 
+ +If we're dealing with floats instead of doubles we could even replace the sqrtss +and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the +cost of reduced accuracy. + +//===---------------------------------------------------------------------===// diff --git a/contrib/libs/llvm12/lib/Target/X86/README-X86-64.txt b/contrib/libs/llvm12/lib/Target/X86/README-X86-64.txt index a3ea4595ac..d919c697bd 100644 --- a/contrib/libs/llvm12/lib/Target/X86/README-X86-64.txt +++ b/contrib/libs/llvm12/lib/Target/X86/README-X86-64.txt @@ -1,184 +1,184 @@ -//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===// - -AMD64 Optimization Manual 8.2 has some nice information about optimizing integer -multiplication by a constant. How much of it applies to Intel's X86-64 -implementation? There are definite trade-offs to consider: latency vs. register -pressure vs. code size. - -//===---------------------------------------------------------------------===// - -Are we better off using branches instead of cmove to implement FP to -unsigned i64? - -_conv: - ucomiss LC0(%rip), %xmm0 - cvttss2siq %xmm0, %rdx - jb L3 - subss LC0(%rip), %xmm0 - movabsq $-9223372036854775808, %rax - cvttss2siq %xmm0, %rdx - xorq %rax, %rdx -L3: - movq %rdx, %rax - ret - -instead of - -_conv: - movss LCPI1_0(%rip), %xmm1 - cvttss2siq %xmm0, %rcx - movaps %xmm0, %xmm2 - subss %xmm1, %xmm2 - cvttss2siq %xmm2, %rax - movabsq $-9223372036854775808, %rdx - xorq %rdx, %rax - ucomiss %xmm1, %xmm0 - cmovb %rcx, %rax - ret - -Seems like the jb branch has high likelihood of being taken. It would have -saved a few instructions. - -//===---------------------------------------------------------------------===// - -It's not possible to reference AH, BH, CH, and DH registers in an instruction -requiring REX prefix. However, divb and mulb both produce results in AH. If isel -emits a CopyFromReg which gets turned into a movb and that can be allocated a -r8b - r15b. - -To get around this, isel emits a CopyFromReg from AX and then right shift it -down by 8 and truncate it. It's not pretty but it works. We need some register -allocation magic to make the hack go away (e.g. putting additional constraints -on the result of the movb). - -//===---------------------------------------------------------------------===// - -The x86-64 ABI for hidden-argument struct returns requires that the -incoming value of %rdi be copied into %rax by the callee upon return. - -The idea is that it saves callers from having to remember this value, -which would often require a callee-saved register. Callees usually -need to keep this value live for most of their body anyway, so it -doesn't add a significant burden on them. - -We currently implement this in codegen, however this is suboptimal -because it means that it would be quite awkward to implement the -optimization for callers. - -A better implementation would be to relax the LLVM IR rules for sret -arguments to allow a function with an sret argument to have a non-void -return type, and to have the front-end to set up the sret argument value -as the return value of the function. The front-end could more easily -emit uses of the returned struct value to be in terms of the function's -lowered return value, and it would free non-C frontends from a -complication only required by a C-based ABI. 
- -//===---------------------------------------------------------------------===// - -We get a redundant zero extension for code like this: - -int mask[1000]; -int foo(unsigned x) { - if (x < 10) - x = x * 45; - else - x = x * 78; - return mask[x]; -} - -_foo: -LBB1_0: ## entry - cmpl $9, %edi - jbe LBB1_3 ## bb -LBB1_1: ## bb1 - imull $78, %edi, %eax -LBB1_2: ## bb2 - movl %eax, %eax <---- - movq _mask@GOTPCREL(%rip), %rcx - movl (%rcx,%rax,4), %eax - ret -LBB1_3: ## bb - imull $45, %edi, %eax - jmp LBB1_2 ## bb2 - -Before regalloc, we have: - - %reg1025 = IMUL32rri8 %reg1024, 45, implicit-def %eflags - JMP mbb<bb2,0x203afb0> - Successors according to CFG: 0x203afb0 (#3) - -bb1: 0x203af60, LLVM BB @0x1e02310, ID#2: - Predecessors according to CFG: 0x203aec0 (#0) - %reg1026 = IMUL32rri8 %reg1024, 78, implicit-def %eflags - Successors according to CFG: 0x203afb0 (#3) - -bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3: - Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2) - %reg1027 = PHI %reg1025, mbb<bb,0x203af10>, - %reg1026, mbb<bb1,0x203af60> - %reg1029 = MOVZX64rr32 %reg1027 - -so we'd have to know that IMUL32rri8 leaves the high word zero extended and to -be able to recognize the zero extend. This could also presumably be implemented -if we have whole-function selectiondags. - -//===---------------------------------------------------------------------===// - -Take the following code -(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653): -extern unsigned long table[]; -unsigned long foo(unsigned char *p) { - unsigned long tag = *p; - return table[tag >> 4] + table[tag & 0xf]; -} - -Current code generated: - movzbl (%rdi), %eax - movq %rax, %rcx - andq $240, %rcx - shrq %rcx - andq $15, %rax - movq table(,%rax,8), %rax - addq table(%rcx), %rax - ret - -Issues: -1. First movq should be movl; saves a byte. -2. Both andq's should be andl; saves another two bytes. I think this was - implemented at one point, but subsequently regressed. -3. shrq should be shrl; saves another byte. -4. The first andq can be completely eliminated by using a slightly more - expensive addressing mode. - -//===---------------------------------------------------------------------===// - -Consider the following (contrived testcase, but contains common factors): - -#include <stdarg.h> -int test(int x, ...) { - int sum, i; - va_list l; - va_start(l, x); - for (i = 0; i < x; i++) - sum += va_arg(l, int); - va_end(l); - return sum; -} - -Testcase given in C because fixing it will likely involve changing the IR -generated for it. The primary issue with the result is that it doesn't do any -of the optimizations which are possible if we know the address of a va_list -in the current function is never taken: -1. We shouldn't spill the XMM registers because we only call va_arg with "int". -2. It would be nice if we could sroa the va_list. -3. Probably overkill, but it'd be cool if we could peel off the first five -iterations of the loop. - -Other optimizations involving functions which use va_arg on floats which don't -have the address of a va_list taken: -1. Conversely to the above, we shouldn't spill general registers if we only - call va_arg on "double". -2. If we know nothing more than 64 bits wide is read from the XMM registers, - we can change the spilling code to reduce the amount of stack used by half. 
- -//===---------------------------------------------------------------------===// +//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===// + +AMD64 Optimization Manual 8.2 has some nice information about optimizing integer +multiplication by a constant. How much of it applies to Intel's X86-64 +implementation? There are definite trade-offs to consider: latency vs. register +pressure vs. code size. + +//===---------------------------------------------------------------------===// + +Are we better off using branches instead of cmove to implement FP to +unsigned i64? + +_conv: + ucomiss LC0(%rip), %xmm0 + cvttss2siq %xmm0, %rdx + jb L3 + subss LC0(%rip), %xmm0 + movabsq $-9223372036854775808, %rax + cvttss2siq %xmm0, %rdx + xorq %rax, %rdx +L3: + movq %rdx, %rax + ret + +instead of + +_conv: + movss LCPI1_0(%rip), %xmm1 + cvttss2siq %xmm0, %rcx + movaps %xmm0, %xmm2 + subss %xmm1, %xmm2 + cvttss2siq %xmm2, %rax + movabsq $-9223372036854775808, %rdx + xorq %rdx, %rax + ucomiss %xmm1, %xmm0 + cmovb %rcx, %rax + ret + +Seems like the jb branch has high likelihood of being taken. It would have +saved a few instructions. + +//===---------------------------------------------------------------------===// + +It's not possible to reference AH, BH, CH, and DH registers in an instruction +requiring REX prefix. However, divb and mulb both produce results in AH. If isel +emits a CopyFromReg which gets turned into a movb and that can be allocated a +r8b - r15b. + +To get around this, isel emits a CopyFromReg from AX and then right shift it +down by 8 and truncate it. It's not pretty but it works. We need some register +allocation magic to make the hack go away (e.g. putting additional constraints +on the result of the movb). + +//===---------------------------------------------------------------------===// + +The x86-64 ABI for hidden-argument struct returns requires that the +incoming value of %rdi be copied into %rax by the callee upon return. + +The idea is that it saves callers from having to remember this value, +which would often require a callee-saved register. Callees usually +need to keep this value live for most of their body anyway, so it +doesn't add a significant burden on them. + +We currently implement this in codegen, however this is suboptimal +because it means that it would be quite awkward to implement the +optimization for callers. + +A better implementation would be to relax the LLVM IR rules for sret +arguments to allow a function with an sret argument to have a non-void +return type, and to have the front-end to set up the sret argument value +as the return value of the function. The front-end could more easily +emit uses of the returned struct value to be in terms of the function's +lowered return value, and it would free non-C frontends from a +complication only required by a C-based ABI. 
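
For reference, a small (made-up) example of where the rule kicks in; nothing
here is LLVM-specific, it is just the SysV x86-64 ABI:

struct big { long v[4]; };            /* > 16 bytes, so returned via sret */

struct big make_big(long x) {
  struct big b = { { x, x, x, x } };
  return b;                           /* callee stores through the hidden
                                         %rdi pointer, returns it in %rax */
}

long use_it(long x) {
  return make_big(x).v[0];            /* caller can reuse %rax instead of
                                         keeping its own copy of the pointer */
}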
+ +//===---------------------------------------------------------------------===// + +We get a redundant zero extension for code like this: + +int mask[1000]; +int foo(unsigned x) { + if (x < 10) + x = x * 45; + else + x = x * 78; + return mask[x]; +} + +_foo: +LBB1_0: ## entry + cmpl $9, %edi + jbe LBB1_3 ## bb +LBB1_1: ## bb1 + imull $78, %edi, %eax +LBB1_2: ## bb2 + movl %eax, %eax <---- + movq _mask@GOTPCREL(%rip), %rcx + movl (%rcx,%rax,4), %eax + ret +LBB1_3: ## bb + imull $45, %edi, %eax + jmp LBB1_2 ## bb2 + +Before regalloc, we have: + + %reg1025 = IMUL32rri8 %reg1024, 45, implicit-def %eflags + JMP mbb<bb2,0x203afb0> + Successors according to CFG: 0x203afb0 (#3) + +bb1: 0x203af60, LLVM BB @0x1e02310, ID#2: + Predecessors according to CFG: 0x203aec0 (#0) + %reg1026 = IMUL32rri8 %reg1024, 78, implicit-def %eflags + Successors according to CFG: 0x203afb0 (#3) + +bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3: + Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2) + %reg1027 = PHI %reg1025, mbb<bb,0x203af10>, + %reg1026, mbb<bb1,0x203af60> + %reg1029 = MOVZX64rr32 %reg1027 + +so we'd have to know that IMUL32rri8 leaves the high word zero extended and to +be able to recognize the zero extend. This could also presumably be implemented +if we have whole-function selectiondags. + +//===---------------------------------------------------------------------===// + +Take the following code +(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653): +extern unsigned long table[]; +unsigned long foo(unsigned char *p) { + unsigned long tag = *p; + return table[tag >> 4] + table[tag & 0xf]; +} + +Current code generated: + movzbl (%rdi), %eax + movq %rax, %rcx + andq $240, %rcx + shrq %rcx + andq $15, %rax + movq table(,%rax,8), %rax + addq table(%rcx), %rax + ret + +Issues: +1. First movq should be movl; saves a byte. +2. Both andq's should be andl; saves another two bytes. I think this was + implemented at one point, but subsequently regressed. +3. shrq should be shrl; saves another byte. +4. The first andq can be completely eliminated by using a slightly more + expensive addressing mode. + +//===---------------------------------------------------------------------===// + +Consider the following (contrived testcase, but contains common factors): + +#include <stdarg.h> +int test(int x, ...) { + int sum, i; + va_list l; + va_start(l, x); + for (i = 0; i < x; i++) + sum += va_arg(l, int); + va_end(l); + return sum; +} + +Testcase given in C because fixing it will likely involve changing the IR +generated for it. The primary issue with the result is that it doesn't do any +of the optimizations which are possible if we know the address of a va_list +in the current function is never taken: +1. We shouldn't spill the XMM registers because we only call va_arg with "int". +2. It would be nice if we could sroa the va_list. +3. Probably overkill, but it'd be cool if we could peel off the first five +iterations of the loop. + +Other optimizations involving functions which use va_arg on floats which don't +have the address of a va_list taken: +1. Conversely to the above, we shouldn't spill general registers if we only + call va_arg on "double". +2. If we know nothing more than 64 bits wide is read from the XMM registers, + we can change the spilling code to reduce the amount of stack used by half. 
+ +//===---------------------------------------------------------------------===// diff --git a/contrib/libs/llvm12/lib/Target/X86/README.txt b/contrib/libs/llvm12/lib/Target/X86/README.txt index c06a7b1ade..6bc6e74b26 100644 --- a/contrib/libs/llvm12/lib/Target/X86/README.txt +++ b/contrib/libs/llvm12/lib/Target/X86/README.txt @@ -1,1794 +1,1794 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend. -//===---------------------------------------------------------------------===// - -Improvements to the multiply -> shift/add algorithm: -http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html - -//===---------------------------------------------------------------------===// - -Improve code like this (occurs fairly frequently, e.g. in LLVM): -long long foo(int x) { return 1LL << x; } - -http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html -http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html -http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html - -Another useful one would be ~0ULL >> X and ~0ULL << X. - -One better solution for 1LL << x is: - xorl %eax, %eax - xorl %edx, %edx - testb $32, %cl - sete %al - setne %dl - sall %cl, %eax - sall %cl, %edx - -But that requires good 8-bit subreg support. - -Also, this might be better. It's an extra shift, but it's one instruction -shorter, and doesn't stress 8-bit subreg support. -(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html, -but without the unnecessary and.) - movl %ecx, %eax - shrl $5, %eax - movl %eax, %edx - xorl $1, %edx - sall %cl, %eax - sall %cl. %edx - -64-bit shifts (in general) expand to really bad code. Instead of using -cmovs, we should expand to a conditional branch like GCC produces. - -//===---------------------------------------------------------------------===// - -Some isel ideas: - -1. Dynamic programming based approach when compile time is not an - issue. -2. Code duplication (addressing mode) during isel. -3. Other ideas from "Register-Sensitive Selection, Duplication, and - Sequencing of Instructions". -4. Scheduling for reduced register pressure. E.g. "Minimum Register - Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" - and other related papers. - http://citeseer.ist.psu.edu/govindarajan01minimum.html - -//===---------------------------------------------------------------------===// - -Should we promote i16 to i32 to avoid partial register update stalls? - -//===---------------------------------------------------------------------===// - -Leave any_extend as pseudo instruction and hint to register -allocator. Delay codegen until post register allocation. -Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach -the coalescer how to deal with it though. - -//===---------------------------------------------------------------------===// - -It appears icc use push for parameter passing. Need to investigate. - -//===---------------------------------------------------------------------===// - -The instruction selector sometimes misses folding a load into a compare. The -pattern is written as (cmp reg, (load p)). Because the compare isn't -commutative, it is not matched with the load on both sides. The dag combiner -should be made smart enough to canonicalize the load into the RHS of a compare -when it can invert the result of the compare for free. 
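
A small made-up C illustration of the two DAG shapes involved:

int lhs_load(int *p, int x) {
  return *p < x;    /* (cmp (load p), reg): not the form the pattern expects,
                       so the load is not folded                            */
}

int rhs_load(int *p, int x) {
  return x > *p;    /* same predicate with operands swapped: (cmp reg,
                       (load p)) matches and the load folds into the cmp    */
}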
- -//===---------------------------------------------------------------------===// - -In many cases, LLVM generates code like this: - -_test: - movl 8(%esp), %eax - cmpl %eax, 4(%esp) - setl %al - movzbl %al, %eax - ret - -on some processors (which ones?), it is more efficient to do this: - -_test: - movl 8(%esp), %ebx - xor %eax, %eax - cmpl %ebx, 4(%esp) - setl %al - ret - -Doing this correctly is tricky though, as the xor clobbers the flags. - -//===---------------------------------------------------------------------===// - -We should generate bts/btr/etc instructions on targets where they are cheap or -when codesize is important. e.g., for: - -void setbit(int *target, int bit) { - *target |= (1 << bit); -} -void clearbit(int *target, int bit) { - *target &= ~(1 << bit); -} - -//===---------------------------------------------------------------------===// - -Instead of the following for memset char*, 1, 10: - - movl $16843009, 4(%edx) - movl $16843009, (%edx) - movw $257, 8(%edx) - -It might be better to generate - - movl $16843009, %eax - movl %eax, 4(%edx) - movl %eax, (%edx) - movw al, 8(%edx) - -when we can spare a register. It reduces code size. - -//===---------------------------------------------------------------------===// - -Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently -get this: - -define i32 @test1(i32 %X) { - %Y = sdiv i32 %X, 8 - ret i32 %Y -} - -_test1: - movl 4(%esp), %eax - movl %eax, %ecx - sarl $31, %ecx - shrl $29, %ecx - addl %ecx, %eax - sarl $3, %eax - ret - -GCC knows several different ways to codegen it, one of which is this: - -_test1: - movl 4(%esp), %eax - cmpl $-1, %eax - leal 7(%eax), %ecx - cmovle %ecx, %eax - sarl $3, %eax - ret - -which is probably slower, but it's interesting at least :) - -//===---------------------------------------------------------------------===// - -We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl -We should leave these as libcalls for everything over a much lower threshold, -since libc is hand tuned for medium and large mem ops (avoiding RFO for large -stores, TLB preheating, etc) - -//===---------------------------------------------------------------------===// - -Optimize this into something reasonable: - x * copysign(1.0, y) * copysign(1.0, z) - -//===---------------------------------------------------------------------===// - -Optimize copysign(x, *y) to use an integer load from y. - -//===---------------------------------------------------------------------===// - -The following tests perform worse with LSR: - -lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor. - -//===---------------------------------------------------------------------===// - -Adding to the list of cmp / test poor codegen issues: - -int test(__m128 *A, __m128 *B) { - if (_mm_comige_ss(*A, *B)) - return 3; - else - return 4; -} - -_test: - movl 8(%esp), %eax - movaps (%eax), %xmm0 - movl 4(%esp), %eax - movaps (%eax), %xmm1 - comiss %xmm0, %xmm1 - setae %al - movzbl %al, %ecx - movl $3, %eax - movl $4, %edx - cmpl $0, %ecx - cmove %edx, %eax - ret - -Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There -are a number of issues. 1) We are introducing a setcc between the result of the -intrisic call and select. 2) The intrinsic is expected to produce a i32 value -so a any extend (which becomes a zero extend) is added. - -We probably need some kind of target DAG combine hook to fix this. 
- -//===---------------------------------------------------------------------===// - -We generate significantly worse code for this than GCC: -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150 -http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701 - -There is also one case we do worse on PPC. - -//===---------------------------------------------------------------------===// - -For this: - -int test(int a) -{ - return a * 3; -} - -We currently emits - imull $3, 4(%esp), %eax - -Perhaps this is what we really should generate is? Is imull three or four -cycles? Note: ICC generates this: - movl 4(%esp), %eax - leal (%eax,%eax,2), %eax - -The current instruction priority is based on pattern complexity. The former is -more "complex" because it folds a load so the latter will not be emitted. - -Perhaps we should use AddedComplexity to give LEA32r a higher priority? We -should always try to match LEA first since the LEA matching code does some -estimate to determine whether the match is profitable. - -However, if we care more about code size, then imull is better. It's two bytes -shorter than movl + leal. - -On a Pentium M, both variants have the same characteristics with regard -to throughput; however, the multiplication has a latency of four cycles, as -opposed to two cycles for the movl+lea variant. - -//===---------------------------------------------------------------------===// - -It appears gcc place string data with linkonce linkage in -.section __TEXT,__const_coal,coalesced instead of -.section __DATA,__const_coal,coalesced. -Take a look at darwin.h, there are other Darwin assembler directives that we -do not make use of. - -//===---------------------------------------------------------------------===// - -define i32 @foo(i32* %a, i32 %t) { -entry: - br label %cond_true - -cond_true: ; preds = %cond_true, %entry - %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3] - %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1] - %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1] - %tmp3 = load i32* %tmp2 ; <i32> [#uses=1] - %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1] - %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2] - %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2] - %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1] - br i1 %tmp, label %bb12, label %cond_true - -bb12: ; preds = %cond_true - ret i32 %tmp7 -} -is pessimized by -loop-reduce and -indvars - -//===---------------------------------------------------------------------===// - -u32 to float conversion improvement: - -float uint32_2_float( unsigned u ) { - float fl = (int) (u & 0xffff); - float fh = (int) (u >> 16); - fh *= 0x1.0p16f; - return fh + fl; -} - -00000000 subl $0x04,%esp -00000003 movl 0x08(%esp,1),%eax -00000007 movl %eax,%ecx -00000009 shrl $0x10,%ecx -0000000c cvtsi2ss %ecx,%xmm0 -00000010 andl $0x0000ffff,%eax -00000015 cvtsi2ss %eax,%xmm1 -00000019 mulss 0x00000078,%xmm0 -00000021 addss %xmm1,%xmm0 -00000025 movss %xmm0,(%esp,1) -0000002a flds (%esp,1) -0000002d addl $0x04,%esp -00000030 ret - -//===---------------------------------------------------------------------===// - -When using fastcc abi, align stack slot of argument of type double on 8 byte -boundary to improve performance. - -//===---------------------------------------------------------------------===// - -GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting -simplifications for integer "x cmp y ? a : b". 
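
One classic example of the kind of simplification meant here (a sketch, not
something we necessarily do today): "x < y ? -1 : 0" on unsigned operands
needs neither a branch nor a cmov, since the compare can feed sbb directly:

int lt_mask(unsigned x, unsigned y) {
  return x < y ? -1 : 0;   /* cmp y, x ; sbb %eax, %eax -> -1 if x < y, else 0 */
}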
- -//===---------------------------------------------------------------------===// - -Consider the expansion of: - -define i32 @test3(i32 %X) { - %tmp1 = urem i32 %X, 255 - ret i32 %tmp1 -} - -Currently it compiles to: - -... - movl $2155905153, %ecx - movl 8(%esp), %esi - movl %esi, %eax - mull %ecx -... - -This could be "reassociated" into: - - movl $2155905153, %eax - movl 8(%esp), %ecx - mull %ecx - -to avoid the copy. In fact, the existing two-address stuff would do this -except that mul isn't a commutative 2-addr instruction. I guess this has -to be done at isel time based on the #uses to mul? - -//===---------------------------------------------------------------------===// - -Make sure the instruction which starts a loop does not cross a cacheline -boundary. This requires knowning the exact length of each machine instruction. -That is somewhat complicated, but doable. Example 256.bzip2: - -In the new trace, the hot loop has an instruction which crosses a cacheline -boundary. In addition to potential cache misses, this can't help decoding as I -imagine there has to be some kind of complicated decoder reset and realignment -to grab the bytes from the next cacheline. - -532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines -942 942 0x3d03 movl %dh, (1809(%esp, %esi) -937 937 0x3d0a incl %esi -3 3 0x3d0b cmpb %bl, %dl -27 27 0x3d0d jnz 0x000062db <main+11707> - -//===---------------------------------------------------------------------===// - -In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE. - -//===---------------------------------------------------------------------===// - -This could be a single 16-bit load. - -int f(char *p) { - if ((p[0] == 1) & (p[1] == 2)) return 1; - return 0; -} - -//===---------------------------------------------------------------------===// - -We should inline lrintf and probably other libc functions. - -//===---------------------------------------------------------------------===// - -This code: - -void test(int X) { - if (X) abort(); -} - -is currently compiled to: - -_test: - subl $12, %esp - cmpl $0, 16(%esp) - jne LBB1_1 - addl $12, %esp - ret -LBB1_1: - call L_abort$stub - -It would be better to produce: - -_test: - subl $12, %esp - cmpl $0, 16(%esp) - jne L_abort$stub - addl $12, %esp - ret - -This can be applied to any no-return function call that takes no arguments etc. -Alternatively, the stack save/restore logic could be shrink-wrapped, producing -something like this: - -_test: - cmpl $0, 4(%esp) - jne LBB1_1 - ret -LBB1_1: - subl $12, %esp - call L_abort$stub - -Both are useful in different situations. Finally, it could be shrink-wrapped -and tail called, like this: - -_test: - cmpl $0, 4(%esp) - jne LBB1_1 - ret -LBB1_1: - pop %eax # realign stack. - call L_abort$stub - -Though this probably isn't worth it. - -//===---------------------------------------------------------------------===// - -Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with -a neg instead of a sub instruction. Consider: - -int test(char X) { return 7-X; } - -we currently produce: -_test: - movl $7, %eax - movsbl 4(%esp), %ecx - subl %ecx, %eax - ret - -We would use one fewer register if codegen'd as: - - movsbl 4(%esp), %eax - neg %eax - add $7, %eax - ret - -Note that this isn't beneficial if the load can be folded into the sub. 
In -this case, we want a sub: - -int test(int X) { return 7-X; } -_test: - movl $7, %eax - subl 4(%esp), %eax - ret - -//===---------------------------------------------------------------------===// - -Leaf functions that require one 4-byte spill slot have a prolog like this: - -_foo: - pushl %esi - subl $4, %esp -... -and an epilog like this: - addl $4, %esp - popl %esi - ret - -It would be smaller, and potentially faster, to push eax on entry and to -pop into a dummy register instead of using addl/subl of esp. Just don't pop -into any return registers :) - -//===---------------------------------------------------------------------===// - -The X86 backend should fold (branch (or (setcc, setcc))) into multiple -branches. We generate really poor code for: - -double testf(double a) { - return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0); -} - -For example, the entry BB is: - -_testf: - subl $20, %esp - pxor %xmm0, %xmm0 - movsd 24(%esp), %xmm1 - ucomisd %xmm0, %xmm1 - setnp %al - sete %cl - testb %cl, %al - jne LBB1_5 # UnifiedReturnBlock -LBB1_1: # cond_true - - -it would be better to replace the last four instructions with: - - jp LBB1_1 - je LBB1_5 -LBB1_1: - -We also codegen the inner ?: into a diamond: - - cvtss2sd LCPI1_0(%rip), %xmm2 - cvtss2sd LCPI1_1(%rip), %xmm3 - ucomisd %xmm1, %xmm0 - ja LBB1_3 # cond_true -LBB1_2: # cond_true - movapd %xmm3, %xmm2 -LBB1_3: # cond_true - movapd %xmm2, %xmm0 - ret - -We should sink the load into xmm3 into the LBB1_2 block. This should -be pretty easy, and will nuke all the copies. - -//===---------------------------------------------------------------------===// - -This: - #include <algorithm> - inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b) - { return std::make_pair(a + b, a + b < a); } - bool no_overflow(unsigned a, unsigned b) - { return !full_add(a, b).second; } - -Should compile to: - addl %esi, %edi - setae %al - movzbl %al, %eax - ret - -on x86-64, instead of the rather stupid-looking: - addl %esi, %edi - setb %al - xorb $1, %al - movzbl %al, %eax - ret - - -//===---------------------------------------------------------------------===// - -The following code: - -bb114.preheader: ; preds = %cond_next94 - %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1] - %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1] - %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1] - %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1] - %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1] - %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2] - %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1] - %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1] - %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1] - %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1] - %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1] - br label %bb114 - -produces: - -LBB3_5: # bb114.preheader - movswl -68(%ebp), %eax - movl $32, %ecx - movl %ecx, -80(%ebp) - subl %eax, -80(%ebp) - movswl -52(%ebp), %eax - movl %ecx, -84(%ebp) - subl %eax, -84(%ebp) - movswl -70(%ebp), %eax - movl %ecx, -88(%ebp) - subl %eax, -88(%ebp) - movswl -50(%ebp), %eax - subl %eax, %ecx - movl %ecx, -76(%ebp) - movswl -42(%ebp), %eax - movl %eax, -92(%ebp) - movswl -66(%ebp), %eax - movl %eax, -96(%ebp) - movw $0, -98(%ebp) - -This appears to be bad because the RA is not folding the store to the stack -slot into the movl. The above instructions could be: - movl $32, -80(%ebp) -... - movl $32, -84(%ebp) -... -This seems like a cross between remat and spill folding. 
- -This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't -change, so we could simply subtract %eax from %ecx first and then use %ecx (or -vice-versa). - -//===---------------------------------------------------------------------===// - -This code: - - %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1] - br i1 %tmp659, label %cond_true662, label %cond_next715 - -produces this: - - testw %cx, %cx - movswl %cx, %esi - jns LBB4_109 # cond_next715 - -Shark tells us that using %cx in the testw instruction is sub-optimal. It -suggests using the 32-bit register (which is what ICC uses). - -//===---------------------------------------------------------------------===// - -We compile this: - -void compare (long long foo) { - if (foo < 4294967297LL) - abort(); -} - -to: - -compare: - subl $4, %esp - cmpl $0, 8(%esp) - setne %al - movzbw %al, %ax - cmpl $1, 12(%esp) - setg %cl - movzbw %cl, %cx - cmove %ax, %cx - testb $1, %cl - jne .LBB1_2 # UnifiedReturnBlock -.LBB1_1: # ifthen - call abort -.LBB1_2: # UnifiedReturnBlock - addl $4, %esp - ret - -(also really horrible code on ppc). This is due to the expand code for 64-bit -compares. GCC produces multiple branches, which is much nicer: - -compare: - subl $12, %esp - movl 20(%esp), %edx - movl 16(%esp), %eax - decl %edx - jle .L7 -.L5: - addl $12, %esp - ret - .p2align 4,,7 -.L7: - jl .L4 - cmpl $0, %eax - .p2align 4,,8 - ja .L5 -.L4: - .p2align 4,,9 - call abort - -//===---------------------------------------------------------------------===// - -Tail call optimization improvements: Tail call optimization currently -pushes all arguments on the top of the stack (their normal place for -non-tail call optimized calls) that source from the callers arguments -or that source from a virtual register (also possibly sourcing from -callers arguments). -This is done to prevent overwriting of parameters (see example -below) that might be used later. - -example: - -int callee(int32, int64); -int caller(int32 arg1, int32 arg2) { - int64 local = arg2 * 2; - return callee(arg2, (int64)local); -} - -[arg1] [!arg2 no longer valid since we moved local onto it] -[arg2] -> [(int64) -[RETADDR] local ] - -Moving arg1 onto the stack slot of callee function would overwrite -arg2 of the caller. - -Possible optimizations: - - - - Analyse the actual parameters of the callee to see which would - overwrite a caller parameter which is used by the callee and only - push them onto the top of the stack. - - int callee (int32 arg1, int32 arg2); - int caller (int32 arg1, int32 arg2) { - return callee(arg1,arg2); - } - - Here we don't need to write any variables to the top of the stack - since they don't overwrite each other. - - int callee (int32 arg1, int32 arg2); - int caller (int32 arg1, int32 arg2) { - return callee(arg2,arg1); - } - - Here we need to push the arguments because they overwrite each - other. 
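-
-   A rough sketch of the proposed overlap check, in C (the names and the
-   slot-numbering model are made up for illustration, not how the backend
-   represents this): assume outgoing argument i is stored into incoming
-   slot i, in increasing order, and that src_slot[j] records which incoming
-   slot outgoing argument j is copied from (-1 if it comes from somewhere
-   else). An outgoing store only needs to be staged on the top of the stack
-   if a later outgoing argument still reads the slot it clobbers:
-
-   int needs_push(const int *src_slot, int nargs, int i) {
-     /* Storing outgoing arg i clobbers incoming slot i; stage it only if
-        some argument written after it still sources from that slot. */
-     for (int j = i + 1; j < nargs; j++)
-       if (src_slot[j] == i)
-         return 1;
-     return 0;
-   }
-
-   For callee(arg1, arg2) the source slots are {0, 1} and nothing is
-   staged; for callee(arg2, arg1) they are {1, 0} and slot 0 must be
-   staged, matching the two examples above.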
- -//===---------------------------------------------------------------------===// - -main () -{ - int i = 0; - unsigned long int z = 0; - - do { - z -= 0x00004000; - i++; - if (i > 0x00040000) - abort (); - } while (z > 0); - exit (0); -} - -gcc compiles this to: - -_main: - subl $28, %esp - xorl %eax, %eax - jmp L2 -L3: - cmpl $262144, %eax - je L10 -L2: - addl $1, %eax - cmpl $262145, %eax - jne L3 - call L_abort$stub -L10: - movl $0, (%esp) - call L_exit$stub - -llvm: - -_main: - subl $12, %esp - movl $1, %eax - movl $16384, %ecx -LBB1_1: # bb - cmpl $262145, %eax - jge LBB1_4 # cond_true -LBB1_2: # cond_next - incl %eax - addl $4294950912, %ecx - cmpl $16384, %ecx - jne LBB1_1 # bb -LBB1_3: # bb11 - xorl %eax, %eax - addl $12, %esp - ret -LBB1_4: # cond_true - call L_abort$stub - -1. LSR should rewrite the first cmp with induction variable %ecx. -2. DAG combiner should fold - leal 1(%eax), %edx - cmpl $262145, %edx - => - cmpl $262144, %eax - -//===---------------------------------------------------------------------===// - -define i64 @test(double %X) { - %Y = fptosi double %X to i64 - ret i64 %Y -} - -compiles to: - -_test: - subl $20, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, 8(%esp) - fldl 8(%esp) - fisttpll (%esp) - movl 4(%esp), %edx - movl (%esp), %eax - addl $20, %esp - #FP_REG_KILL - ret - -This should just fldl directly from the input stack slot. - -//===---------------------------------------------------------------------===// - -This code: -int foo (int x) { return (x & 65535) | 255; } - -Should compile into: - -_foo: - movzwl 4(%esp), %eax - orl $255, %eax - ret - -instead of: -_foo: - movl $65280, %eax - andl 4(%esp), %eax - orl $255, %eax - ret - -//===---------------------------------------------------------------------===// - -We're codegen'ing multiply of long longs inefficiently: - -unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) { - return arg1 * arg2; -} - -We compile to (fomit-frame-pointer): - -_LLM: - pushl %esi - movl 8(%esp), %ecx - movl 16(%esp), %esi - movl %esi, %eax - mull %ecx - imull 12(%esp), %esi - addl %edx, %esi - imull 20(%esp), %ecx - movl %esi, %edx - addl %ecx, %edx - popl %esi - ret - -This looks like a scheduling deficiency and lack of remat of the load from -the argument area. ICC apparently produces: - - movl 8(%esp), %ecx - imull 12(%esp), %ecx - movl 16(%esp), %eax - imull 4(%esp), %eax - addl %eax, %ecx - movl 4(%esp), %eax - mull 12(%esp) - addl %ecx, %edx - ret - -Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR: -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236 - -//===---------------------------------------------------------------------===// - -We can fold a store into "zeroing a reg". Instead of: - -xorl %eax, %eax -movl %eax, 124(%esp) - -we should get: - -movl $0, 124(%esp) - -if the flags of the xor are dead. - -Likewise, we isel "x<<1" into "add reg,reg". 
If reg is spilled, this should -be folded into: shl [mem], 1 - -//===---------------------------------------------------------------------===// - -In SSE mode, we turn abs and neg into a load from the constant pool plus a xor -or and instruction, for example: - - xorpd LCPI1_0, %xmm2 - -However, if xmm2 gets spilled, we end up with really ugly code like this: - - movsd (%esp), %xmm0 - xorpd LCPI1_0, %xmm0 - movsd %xmm0, (%esp) - -Since we 'know' that this is a 'neg', we can actually "fold" the spill into -the neg/abs instruction, turning it into an *integer* operation, like this: - - xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31) - -you could also use xorb, but xorl is less likely to lead to a partial register -stall. Here is a contrived testcase: - -double a, b, c; -void test(double *P) { - double X = *P; - a = X; - bar(); - X = -X; - b = X; - bar(); - c = X; -} - -//===---------------------------------------------------------------------===// - -The generated code on x86 for checking for signed overflow on a multiply the -obvious way is much longer than it needs to be. - -int x(int a, int b) { - long long prod = (long long)a*b; - return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1); -} - -See PR2053 for more details. - -//===---------------------------------------------------------------------===// - -We should investigate using cdq/ctld (effect: edx = sar eax, 31) -more aggressively; it should cost the same as a move+shift on any modern -processor, but it's a lot shorter. Downside is that it puts more -pressure on register allocation because it has fixed operands. - -Example: -int abs(int x) {return x < 0 ? -x : x;} - -gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.: -abs: - movl 4(%esp), %eax - cltd - xorl %edx, %eax - subl %edx, %eax - ret - -//===---------------------------------------------------------------------===// - -Take the following code (from -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541): - -extern unsigned char first_one[65536]; -int FirstOnet(unsigned long long arg1) -{ - if (arg1 >> 48) - return (first_one[arg1 >> 48]); - return 0; -} - - -The following code is currently generated: -FirstOnet: - movl 8(%esp), %eax - cmpl $65536, %eax - movl 4(%esp), %ecx - jb .LBB1_2 # UnifiedReturnBlock -.LBB1_1: # ifthen - shrl $16, %eax - movzbl first_one(%eax), %eax - ret -.LBB1_2: # UnifiedReturnBlock - xorl %eax, %eax - ret - -We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this -lets us change the cmpl into a testl, which is shorter, and eliminate the shift. - -//===---------------------------------------------------------------------===// - -We compile this function: - -define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind { -entry: - %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1] - br i1 %tmp2, label %bb7, label %bb - -bb: ; preds = %entry - %tmp6 = add i32 %b, %a ; <i32> [#uses=1] - ret i32 %tmp6 - -bb7: ; preds = %entry - %tmp10 = sub i32 %a, %c ; <i32> [#uses=1] - ret i32 %tmp10 -} - -to: - -foo: # @foo -# %bb.0: # %entry - movl 4(%esp), %ecx - cmpb $0, 16(%esp) - je .LBB0_2 -# %bb.1: # %bb - movl 8(%esp), %eax - addl %ecx, %eax - ret -.LBB0_2: # %bb7 - movl 12(%esp), %edx - movl %ecx, %eax - subl %edx, %eax - ret - -There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a -couple more movls by putting 4(%esp) into %eax instead of %ecx. - -//===---------------------------------------------------------------------===// - -See rdar://4653682. 
- -From flops: - -LBB1_15: # bb310 - cvtss2sd LCPI1_0, %xmm1 - addsd %xmm1, %xmm0 - movsd 176(%esp), %xmm2 - mulsd %xmm0, %xmm2 - movapd %xmm2, %xmm3 - mulsd %xmm3, %xmm3 - movapd %xmm3, %xmm4 - mulsd LCPI1_23, %xmm4 - addsd LCPI1_24, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_25, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_26, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_27, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_28, %xmm4 - mulsd %xmm3, %xmm4 - addsd %xmm1, %xmm4 - mulsd %xmm2, %xmm4 - movsd 152(%esp), %xmm1 - addsd %xmm4, %xmm1 - movsd %xmm1, 152(%esp) - incl %eax - cmpl %eax, %esi - jge LBB1_15 # bb310 -LBB1_16: # bb358.loopexit - movsd 152(%esp), %xmm0 - addsd %xmm0, %xmm0 - addsd LCPI1_22, %xmm0 - movsd %xmm0, 152(%esp) - -Rather than spilling the result of the last addsd in the loop, we should have -insert a copy to split the interval (one for the duration of the loop, one -extending to the fall through). The register pressure in the loop isn't high -enough to warrant the spill. - -Also check why xmm7 is not used at all in the function. - -//===---------------------------------------------------------------------===// - -Take the following: - -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128" -target triple = "i386-apple-darwin8" -@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2] -define fastcc void @abort_gzip() noreturn nounwind { -entry: - %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1] - br i1 %tmp.b.i, label %bb.i, label %bb4.i -bb.i: ; preds = %entry - tail call void @exit( i32 1 ) noreturn nounwind - unreachable -bb4.i: ; preds = %entry - store i1 true, i1* @in_exit.4870.b - tail call void @exit( i32 1 ) noreturn nounwind - unreachable -} -declare void @exit(i32) noreturn nounwind - -This compiles into: -_abort_gzip: ## @abort_gzip -## %bb.0: ## %entry - subl $12, %esp - movb _in_exit.4870.b, %al - cmpb $1, %al - jne LBB0_2 - -We somehow miss folding the movb into the cmpb. - -//===---------------------------------------------------------------------===// - -We compile: - -int test(int x, int y) { - return x-y-1; -} - -into (-m64): - -_test: - decl %edi - movl %edi, %eax - subl %esi, %eax - ret - -it would be better to codegen as: x+~y (notl+addl) - -//===---------------------------------------------------------------------===// - -This code: - -int foo(const char *str,...) 
-{ - __builtin_va_list a; int x; - __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a); - return x; -} - -gets compiled into this on x86-64: - subq $200, %rsp - movaps %xmm7, 160(%rsp) - movaps %xmm6, 144(%rsp) - movaps %xmm5, 128(%rsp) - movaps %xmm4, 112(%rsp) - movaps %xmm3, 96(%rsp) - movaps %xmm2, 80(%rsp) - movaps %xmm1, 64(%rsp) - movaps %xmm0, 48(%rsp) - movq %r9, 40(%rsp) - movq %r8, 32(%rsp) - movq %rcx, 24(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 8(%rsp) - leaq (%rsp), %rax - movq %rax, 192(%rsp) - leaq 208(%rsp), %rax - movq %rax, 184(%rsp) - movl $48, 180(%rsp) - movl $8, 176(%rsp) - movl 176(%rsp), %eax - cmpl $47, %eax - jbe .LBB1_3 # bb -.LBB1_1: # bb3 - movq 184(%rsp), %rcx - leaq 8(%rcx), %rax - movq %rax, 184(%rsp) -.LBB1_2: # bb4 - movl (%rcx), %eax - addq $200, %rsp - ret -.LBB1_3: # bb - movl %eax, %ecx - addl $8, %eax - addq 192(%rsp), %rcx - movl %eax, 176(%rsp) - jmp .LBB1_2 # bb4 - -gcc 4.3 generates: - subq $96, %rsp -.LCFI0: - leaq 104(%rsp), %rax - movq %rsi, -80(%rsp) - movl $8, -120(%rsp) - movq %rax, -112(%rsp) - leaq -88(%rsp), %rax - movq %rax, -104(%rsp) - movl $8, %eax - cmpl $48, %eax - jb .L6 - movq -112(%rsp), %rdx - movl (%rdx), %eax - addq $96, %rsp - ret - .p2align 4,,10 - .p2align 3 -.L6: - mov %eax, %edx - addq -104(%rsp), %rdx - addl $8, %eax - movl %eax, -120(%rsp) - movl (%rdx), %eax - addq $96, %rsp - ret - -and it gets compiled into this on x86: - pushl %ebp - movl %esp, %ebp - subl $4, %esp - leal 12(%ebp), %eax - movl %eax, -4(%ebp) - leal 16(%ebp), %eax - movl %eax, -4(%ebp) - movl 12(%ebp), %eax - addl $4, %esp - popl %ebp - ret - -gcc 4.3 generates: - pushl %ebp - movl %esp, %ebp - movl 12(%ebp), %eax - popl %ebp - ret - -//===---------------------------------------------------------------------===// - -Teach tblgen not to check bitconvert source type in some cases. This allows us -to consolidate the following patterns in X86InstrMMX.td: - -def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))), - (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>; -def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))), - (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>; -def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))), - (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>; - -There are other cases in various td files. - -//===---------------------------------------------------------------------===// - -Take something like the following on x86-32: -unsigned a(unsigned long long x, unsigned y) {return x % y;} - -We currently generate a libcall, but we really shouldn't: the expansion is -shorter and likely faster than the libcall. The expected code is something -like the following: - - movl 12(%ebp), %eax - movl 16(%ebp), %ecx - xorl %edx, %edx - divl %ecx - movl 8(%ebp), %eax - divl %ecx - movl %edx, %eax - ret - -A similar code sequence works for division. 
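-
-As a sanity check on why the inline expansion is legal, here is the same
-two-step scheme written out in C (the helper name is made up; this is the
-arithmetic the divl sequence above performs, not code we would emit):
-
-unsigned urem_64_32(unsigned long long x, unsigned y) {
-  unsigned hi = (unsigned)(x >> 32);
-  unsigned lo = (unsigned)x;
-  unsigned r = hi % y;          /* first divl: 0:hi divided by y */
-  /* The second divl divides r:lo by y. It cannot fault on overflow because
-     r < y, so the 64/32 quotient always fits in 32 bits. */
-  return (unsigned)((((unsigned long long)r << 32) | lo) % y);
-}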
- -//===---------------------------------------------------------------------===// - -We currently compile this: - -define i32 @func1(i32 %v1, i32 %v2) nounwind { -entry: - %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) - %sum = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - br i1 %obit, label %overflow, label %normal -normal: - ret i32 %sum -overflow: - call void @llvm.trap() - unreachable -} -declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) -declare void @llvm.trap() - -to: - -_func1: - movl 4(%esp), %eax - addl 8(%esp), %eax - jo LBB1_2 ## overflow -LBB1_1: ## normal - ret -LBB1_2: ## overflow - ud2 - -it would be nice to produce "into" someday. - -//===---------------------------------------------------------------------===// - -Test instructions can be eliminated by using EFLAGS values from arithmetic -instructions. This is currently not done for mul, and, or, xor, neg, shl, -sra, srl, shld, shrd, atomic ops, and others. It is also currently not done -for read-modify-write instructions. It is also current not done if the -OF or CF flags are needed. - -The shift operators have the complication that when the shift count is -zero, EFLAGS is not set, so they can only subsume a test instruction if -the shift count is known to be non-zero. Also, using the EFLAGS value -from a shift is apparently very slow on some x86 implementations. - -In read-modify-write instructions, the root node in the isel match is -the store, and isel has no way for the use of the EFLAGS result of the -arithmetic to be remapped to the new node. - -Add and subtract instructions set OF on signed overflow and CF on unsiged -overflow, while test instructions always clear OF and CF. In order to -replace a test with an add or subtract in a situation where OF or CF is -needed, codegen must be able to prove that the operation cannot see -signed or unsigned overflow, respectively. - -//===---------------------------------------------------------------------===// - -memcpy/memmove do not lower to SSE copies when possible. A silly example is: -define <16 x float> @foo(<16 x float> %A) nounwind { - %tmp = alloca <16 x float>, align 16 - %tmp2 = alloca <16 x float>, align 16 - store <16 x float> %A, <16 x float>* %tmp - %s = bitcast <16 x float>* %tmp to i8* - %s2 = bitcast <16 x float>* %tmp2 to i8* - call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16) - %R = load <16 x float>* %tmp2 - ret <16 x float> %R -} - -declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind - -which compiles to: - -_foo: - subl $140, %esp - movaps %xmm3, 112(%esp) - movaps %xmm2, 96(%esp) - movaps %xmm1, 80(%esp) - movaps %xmm0, 64(%esp) - movl 60(%esp), %eax - movl %eax, 124(%esp) - movl 56(%esp), %eax - movl %eax, 120(%esp) - movl 52(%esp), %eax - <many many more 32-bit copies> - movaps (%esp), %xmm0 - movaps 16(%esp), %xmm1 - movaps 32(%esp), %xmm2 - movaps 48(%esp), %xmm3 - addl $140, %esp - ret - -On Nehalem, it may even be cheaper to just use movups when unaligned than to -fall back to lower-granularity chunks. - -//===---------------------------------------------------------------------===// - -Implement processor-specific optimizations for parity with GCC on these -processors. GCC does two optimizations: - -1. ix86_pad_returns inserts a noop before ret instructions if immediately - preceded by a conditional branch or is the target of a jump. -2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of - code contains more than 3 branches. 
- -The first one is done for all AMDs, Core2, and "Generic" -The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona, - Core 2, and "Generic" - -//===---------------------------------------------------------------------===// -Testcase: -int x(int a) { return (a&0xf0)>>4; } - -Current output: - movl 4(%esp), %eax - shrl $4, %eax - andl $15, %eax - ret - -Ideal output: - movzbl 4(%esp), %eax - shrl $4, %eax - ret - -//===---------------------------------------------------------------------===// - -Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch -properly. - -When the return value is not used (i.e. only care about the value in the -memory), x86 does not have to use add to implement these. Instead, it can use -add, sub, inc, dec instructions with the "lock" prefix. - -This is currently implemented using a bit of instruction selection trick. The -issue is the target independent pattern produces one output and a chain and we -want to map it into one that just output a chain. The current trick is to select -it into a MERGE_VALUES with the first definition being an implicit_def. The -proper solution is to add new ISD opcodes for the no-output variant. DAG -combiner can then transform the node before it gets to target node selection. - -Problem #2 is we are adding a whole bunch of x86 atomic instructions when in -fact these instructions are identical to the non-lock versions. We need a way to -add target specific information to target nodes and have this information -carried over to machine instructions. Asm printer (or JIT) can use this -information to add the "lock" prefix. - -//===---------------------------------------------------------------------===// - -struct B { - unsigned char y0 : 1; -}; - -int bar(struct B* a) { return a->y0; } - -define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize { - %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0 - %2 = load i8* %1, align 1 - %3 = and i8 %2, 1 - %4 = zext i8 %3 to i32 - ret i32 %4 -} - -bar: # @bar -# %bb.0: - movb (%rdi), %al - andb $1, %al - movzbl %al, %eax - ret - -Missed optimization: should be movl+andl. - -//===---------------------------------------------------------------------===// - -The x86_64 abi says: - -Booleans, when stored in a memory object, are stored as single byte objects the -value of which is always 0 (false) or 1 (true). - -We are not using this fact: - -int bar(_Bool *a) { return *a; } - -define i32 @bar(i8* nocapture %a) nounwind readonly optsize { - %1 = load i8* %a, align 1, !tbaa !0 - %tmp = and i8 %1, 1 - %2 = zext i8 %tmp to i32 - ret i32 %2 -} - -bar: - movb (%rdi), %al - andb $1, %al - movzbl %al, %eax - ret - -GCC produces - -bar: - movzbl (%rdi), %eax - ret - -//===---------------------------------------------------------------------===// - -Take the following C code: -int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } - -We generate the following IR with clang: -define i32 @f(i32 %a, i32 %b) nounwind readnone { -entry: - %tmp = xor i32 %b, %a ; <i32> [#uses=1] - %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1] - %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1] - %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] - ret i32 %conv5 -} - -And the following x86 code: - xorl %esi, %edi - testb $-1, %dil - sete %al - movzbl %al, %eax - ret - -A cmpb instead of the xorl+testb would be one instruction shorter. 
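-
-The fold wanted here is "icmp eq (and (xor a, b), 255), 0" -> compare of the
-low bytes. A throwaway C check of that identity, just to document that the
-rewrite is unconditionally safe (nothing backend-specific about it):
-
-#include <assert.h>
-int main(void) {
-  for (unsigned a = 0; a < 256; ++a)
-    for (unsigned b = 0; b < 256; ++b)
-      assert((((a ^ b) & 0xffu) == 0) ==
-             ((unsigned char)a == (unsigned char)b));
-  return 0;
-}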
- -//===---------------------------------------------------------------------===// - -Given the following C code: -int f(int a, int b) { return (signed char)a == (signed char)b; } - -We generate the following IR with clang: -define i32 @f(i32 %a, i32 %b) nounwind readnone { -entry: - %sext = shl i32 %a, 24 ; <i32> [#uses=1] - %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1] - %sext6 = shl i32 %b, 24 ; <i32> [#uses=1] - %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1] - %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1] - %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] - ret i32 %conv5 -} - -And the following x86 code: - movsbl %sil, %eax - movsbl %dil, %ecx - cmpl %eax, %ecx - sete %al - movzbl %al, %eax - ret - - -It should be possible to eliminate the sign extensions. - -//===---------------------------------------------------------------------===// - -LLVM misses a load+store narrowing opportunity in this code: - -%struct.bf = type { i64, i16, i16, i32 } - -@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2] - -define void @t1() nounwind ssp { -entry: - %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] - %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1] - %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2] - %3 = load i32* %2, align 1 ; <i32> [#uses=1] - %4 = and i32 %3, -65537 ; <i32> [#uses=1] - store i32 %4, i32* %2, align 1 - %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] - %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1] - %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2] - %8 = load i32* %7, align 1 ; <i32> [#uses=1] - %9 = and i32 %8, -131073 ; <i32> [#uses=1] - store i32 %9, i32* %7, align 1 - ret void -} - -LLVM currently emits this: - - movq bfi(%rip), %rax - andl $-65537, 8(%rax) - movq bfi(%rip), %rax - andl $-131073, 8(%rax) - ret - -It could narrow the loads and stores to emit this: - - movq bfi(%rip), %rax - andb $-2, 10(%rax) - movq bfi(%rip), %rax - andb $-3, 10(%rax) - ret - -The trouble is that there is a TokenFactor between the store and the -load, making it non-trivial to determine if there's anything between -the load and the store which would prohibit narrowing. - -//===---------------------------------------------------------------------===// - -This code: -void foo(unsigned x) { - if (x == 0) bar(); - else if (x == 1) qux(); -} - -currently compiles into: -_foo: - movl 4(%esp), %eax - cmpl $1, %eax - je LBB0_3 - testl %eax, %eax - jne LBB0_4 - -the testl could be removed: -_foo: - movl 4(%esp), %eax - cmpl $1, %eax - je LBB0_3 - jb LBB0_4 - -0 is the only unsigned number < 1. 
- -//===---------------------------------------------------------------------===// - -This code: - -%0 = type { i32, i1 } - -define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp { -entry: - %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x) - %cmp = extractvalue %0 %uadd, 1 - %inc = zext i1 %cmp to i32 - %add = add i32 %x, %sum - %z.0 = add i32 %add, %inc - ret i32 %z.0 -} - -declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone - -compiles to: - -_add32carry: ## @add32carry - addl %esi, %edi - sbbl %ecx, %ecx - movl %edi, %eax - subl %ecx, %eax - ret - -But it could be: - -_add32carry: - leal (%rsi,%rdi), %eax - cmpl %esi, %eax - adcl $0, %eax - ret - -//===---------------------------------------------------------------------===// - -The hot loop of 256.bzip2 contains code that looks a bit like this: - -int foo(char *P, char *Q, int x, int y) { - if (P[0] != Q[0]) - return P[0] < Q[0]; - if (P[1] != Q[1]) - return P[1] < Q[1]; - if (P[2] != Q[2]) - return P[2] < Q[2]; - return P[3] < Q[3]; -} - -In the real code, we get a lot more wrong than this. However, even in this -code we generate: - -_foo: ## @foo -## %bb.0: ## %entry - movb (%rsi), %al - movb (%rdi), %cl - cmpb %al, %cl - je LBB0_2 -LBB0_1: ## %if.then - cmpb %al, %cl - jmp LBB0_5 -LBB0_2: ## %if.end - movb 1(%rsi), %al - movb 1(%rdi), %cl - cmpb %al, %cl - jne LBB0_1 -## %bb.3: ## %if.end38 - movb 2(%rsi), %al - movb 2(%rdi), %cl - cmpb %al, %cl - jne LBB0_1 -## %bb.4: ## %if.end60 - movb 3(%rdi), %al - cmpb 3(%rsi), %al -LBB0_5: ## %if.end60 - setl %al - movzbl %al, %eax - ret - -Note that we generate jumps to LBB0_1 which does a redundant compare. The -redundant compare also forces the register values to be live, which prevents -folding one of the loads into the compare. In contrast, GCC 4.2 produces: - -_foo: - movzbl (%rsi), %eax - cmpb %al, (%rdi) - jne L10 -L12: - movzbl 1(%rsi), %eax - cmpb %al, 1(%rdi) - jne L10 - movzbl 2(%rsi), %eax - cmpb %al, 2(%rdi) - jne L10 - movzbl 3(%rdi), %eax - cmpb 3(%rsi), %al -L10: - setl %al - movzbl %al, %eax - ret - -which is "perfect". - -//===---------------------------------------------------------------------===// - -For the branch in the following code: -int a(); -int b(int x, int y) { - if (x & (1<<(y&7))) - return a(); - return y; -} - -We currently generate: - movb %sil, %al - andb $7, %al - movzbl %al, %eax - btl %eax, %edi - jae .LBB0_2 - -movl+andl would be shorter than the movb+andb+movzbl sequence. - -//===---------------------------------------------------------------------===// - -For the following: -struct u1 { - float x, y; -}; -float foo(struct u1 u) { - return u.x + u.y; -} - -We currently generate: - movdqa %xmm0, %xmm1 - pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0] - addss %xmm1, %xmm0 - ret - -We could save an instruction here by commuting the addss. - -//===---------------------------------------------------------------------===// - -This (from PR9661): - -float clamp_float(float a) { - if (a > 1.0f) - return 1.0f; - else if (a < 0.0f) - return 0.0f; - else - return a; -} - -Could compile to: - -clamp_float: # @clamp_float - movss .LCPI0_0(%rip), %xmm1 - minss %xmm1, %xmm0 - pxor %xmm1, %xmm1 - maxss %xmm1, %xmm0 - ret - -with -ffast-math. 
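-
-The -ffast-math requirement is the interesting part: minss/maxss return their
-second operand when an input is NaN, while the branchy source above returns
-the NaN itself, so the min/max form is only valid when NaN behaviour may be
-relaxed. For comparison, the clamp written with fminf/fmaxf makes the
-intended min/max structure explicit at the source level (it too differs from
-the original for NaN inputs, which is exactly the corner -ffast-math waives):
-
-#include <math.h>
-float clamp_float2(float a) {
-  return fminf(fmaxf(a, 0.0f), 1.0f);  /* the maxss/minss shape we want */
-}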
- -//===---------------------------------------------------------------------===// - -This function (from PR9803): - -int clamp2(int a) { - if (a > 5) - a = 5; - if (a < 0) - return 0; - return a; -} - -Compiles to: - -_clamp2: ## @clamp2 - pushq %rbp - movq %rsp, %rbp - cmpl $5, %edi - movl $5, %ecx - cmovlel %edi, %ecx - testl %ecx, %ecx - movl $0, %eax - cmovnsl %ecx, %eax - popq %rbp - ret - -The move of 0 could be scheduled above the test to make it is xor reg,reg. - -//===---------------------------------------------------------------------===// - -GCC PR48986. We currently compile this: - -void bar(void); -void yyy(int* p) { - if (__sync_fetch_and_add(p, -1) == 1) - bar(); -} - -into: - movl $-1, %eax - lock - xaddl %eax, (%rdi) - cmpl $1, %eax - je LBB0_2 - -Instead we could generate: - - lock - dec %rdi - je LBB0_2 - -The trick is to match "fetch_and_add(X, -C) == C". - -//===---------------------------------------------------------------------===// - -unsigned t(unsigned a, unsigned b) { - return a <= b ? 5 : -5; -} - -We generate: - movl $5, %ecx - cmpl %esi, %edi - movl $-5, %eax - cmovbel %ecx, %eax - -GCC: - cmpl %edi, %esi - sbbl %eax, %eax - andl $-10, %eax - addl $5, %eax - -//===---------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// +// Random ideas for the X86 backend. +//===---------------------------------------------------------------------===// + +Improvements to the multiply -> shift/add algorithm: +http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html + +//===---------------------------------------------------------------------===// + +Improve code like this (occurs fairly frequently, e.g. in LLVM): +long long foo(int x) { return 1LL << x; } + +http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html +http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html +http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html + +Another useful one would be ~0ULL >> X and ~0ULL << X. + +One better solution for 1LL << x is: + xorl %eax, %eax + xorl %edx, %edx + testb $32, %cl + sete %al + setne %dl + sall %cl, %eax + sall %cl, %edx + +But that requires good 8-bit subreg support. + +Also, this might be better. It's an extra shift, but it's one instruction +shorter, and doesn't stress 8-bit subreg support. +(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html, +but without the unnecessary and.) + movl %ecx, %eax + shrl $5, %eax + movl %eax, %edx + xorl $1, %edx + sall %cl, %eax + sall %cl. %edx + +64-bit shifts (in general) expand to really bad code. Instead of using +cmovs, we should expand to a conditional branch like GCC produces. + +//===---------------------------------------------------------------------===// + +Some isel ideas: + +1. Dynamic programming based approach when compile time is not an + issue. +2. Code duplication (addressing mode) during isel. +3. Other ideas from "Register-Sensitive Selection, Duplication, and + Sequencing of Instructions". +4. Scheduling for reduced register pressure. E.g. "Minimum Register + Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" + and other related papers. + http://citeseer.ist.psu.edu/govindarajan01minimum.html + +//===---------------------------------------------------------------------===// + +Should we promote i16 to i32 to avoid partial register update stalls? 
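+
+For reference, the kind of code where the choice shows up; the C is identical
+either way, the question is purely which width the backend selects for the
+i16 operations:
+
+void scale16(unsigned short *p, int n) {
+  for (int i = 0; i < n; ++i)
+    p[i] = (unsigned short)(p[i] * 3);
+}
+
+Working directly on 16-bit registers (movw load, imulw) writes only the low
+half of a register and can stall on cores that track partial register
+updates; promoting to i32 (movzwl, imull, then a plain movw store) keeps
+every register write full width.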
+ +//===---------------------------------------------------------------------===// + +Leave any_extend as pseudo instruction and hint to register +allocator. Delay codegen until post register allocation. +Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach +the coalescer how to deal with it though. + +//===---------------------------------------------------------------------===// + +It appears icc use push for parameter passing. Need to investigate. + +//===---------------------------------------------------------------------===// + +The instruction selector sometimes misses folding a load into a compare. The +pattern is written as (cmp reg, (load p)). Because the compare isn't +commutative, it is not matched with the load on both sides. The dag combiner +should be made smart enough to canonicalize the load into the RHS of a compare +when it can invert the result of the compare for free. + +//===---------------------------------------------------------------------===// + +In many cases, LLVM generates code like this: + +_test: + movl 8(%esp), %eax + cmpl %eax, 4(%esp) + setl %al + movzbl %al, %eax + ret + +on some processors (which ones?), it is more efficient to do this: + +_test: + movl 8(%esp), %ebx + xor %eax, %eax + cmpl %ebx, 4(%esp) + setl %al + ret + +Doing this correctly is tricky though, as the xor clobbers the flags. + +//===---------------------------------------------------------------------===// + +We should generate bts/btr/etc instructions on targets where they are cheap or +when codesize is important. e.g., for: + +void setbit(int *target, int bit) { + *target |= (1 << bit); +} +void clearbit(int *target, int bit) { + *target &= ~(1 << bit); +} + +//===---------------------------------------------------------------------===// + +Instead of the following for memset char*, 1, 10: + + movl $16843009, 4(%edx) + movl $16843009, (%edx) + movw $257, 8(%edx) + +It might be better to generate + + movl $16843009, %eax + movl %eax, 4(%edx) + movl %eax, (%edx) + movw al, 8(%edx) + +when we can spare a register. It reduces code size. + +//===---------------------------------------------------------------------===// + +Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently +get this: + +define i32 @test1(i32 %X) { + %Y = sdiv i32 %X, 8 + ret i32 %Y +} + +_test1: + movl 4(%esp), %eax + movl %eax, %ecx + sarl $31, %ecx + shrl $29, %ecx + addl %ecx, %eax + sarl $3, %eax + ret + +GCC knows several different ways to codegen it, one of which is this: + +_test1: + movl 4(%esp), %eax + cmpl $-1, %eax + leal 7(%eax), %ecx + cmovle %ecx, %eax + sarl $3, %eax + ret + +which is probably slower, but it's interesting at least :) + +//===---------------------------------------------------------------------===// + +We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl +We should leave these as libcalls for everything over a much lower threshold, +since libc is hand tuned for medium and large mem ops (avoiding RFO for large +stores, TLB preheating, etc) + +//===---------------------------------------------------------------------===// + +Optimize this into something reasonable: + x * copysign(1.0, y) * copysign(1.0, z) + +//===---------------------------------------------------------------------===// + +Optimize copysign(x, *y) to use an integer load from y. 
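+
+That is, rather than loading *y as a float just to extract its sign, load it
+as an integer and merge the sign bit directly. In C the intended computation
+is roughly this (helper name made up; assumes 32-bit IEEE float):
+
+#include <stdint.h>
+#include <string.h>
+
+float copysign_via_int(float x, const float *y) {
+  uint32_t xi, yi;
+  memcpy(&xi, &x, sizeof xi);   /* bit pattern of x */
+  memcpy(&yi, y, sizeof yi);    /* integer load of *y; only its sign bit matters */
+  xi = (xi & 0x7fffffffu) | (yi & 0x80000000u);
+  memcpy(&x, &xi, sizeof x);
+  return x;
+}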
+ +//===---------------------------------------------------------------------===// + +The following tests perform worse with LSR: + +lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor. + +//===---------------------------------------------------------------------===// + +Adding to the list of cmp / test poor codegen issues: + +int test(__m128 *A, __m128 *B) { + if (_mm_comige_ss(*A, *B)) + return 3; + else + return 4; +} + +_test: + movl 8(%esp), %eax + movaps (%eax), %xmm0 + movl 4(%esp), %eax + movaps (%eax), %xmm1 + comiss %xmm0, %xmm1 + setae %al + movzbl %al, %ecx + movl $3, %eax + movl $4, %edx + cmpl $0, %ecx + cmove %edx, %eax + ret + +Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There +are a number of issues. 1) We are introducing a setcc between the result of the +intrisic call and select. 2) The intrinsic is expected to produce a i32 value +so a any extend (which becomes a zero extend) is added. + +We probably need some kind of target DAG combine hook to fix this. + +//===---------------------------------------------------------------------===// + +We generate significantly worse code for this than GCC: +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150 +http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701 + +There is also one case we do worse on PPC. + +//===---------------------------------------------------------------------===// + +For this: + +int test(int a) +{ + return a * 3; +} + +We currently emits + imull $3, 4(%esp), %eax + +Perhaps this is what we really should generate is? Is imull three or four +cycles? Note: ICC generates this: + movl 4(%esp), %eax + leal (%eax,%eax,2), %eax + +The current instruction priority is based on pattern complexity. The former is +more "complex" because it folds a load so the latter will not be emitted. + +Perhaps we should use AddedComplexity to give LEA32r a higher priority? We +should always try to match LEA first since the LEA matching code does some +estimate to determine whether the match is profitable. + +However, if we care more about code size, then imull is better. It's two bytes +shorter than movl + leal. + +On a Pentium M, both variants have the same characteristics with regard +to throughput; however, the multiplication has a latency of four cycles, as +opposed to two cycles for the movl+lea variant. + +//===---------------------------------------------------------------------===// + +It appears gcc place string data with linkonce linkage in +.section __TEXT,__const_coal,coalesced instead of +.section __DATA,__const_coal,coalesced. +Take a look at darwin.h, there are other Darwin assembler directives that we +do not make use of. 
+ +//===---------------------------------------------------------------------===// + +define i32 @foo(i32* %a, i32 %t) { +entry: + br label %cond_true + +cond_true: ; preds = %cond_true, %entry + %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3] + %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1] + %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1] + %tmp3 = load i32* %tmp2 ; <i32> [#uses=1] + %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1] + %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2] + %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2] + %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1] + br i1 %tmp, label %bb12, label %cond_true + +bb12: ; preds = %cond_true + ret i32 %tmp7 +} +is pessimized by -loop-reduce and -indvars + +//===---------------------------------------------------------------------===// + +u32 to float conversion improvement: + +float uint32_2_float( unsigned u ) { + float fl = (int) (u & 0xffff); + float fh = (int) (u >> 16); + fh *= 0x1.0p16f; + return fh + fl; +} + +00000000 subl $0x04,%esp +00000003 movl 0x08(%esp,1),%eax +00000007 movl %eax,%ecx +00000009 shrl $0x10,%ecx +0000000c cvtsi2ss %ecx,%xmm0 +00000010 andl $0x0000ffff,%eax +00000015 cvtsi2ss %eax,%xmm1 +00000019 mulss 0x00000078,%xmm0 +00000021 addss %xmm1,%xmm0 +00000025 movss %xmm0,(%esp,1) +0000002a flds (%esp,1) +0000002d addl $0x04,%esp +00000030 ret + +//===---------------------------------------------------------------------===// + +When using fastcc abi, align stack slot of argument of type double on 8 byte +boundary to improve performance. + +//===---------------------------------------------------------------------===// + +GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting +simplifications for integer "x cmp y ? a : b". + +//===---------------------------------------------------------------------===// + +Consider the expansion of: + +define i32 @test3(i32 %X) { + %tmp1 = urem i32 %X, 255 + ret i32 %tmp1 +} + +Currently it compiles to: + +... + movl $2155905153, %ecx + movl 8(%esp), %esi + movl %esi, %eax + mull %ecx +... + +This could be "reassociated" into: + + movl $2155905153, %eax + movl 8(%esp), %ecx + mull %ecx + +to avoid the copy. In fact, the existing two-address stuff would do this +except that mul isn't a commutative 2-addr instruction. I guess this has +to be done at isel time based on the #uses to mul? + +//===---------------------------------------------------------------------===// + +Make sure the instruction which starts a loop does not cross a cacheline +boundary. This requires knowning the exact length of each machine instruction. +That is somewhat complicated, but doable. Example 256.bzip2: + +In the new trace, the hot loop has an instruction which crosses a cacheline +boundary. In addition to potential cache misses, this can't help decoding as I +imagine there has to be some kind of complicated decoder reset and realignment +to grab the bytes from the next cacheline. + +532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines +942 942 0x3d03 movl %dh, (1809(%esp, %esi) +937 937 0x3d0a incl %esi +3 3 0x3d0b cmpb %bl, %dl +27 27 0x3d0d jnz 0x000062db <main+11707> + +//===---------------------------------------------------------------------===// + +In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE. + +//===---------------------------------------------------------------------===// + +This could be a single 16-bit load. 
+ +int f(char *p) { + if ((p[0] == 1) & (p[1] == 2)) return 1; + return 0; +} + +//===---------------------------------------------------------------------===// + +We should inline lrintf and probably other libc functions. + +//===---------------------------------------------------------------------===// + +This code: + +void test(int X) { + if (X) abort(); +} + +is currently compiled to: + +_test: + subl $12, %esp + cmpl $0, 16(%esp) + jne LBB1_1 + addl $12, %esp + ret +LBB1_1: + call L_abort$stub + +It would be better to produce: + +_test: + subl $12, %esp + cmpl $0, 16(%esp) + jne L_abort$stub + addl $12, %esp + ret + +This can be applied to any no-return function call that takes no arguments etc. +Alternatively, the stack save/restore logic could be shrink-wrapped, producing +something like this: + +_test: + cmpl $0, 4(%esp) + jne LBB1_1 + ret +LBB1_1: + subl $12, %esp + call L_abort$stub + +Both are useful in different situations. Finally, it could be shrink-wrapped +and tail called, like this: + +_test: + cmpl $0, 4(%esp) + jne LBB1_1 + ret +LBB1_1: + pop %eax # realign stack. + call L_abort$stub + +Though this probably isn't worth it. + +//===---------------------------------------------------------------------===// + +Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with +a neg instead of a sub instruction. Consider: + +int test(char X) { return 7-X; } + +we currently produce: +_test: + movl $7, %eax + movsbl 4(%esp), %ecx + subl %ecx, %eax + ret + +We would use one fewer register if codegen'd as: + + movsbl 4(%esp), %eax + neg %eax + add $7, %eax + ret + +Note that this isn't beneficial if the load can be folded into the sub. In +this case, we want a sub: + +int test(int X) { return 7-X; } +_test: + movl $7, %eax + subl 4(%esp), %eax + ret + +//===---------------------------------------------------------------------===// + +Leaf functions that require one 4-byte spill slot have a prolog like this: + +_foo: + pushl %esi + subl $4, %esp +... +and an epilog like this: + addl $4, %esp + popl %esi + ret + +It would be smaller, and potentially faster, to push eax on entry and to +pop into a dummy register instead of using addl/subl of esp. Just don't pop +into any return registers :) + +//===---------------------------------------------------------------------===// + +The X86 backend should fold (branch (or (setcc, setcc))) into multiple +branches. We generate really poor code for: + +double testf(double a) { + return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0); +} + +For example, the entry BB is: + +_testf: + subl $20, %esp + pxor %xmm0, %xmm0 + movsd 24(%esp), %xmm1 + ucomisd %xmm0, %xmm1 + setnp %al + sete %cl + testb %cl, %al + jne LBB1_5 # UnifiedReturnBlock +LBB1_1: # cond_true + + +it would be better to replace the last four instructions with: + + jp LBB1_1 + je LBB1_5 +LBB1_1: + +We also codegen the inner ?: into a diamond: + + cvtss2sd LCPI1_0(%rip), %xmm2 + cvtss2sd LCPI1_1(%rip), %xmm3 + ucomisd %xmm1, %xmm0 + ja LBB1_3 # cond_true +LBB1_2: # cond_true + movapd %xmm3, %xmm2 +LBB1_3: # cond_true + movapd %xmm2, %xmm0 + ret + +We should sink the load into xmm3 into the LBB1_2 block. This should +be pretty easy, and will nuke all the copies. 
+ +//===---------------------------------------------------------------------===// + +This: + #include <algorithm> + inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b) + { return std::make_pair(a + b, a + b < a); } + bool no_overflow(unsigned a, unsigned b) + { return !full_add(a, b).second; } + +Should compile to: + addl %esi, %edi + setae %al + movzbl %al, %eax + ret + +on x86-64, instead of the rather stupid-looking: + addl %esi, %edi + setb %al + xorb $1, %al + movzbl %al, %eax + ret + + +//===---------------------------------------------------------------------===// + +The following code: + +bb114.preheader: ; preds = %cond_next94 + %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1] + %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1] + %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1] + %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1] + %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1] + %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2] + %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1] + %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1] + %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1] + %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1] + %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1] + br label %bb114 + +produces: + +LBB3_5: # bb114.preheader + movswl -68(%ebp), %eax + movl $32, %ecx + movl %ecx, -80(%ebp) + subl %eax, -80(%ebp) + movswl -52(%ebp), %eax + movl %ecx, -84(%ebp) + subl %eax, -84(%ebp) + movswl -70(%ebp), %eax + movl %ecx, -88(%ebp) + subl %eax, -88(%ebp) + movswl -50(%ebp), %eax + subl %eax, %ecx + movl %ecx, -76(%ebp) + movswl -42(%ebp), %eax + movl %eax, -92(%ebp) + movswl -66(%ebp), %eax + movl %eax, -96(%ebp) + movw $0, -98(%ebp) + +This appears to be bad because the RA is not folding the store to the stack +slot into the movl. The above instructions could be: + movl $32, -80(%ebp) +... + movl $32, -84(%ebp) +... +This seems like a cross between remat and spill folding. + +This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't +change, so we could simply subtract %eax from %ecx first and then use %ecx (or +vice-versa). + +//===---------------------------------------------------------------------===// + +This code: + + %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1] + br i1 %tmp659, label %cond_true662, label %cond_next715 + +produces this: + + testw %cx, %cx + movswl %cx, %esi + jns LBB4_109 # cond_next715 + +Shark tells us that using %cx in the testw instruction is sub-optimal. It +suggests using the 32-bit register (which is what ICC uses). + +//===---------------------------------------------------------------------===// + +We compile this: + +void compare (long long foo) { + if (foo < 4294967297LL) + abort(); +} + +to: + +compare: + subl $4, %esp + cmpl $0, 8(%esp) + setne %al + movzbw %al, %ax + cmpl $1, 12(%esp) + setg %cl + movzbw %cl, %cx + cmove %ax, %cx + testb $1, %cl + jne .LBB1_2 # UnifiedReturnBlock +.LBB1_1: # ifthen + call abort +.LBB1_2: # UnifiedReturnBlock + addl $4, %esp + ret + +(also really horrible code on ppc). This is due to the expand code for 64-bit +compares. 
GCC produces multiple branches, which is much nicer: + +compare: + subl $12, %esp + movl 20(%esp), %edx + movl 16(%esp), %eax + decl %edx + jle .L7 +.L5: + addl $12, %esp + ret + .p2align 4,,7 +.L7: + jl .L4 + cmpl $0, %eax + .p2align 4,,8 + ja .L5 +.L4: + .p2align 4,,9 + call abort + +//===---------------------------------------------------------------------===// + +Tail call optimization improvements: Tail call optimization currently +pushes all arguments on the top of the stack (their normal place for +non-tail call optimized calls) that source from the callers arguments +or that source from a virtual register (also possibly sourcing from +callers arguments). +This is done to prevent overwriting of parameters (see example +below) that might be used later. + +example: + +int callee(int32, int64); +int caller(int32 arg1, int32 arg2) { + int64 local = arg2 * 2; + return callee(arg2, (int64)local); +} + +[arg1] [!arg2 no longer valid since we moved local onto it] +[arg2] -> [(int64) +[RETADDR] local ] + +Moving arg1 onto the stack slot of callee function would overwrite +arg2 of the caller. + +Possible optimizations: + + + - Analyse the actual parameters of the callee to see which would + overwrite a caller parameter which is used by the callee and only + push them onto the top of the stack. + + int callee (int32 arg1, int32 arg2); + int caller (int32 arg1, int32 arg2) { + return callee(arg1,arg2); + } + + Here we don't need to write any variables to the top of the stack + since they don't overwrite each other. + + int callee (int32 arg1, int32 arg2); + int caller (int32 arg1, int32 arg2) { + return callee(arg2,arg1); + } + + Here we need to push the arguments because they overwrite each + other. + +//===---------------------------------------------------------------------===// + +main () +{ + int i = 0; + unsigned long int z = 0; + + do { + z -= 0x00004000; + i++; + if (i > 0x00040000) + abort (); + } while (z > 0); + exit (0); +} + +gcc compiles this to: + +_main: + subl $28, %esp + xorl %eax, %eax + jmp L2 +L3: + cmpl $262144, %eax + je L10 +L2: + addl $1, %eax + cmpl $262145, %eax + jne L3 + call L_abort$stub +L10: + movl $0, (%esp) + call L_exit$stub + +llvm: + +_main: + subl $12, %esp + movl $1, %eax + movl $16384, %ecx +LBB1_1: # bb + cmpl $262145, %eax + jge LBB1_4 # cond_true +LBB1_2: # cond_next + incl %eax + addl $4294950912, %ecx + cmpl $16384, %ecx + jne LBB1_1 # bb +LBB1_3: # bb11 + xorl %eax, %eax + addl $12, %esp + ret +LBB1_4: # cond_true + call L_abort$stub + +1. LSR should rewrite the first cmp with induction variable %ecx. +2. DAG combiner should fold + leal 1(%eax), %edx + cmpl $262145, %edx + => + cmpl $262144, %eax + +//===---------------------------------------------------------------------===// + +define i64 @test(double %X) { + %Y = fptosi double %X to i64 + ret i64 %Y +} + +compiles to: + +_test: + subl $20, %esp + movsd 24(%esp), %xmm0 + movsd %xmm0, 8(%esp) + fldl 8(%esp) + fisttpll (%esp) + movl 4(%esp), %edx + movl (%esp), %eax + addl $20, %esp + #FP_REG_KILL + ret + +This should just fldl directly from the input stack slot. 
+ +//===---------------------------------------------------------------------===// + +This code: +int foo (int x) { return (x & 65535) | 255; } + +Should compile into: + +_foo: + movzwl 4(%esp), %eax + orl $255, %eax + ret + +instead of: +_foo: + movl $65280, %eax + andl 4(%esp), %eax + orl $255, %eax + ret + +//===---------------------------------------------------------------------===// + +We're codegen'ing multiply of long longs inefficiently: + +unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) { + return arg1 * arg2; +} + +We compile to (fomit-frame-pointer): + +_LLM: + pushl %esi + movl 8(%esp), %ecx + movl 16(%esp), %esi + movl %esi, %eax + mull %ecx + imull 12(%esp), %esi + addl %edx, %esi + imull 20(%esp), %ecx + movl %esi, %edx + addl %ecx, %edx + popl %esi + ret + +This looks like a scheduling deficiency and lack of remat of the load from +the argument area. ICC apparently produces: + + movl 8(%esp), %ecx + imull 12(%esp), %ecx + movl 16(%esp), %eax + imull 4(%esp), %eax + addl %eax, %ecx + movl 4(%esp), %eax + mull 12(%esp) + addl %ecx, %edx + ret + +Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR: +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236 + +//===---------------------------------------------------------------------===// + +We can fold a store into "zeroing a reg". Instead of: + +xorl %eax, %eax +movl %eax, 124(%esp) + +we should get: + +movl $0, 124(%esp) + +if the flags of the xor are dead. + +Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should +be folded into: shl [mem], 1 + +//===---------------------------------------------------------------------===// + +In SSE mode, we turn abs and neg into a load from the constant pool plus a xor +or and instruction, for example: + + xorpd LCPI1_0, %xmm2 + +However, if xmm2 gets spilled, we end up with really ugly code like this: + + movsd (%esp), %xmm0 + xorpd LCPI1_0, %xmm0 + movsd %xmm0, (%esp) + +Since we 'know' that this is a 'neg', we can actually "fold" the spill into +the neg/abs instruction, turning it into an *integer* operation, like this: + + xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31) + +you could also use xorb, but xorl is less likely to lead to a partial register +stall. Here is a contrived testcase: + +double a, b, c; +void test(double *P) { + double X = *P; + a = X; + bar(); + X = -X; + b = X; + bar(); + c = X; +} + +//===---------------------------------------------------------------------===// + +The generated code on x86 for checking for signed overflow on a multiply the +obvious way is much longer than it needs to be. + +int x(int a, int b) { + long long prod = (long long)a*b; + return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1); +} + +See PR2053 for more details. + +//===---------------------------------------------------------------------===// + +We should investigate using cdq/ctld (effect: edx = sar eax, 31) +more aggressively; it should cost the same as a move+shift on any modern +processor, but it's a lot shorter. Downside is that it puts more +pressure on register allocation because it has fixed operands. + +Example: +int abs(int x) {return x < 0 ? 
-x : x;} + +gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.: +abs: + movl 4(%esp), %eax + cltd + xorl %edx, %eax + subl %edx, %eax + ret + +//===---------------------------------------------------------------------===// + +Take the following code (from +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541): + +extern unsigned char first_one[65536]; +int FirstOnet(unsigned long long arg1) +{ + if (arg1 >> 48) + return (first_one[arg1 >> 48]); + return 0; +} + + +The following code is currently generated: +FirstOnet: + movl 8(%esp), %eax + cmpl $65536, %eax + movl 4(%esp), %ecx + jb .LBB1_2 # UnifiedReturnBlock +.LBB1_1: # ifthen + shrl $16, %eax + movzbl first_one(%eax), %eax + ret +.LBB1_2: # UnifiedReturnBlock + xorl %eax, %eax + ret + +We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this +lets us change the cmpl into a testl, which is shorter, and eliminate the shift. + +//===---------------------------------------------------------------------===// + +We compile this function: + +define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind { +entry: + %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1] + br i1 %tmp2, label %bb7, label %bb + +bb: ; preds = %entry + %tmp6 = add i32 %b, %a ; <i32> [#uses=1] + ret i32 %tmp6 + +bb7: ; preds = %entry + %tmp10 = sub i32 %a, %c ; <i32> [#uses=1] + ret i32 %tmp10 +} + +to: + +foo: # @foo +# %bb.0: # %entry + movl 4(%esp), %ecx + cmpb $0, 16(%esp) + je .LBB0_2 +# %bb.1: # %bb + movl 8(%esp), %eax + addl %ecx, %eax + ret +.LBB0_2: # %bb7 + movl 12(%esp), %edx + movl %ecx, %eax + subl %edx, %eax + ret + +There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a +couple more movls by putting 4(%esp) into %eax instead of %ecx. + +//===---------------------------------------------------------------------===// + +See rdar://4653682. + +From flops: + +LBB1_15: # bb310 + cvtss2sd LCPI1_0, %xmm1 + addsd %xmm1, %xmm0 + movsd 176(%esp), %xmm2 + mulsd %xmm0, %xmm2 + movapd %xmm2, %xmm3 + mulsd %xmm3, %xmm3 + movapd %xmm3, %xmm4 + mulsd LCPI1_23, %xmm4 + addsd LCPI1_24, %xmm4 + mulsd %xmm3, %xmm4 + addsd LCPI1_25, %xmm4 + mulsd %xmm3, %xmm4 + addsd LCPI1_26, %xmm4 + mulsd %xmm3, %xmm4 + addsd LCPI1_27, %xmm4 + mulsd %xmm3, %xmm4 + addsd LCPI1_28, %xmm4 + mulsd %xmm3, %xmm4 + addsd %xmm1, %xmm4 + mulsd %xmm2, %xmm4 + movsd 152(%esp), %xmm1 + addsd %xmm4, %xmm1 + movsd %xmm1, 152(%esp) + incl %eax + cmpl %eax, %esi + jge LBB1_15 # bb310 +LBB1_16: # bb358.loopexit + movsd 152(%esp), %xmm0 + addsd %xmm0, %xmm0 + addsd LCPI1_22, %xmm0 + movsd %xmm0, 152(%esp) + +Rather than spilling the result of the last addsd in the loop, we should have +insert a copy to split the interval (one for the duration of the loop, one +extending to the fall through). The register pressure in the loop isn't high +enough to warrant the spill. + +Also check why xmm7 is not used at all in the function. 
+ +//===---------------------------------------------------------------------===// + +Take the following: + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128" +target triple = "i386-apple-darwin8" +@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2] +define fastcc void @abort_gzip() noreturn nounwind { +entry: + %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1] + br i1 %tmp.b.i, label %bb.i, label %bb4.i +bb.i: ; preds = %entry + tail call void @exit( i32 1 ) noreturn nounwind + unreachable +bb4.i: ; preds = %entry + store i1 true, i1* @in_exit.4870.b + tail call void @exit( i32 1 ) noreturn nounwind + unreachable +} +declare void @exit(i32) noreturn nounwind + +This compiles into: +_abort_gzip: ## @abort_gzip +## %bb.0: ## %entry + subl $12, %esp + movb _in_exit.4870.b, %al + cmpb $1, %al + jne LBB0_2 + +We somehow miss folding the movb into the cmpb. + +//===---------------------------------------------------------------------===// + +We compile: + +int test(int x, int y) { + return x-y-1; +} + +into (-m64): + +_test: + decl %edi + movl %edi, %eax + subl %esi, %eax + ret + +it would be better to codegen as: x+~y (notl+addl) + +//===---------------------------------------------------------------------===// + +This code: + +int foo(const char *str,...) +{ + __builtin_va_list a; int x; + __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a); + return x; +} + +gets compiled into this on x86-64: + subq $200, %rsp + movaps %xmm7, 160(%rsp) + movaps %xmm6, 144(%rsp) + movaps %xmm5, 128(%rsp) + movaps %xmm4, 112(%rsp) + movaps %xmm3, 96(%rsp) + movaps %xmm2, 80(%rsp) + movaps %xmm1, 64(%rsp) + movaps %xmm0, 48(%rsp) + movq %r9, 40(%rsp) + movq %r8, 32(%rsp) + movq %rcx, 24(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 8(%rsp) + leaq (%rsp), %rax + movq %rax, 192(%rsp) + leaq 208(%rsp), %rax + movq %rax, 184(%rsp) + movl $48, 180(%rsp) + movl $8, 176(%rsp) + movl 176(%rsp), %eax + cmpl $47, %eax + jbe .LBB1_3 # bb +.LBB1_1: # bb3 + movq 184(%rsp), %rcx + leaq 8(%rcx), %rax + movq %rax, 184(%rsp) +.LBB1_2: # bb4 + movl (%rcx), %eax + addq $200, %rsp + ret +.LBB1_3: # bb + movl %eax, %ecx + addl $8, %eax + addq 192(%rsp), %rcx + movl %eax, 176(%rsp) + jmp .LBB1_2 # bb4 + +gcc 4.3 generates: + subq $96, %rsp +.LCFI0: + leaq 104(%rsp), %rax + movq %rsi, -80(%rsp) + movl $8, -120(%rsp) + movq %rax, -112(%rsp) + leaq -88(%rsp), %rax + movq %rax, -104(%rsp) + movl $8, %eax + cmpl $48, %eax + jb .L6 + movq -112(%rsp), %rdx + movl (%rdx), %eax + addq $96, %rsp + ret + .p2align 4,,10 + .p2align 3 +.L6: + mov %eax, %edx + addq -104(%rsp), %rdx + addl $8, %eax + movl %eax, -120(%rsp) + movl (%rdx), %eax + addq $96, %rsp + ret + +and it gets compiled into this on x86: + pushl %ebp + movl %esp, %ebp + subl $4, %esp + leal 12(%ebp), %eax + movl %eax, -4(%ebp) + leal 16(%ebp), %eax + movl %eax, -4(%ebp) + movl 12(%ebp), %eax + addl $4, %esp + popl %ebp + ret + +gcc 4.3 generates: + pushl %ebp + movl %esp, %ebp + movl 12(%ebp), %eax + popl %ebp + ret + +//===---------------------------------------------------------------------===// + +Teach tblgen not to check bitconvert source type in some cases. 
This allows us
+to consolidate the following patterns in X86InstrMMX.td:
+
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                 (iPTR 0))))),
+          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+There are other cases in various td files.
+
+//===---------------------------------------------------------------------===//
+
+Take something like the following on x86-32:
+unsigned a(unsigned long long x, unsigned y) {return x % y;}
+
+We currently generate a libcall, but we really shouldn't: the expansion is
+shorter and likely faster than the libcall.  The expected code is something
+like the following:
+
+        movl    12(%ebp), %eax
+        movl    16(%ebp), %ecx
+        xorl    %edx, %edx
+        divl    %ecx
+        movl    8(%ebp), %eax
+        divl    %ecx
+        movl    %edx, %eax
+        ret
+
+A similar code sequence works for division.
+
+//===---------------------------------------------------------------------===//
+
+We currently compile this:
+
+define i32 @func1(i32 %v1, i32 %v2) nounwind {
+entry:
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %sum = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %normal
+normal:
+  ret i32 %sum
+overflow:
+  call void @llvm.trap()
+  unreachable
+}
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare void @llvm.trap()
+
+to:
+
+_func1:
+        movl 4(%esp), %eax
+        addl 8(%esp), %eax
+        jo LBB1_2       ## overflow
+LBB1_1: ## normal
+        ret
+LBB1_2: ## overflow
+        ud2
+
+it would be nice to produce "into" someday.
+
+//===---------------------------------------------------------------------===//
+
+Test instructions can be eliminated by using EFLAGS values from arithmetic
+instructions. This is currently not done for mul, and, or, xor, neg, shl,
+sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
+for read-modify-write instructions. It is also currently not done if the
+OF or CF flags are needed.
+
+The shift operators have the complication that when the shift count is
+zero, EFLAGS is not set, so they can only subsume a test instruction if
+the shift count is known to be non-zero. Also, using the EFLAGS value
+from a shift is apparently very slow on some x86 implementations.
+
+In read-modify-write instructions, the root node in the isel match is
+the store, and isel has no way for the use of the EFLAGS result of the
+arithmetic to be remapped to the new node.
+
+Add and subtract instructions set OF on signed overflow and CF on unsigned
+overflow, while test instructions always clear OF and CF. In order to
+replace a test with an add or subtract in a situation where OF or CF is
+needed, codegen must be able to prove that the operation cannot see
+signed or unsigned overflow, respectively.
+
+//===---------------------------------------------------------------------===//
+
+memcpy/memmove do not lower to SSE copies when possible.
A silly example is:
+define <16 x float> @foo(<16 x float> %A) nounwind {
+        %tmp = alloca <16 x float>, align 16
+        %tmp2 = alloca <16 x float>, align 16
+        store <16 x float> %A, <16 x float>* %tmp
+        %s = bitcast <16 x float>* %tmp to i8*
+        %s2 = bitcast <16 x float>* %tmp2 to i8*
+        call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
+        %R = load <16 x float>* %tmp2
+        ret <16 x float> %R
+}
+
+declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+which compiles to:
+
+_foo:
+        subl    $140, %esp
+        movaps  %xmm3, 112(%esp)
+        movaps  %xmm2, 96(%esp)
+        movaps  %xmm1, 80(%esp)
+        movaps  %xmm0, 64(%esp)
+        movl    60(%esp), %eax
+        movl    %eax, 124(%esp)
+        movl    56(%esp), %eax
+        movl    %eax, 120(%esp)
+        movl    52(%esp), %eax
+        <many many more 32-bit copies>
+        movaps  (%esp), %xmm0
+        movaps  16(%esp), %xmm1
+        movaps  32(%esp), %xmm2
+        movaps  48(%esp), %xmm3
+        addl    $140, %esp
+        ret
+
+On Nehalem, it may even be cheaper to just use movups when unaligned than to
+fall back to lower-granularity chunks.
+
+//===---------------------------------------------------------------------===//
+
+Implement processor-specific optimizations for parity with GCC on these
+processors.  GCC does two optimizations:
+
+1. ix86_pad_returns inserts a noop before ret instructions if the ret is
+   immediately preceded by a conditional branch or is the target of a jump.
+2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
+   code contains more than 3 branches.
+
+The first one is done for all AMDs, Core2, and "Generic".
+The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
+  Core 2, and "Generic".
+
+//===---------------------------------------------------------------------===//
+Testcase:
+int x(int a) { return (a&0xf0)>>4; }
+
+Current output:
+        movl    4(%esp), %eax
+        shrl    $4, %eax
+        andl    $15, %eax
+        ret
+
+Ideal output:
+        movzbl  4(%esp), %eax
+        shrl    $4, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
+properly.
+
+When the return value is not used (i.e., we only care about the value in
+memory), x86 does not have to use xadd to implement these. Instead, it can use
+add, sub, inc, dec instructions with the "lock" prefix.
+
+This is currently implemented using a bit of an instruction selection trick.
+The issue is that the target-independent pattern produces one output and a
+chain, and we want to map it into one that just outputs a chain. The current
+trick is to select it into a MERGE_VALUES with the first definition being an
+implicit_def. The proper solution is to add new ISD opcodes for the no-output
+variant. The DAG combiner can then transform the node before it gets to target
+node selection.
+
+Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
+in fact these instructions are identical to the non-lock versions. We need a
+way to add target-specific information to target nodes and have this
+information carried over to machine instructions. The asm printer (or JIT) can
+use this information to add the "lock" prefix.
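+
+A hedged source-level sketch of the two cases (the function names here are
+made up for illustration): when the result of the builtin is ignored, the
+whole read-modify-write can lower to a single lock add with no result
+register; only the second function actually needs the lock xadd sequence that
+the current selection trick produces.
+
+void counter_bump(int *p) {
+  __sync_add_and_fetch(p, 1);          /* result unused: a plain lock addl will do */
+}
+
+int counter_bump_and_read(int *p) {
+  return __sync_add_and_fetch(p, 1);   /* result used: needs lock xaddl + addl */
+}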
+ +//===---------------------------------------------------------------------===// + +struct B { + unsigned char y0 : 1; +}; + +int bar(struct B* a) { return a->y0; } + +define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize { + %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0 + %2 = load i8* %1, align 1 + %3 = and i8 %2, 1 + %4 = zext i8 %3 to i32 + ret i32 %4 +} + +bar: # @bar +# %bb.0: + movb (%rdi), %al + andb $1, %al + movzbl %al, %eax + ret + +Missed optimization: should be movl+andl. + +//===---------------------------------------------------------------------===// + +The x86_64 abi says: + +Booleans, when stored in a memory object, are stored as single byte objects the +value of which is always 0 (false) or 1 (true). + +We are not using this fact: + +int bar(_Bool *a) { return *a; } + +define i32 @bar(i8* nocapture %a) nounwind readonly optsize { + %1 = load i8* %a, align 1, !tbaa !0 + %tmp = and i8 %1, 1 + %2 = zext i8 %tmp to i32 + ret i32 %2 +} + +bar: + movb (%rdi), %al + andb $1, %al + movzbl %al, %eax + ret + +GCC produces + +bar: + movzbl (%rdi), %eax + ret + +//===---------------------------------------------------------------------===// + +Take the following C code: +int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } + +We generate the following IR with clang: +define i32 @f(i32 %a, i32 %b) nounwind readnone { +entry: + %tmp = xor i32 %b, %a ; <i32> [#uses=1] + %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1] + %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1] + %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] + ret i32 %conv5 +} + +And the following x86 code: + xorl %esi, %edi + testb $-1, %dil + sete %al + movzbl %al, %eax + ret + +A cmpb instead of the xorl+testb would be one instruction shorter. + +//===---------------------------------------------------------------------===// + +Given the following C code: +int f(int a, int b) { return (signed char)a == (signed char)b; } + +We generate the following IR with clang: +define i32 @f(i32 %a, i32 %b) nounwind readnone { +entry: + %sext = shl i32 %a, 24 ; <i32> [#uses=1] + %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1] + %sext6 = shl i32 %b, 24 ; <i32> [#uses=1] + %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1] + %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1] + %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] + ret i32 %conv5 +} + +And the following x86 code: + movsbl %sil, %eax + movsbl %dil, %ecx + cmpl %eax, %ecx + sete %al + movzbl %al, %eax + ret + + +It should be possible to eliminate the sign extensions. 
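+
+For equality the extensions are irrelevant: two chars compare equal exactly
+when their low 8 bits are equal, whether they are sign- or zero-extended. A
+hedged sketch of the equivalent form (the same xor+mask reduction clang
+already emits for the unsigned-char version above):
+
+int f_equiv(int a, int b) {
+  /* same truth value as (signed char)a == (signed char)b */
+  return ((a ^ b) & 0xff) == 0;
+}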
+ +//===---------------------------------------------------------------------===// + +LLVM misses a load+store narrowing opportunity in this code: + +%struct.bf = type { i64, i16, i16, i32 } + +@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2] + +define void @t1() nounwind ssp { +entry: + %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] + %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1] + %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2] + %3 = load i32* %2, align 1 ; <i32> [#uses=1] + %4 = and i32 %3, -65537 ; <i32> [#uses=1] + store i32 %4, i32* %2, align 1 + %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] + %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1] + %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2] + %8 = load i32* %7, align 1 ; <i32> [#uses=1] + %9 = and i32 %8, -131073 ; <i32> [#uses=1] + store i32 %9, i32* %7, align 1 + ret void +} + +LLVM currently emits this: + + movq bfi(%rip), %rax + andl $-65537, 8(%rax) + movq bfi(%rip), %rax + andl $-131073, 8(%rax) + ret + +It could narrow the loads and stores to emit this: + + movq bfi(%rip), %rax + andb $-2, 10(%rax) + movq bfi(%rip), %rax + andb $-3, 10(%rax) + ret + +The trouble is that there is a TokenFactor between the store and the +load, making it non-trivial to determine if there's anything between +the load and the store which would prohibit narrowing. + +//===---------------------------------------------------------------------===// + +This code: +void foo(unsigned x) { + if (x == 0) bar(); + else if (x == 1) qux(); +} + +currently compiles into: +_foo: + movl 4(%esp), %eax + cmpl $1, %eax + je LBB0_3 + testl %eax, %eax + jne LBB0_4 + +the testl could be removed: +_foo: + movl 4(%esp), %eax + cmpl $1, %eax + je LBB0_3 + jb LBB0_4 + +0 is the only unsigned number < 1. + +//===---------------------------------------------------------------------===// + +This code: + +%0 = type { i32, i1 } + +define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp { +entry: + %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x) + %cmp = extractvalue %0 %uadd, 1 + %inc = zext i1 %cmp to i32 + %add = add i32 %x, %sum + %z.0 = add i32 %add, %inc + ret i32 %z.0 +} + +declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone + +compiles to: + +_add32carry: ## @add32carry + addl %esi, %edi + sbbl %ecx, %ecx + movl %edi, %eax + subl %ecx, %eax + ret + +But it could be: + +_add32carry: + leal (%rsi,%rdi), %eax + cmpl %esi, %eax + adcl $0, %eax + ret + +//===---------------------------------------------------------------------===// + +The hot loop of 256.bzip2 contains code that looks a bit like this: + +int foo(char *P, char *Q, int x, int y) { + if (P[0] != Q[0]) + return P[0] < Q[0]; + if (P[1] != Q[1]) + return P[1] < Q[1]; + if (P[2] != Q[2]) + return P[2] < Q[2]; + return P[3] < Q[3]; +} + +In the real code, we get a lot more wrong than this. However, even in this +code we generate: + +_foo: ## @foo +## %bb.0: ## %entry + movb (%rsi), %al + movb (%rdi), %cl + cmpb %al, %cl + je LBB0_2 +LBB0_1: ## %if.then + cmpb %al, %cl + jmp LBB0_5 +LBB0_2: ## %if.end + movb 1(%rsi), %al + movb 1(%rdi), %cl + cmpb %al, %cl + jne LBB0_1 +## %bb.3: ## %if.end38 + movb 2(%rsi), %al + movb 2(%rdi), %cl + cmpb %al, %cl + jne LBB0_1 +## %bb.4: ## %if.end60 + movb 3(%rdi), %al + cmpb 3(%rsi), %al +LBB0_5: ## %if.end60 + setl %al + movzbl %al, %eax + ret + +Note that we generate jumps to LBB0_1 which does a redundant compare. 
The
+redundant compare also forces the register values to be live, which prevents
+folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
+
+_foo:
+        movzbl  (%rsi), %eax
+        cmpb    %al, (%rdi)
+        jne     L10
+L12:
+        movzbl  1(%rsi), %eax
+        cmpb    %al, 1(%rdi)
+        jne     L10
+        movzbl  2(%rsi), %eax
+        cmpb    %al, 2(%rdi)
+        jne     L10
+        movzbl  3(%rdi), %eax
+        cmpb    3(%rsi), %al
+L10:
+        setl    %al
+        movzbl  %al, %eax
+        ret
+
+which is "perfect".
+
+//===---------------------------------------------------------------------===//
+
+For the branch in the following code:
+int a();
+int b(int x, int y) {
+  if (x & (1<<(y&7)))
+    return a();
+  return y;
+}
+
+We currently generate:
+        movb    %sil, %al
+        andb    $7, %al
+        movzbl  %al, %eax
+        btl     %eax, %edi
+        jae     .LBB0_2
+
+movl+andl would be shorter than the movb+andb+movzbl sequence.
+
+//===---------------------------------------------------------------------===//
+
+For the following:
+struct u1 {
+  float x, y;
+};
+float foo(struct u1 u) {
+  return u.x + u.y;
+}
+
+We currently generate:
+        movdqa  %xmm0, %xmm1
+        pshufd  $1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
+        addss   %xmm1, %xmm0
+        ret
+
+We could save an instruction here by commuting the addss.
+
+//===---------------------------------------------------------------------===//
+
+This (from PR9661):
+
+float clamp_float(float a) {
+        if (a > 1.0f)
+                return 1.0f;
+        else if (a < 0.0f)
+                return 0.0f;
+        else
+                return a;
+}
+
+Could compile to:
+
+clamp_float:                            # @clamp_float
+        movss   .LCPI0_0(%rip), %xmm1
+        minss   %xmm1, %xmm0
+        pxor    %xmm1, %xmm1
+        maxss   %xmm1, %xmm0
+        ret
+
+with -ffast-math.
+
+//===---------------------------------------------------------------------===//
+
+This function (from PR9803):
+
+int clamp2(int a) {
+        if (a > 5)
+                a = 5;
+        if (a < 0)
+                return 0;
+        return a;
+}
+
+Compiles to:
+
+_clamp2:                                ## @clamp2
+        pushq   %rbp
+        movq    %rsp, %rbp
+        cmpl    $5, %edi
+        movl    $5, %ecx
+        cmovlel %edi, %ecx
+        testl   %ecx, %ecx
+        movl    $0, %eax
+        cmovnsl %ecx, %eax
+        popq    %rbp
+        ret
+
+The move of 0 could be scheduled above the test to make it an xor reg,reg.
+
+//===---------------------------------------------------------------------===//
+
+GCC PR48986.  We currently compile this:
+
+void bar(void);
+void yyy(int* p) {
+    if (__sync_fetch_and_add(p, -1) == 1)
+      bar();
+}
+
+into:
+        movl    $-1, %eax
+        lock
+        xaddl   %eax, (%rdi)
+        cmpl    $1, %eax
+        je      LBB0_2
+
+Instead we could generate:
+
+        lock
+        dec %rdi
+        je LBB0_2
+
+The trick is to match "fetch_and_add(X, -C) == C".
+
+//===---------------------------------------------------------------------===//
+
+unsigned t(unsigned a, unsigned b) {
+  return a <= b ? 5 : -5;
+}
+
+We generate:
+        movl    $5, %ecx
+        cmpl    %esi, %edi
+        movl    $-5, %eax
+        cmovbel %ecx, %eax
+
+GCC:
+        cmpl    %edi, %esi
+        sbbl    %eax, %eax
+        andl    $-10, %eax
+        addl    $5, %eax
+
+//===---------------------------------------------------------------------===//
diff --git a/contrib/libs/llvm12/lib/Target/X86/TargetInfo/.yandex_meta/licenses.list.txt b/contrib/libs/llvm12/lib/Target/X86/TargetInfo/.yandex_meta/licenses.list.txt
index c62d353021..a4433625d4 100644
--- a/contrib/libs/llvm12/lib/Target/X86/TargetInfo/.yandex_meta/licenses.list.txt
+++ b/contrib/libs/llvm12/lib/Target/X86/TargetInfo/.yandex_meta/licenses.list.txt
@@ -1,7 +1,7 @@
-====================Apache-2.0 WITH LLVM-exception====================
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
- - -====================Apache-2.0 WITH LLVM-exception==================== -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +====================Apache-2.0 WITH LLVM-exception==================== +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. + + +====================Apache-2.0 WITH LLVM-exception==================== +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception diff --git a/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make b/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make index 2f30db941e..9048b1b373 100644 --- a/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make +++ b/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make @@ -2,15 +2,15 @@ LIBRARY() -OWNER( - orivej - g:cpp-contrib -) - -LICENSE(Apache-2.0 WITH LLVM-exception) - -LICENSE_TEXTS(.yandex_meta/licenses.list.txt) - +OWNER( + orivej + g:cpp-contrib +) + +LICENSE(Apache-2.0 WITH LLVM-exception) + +LICENSE_TEXTS(.yandex_meta/licenses.list.txt) + PEERDIR( contrib/libs/llvm12 contrib/libs/llvm12/lib/Support diff --git a/contrib/libs/llvm12/lib/Target/X86/ya.make b/contrib/libs/llvm12/lib/Target/X86/ya.make index 1df03a55e7..eda842b955 100644 --- a/contrib/libs/llvm12/lib/Target/X86/ya.make +++ b/contrib/libs/llvm12/lib/Target/X86/ya.make @@ -2,18 +2,18 @@ LIBRARY() -OWNER( - orivej - g:cpp-contrib -) - -LICENSE( - Apache-2.0 WITH LLVM-exception AND - NCSA -) - -LICENSE_TEXTS(.yandex_meta/licenses.list.txt) +OWNER( + orivej + g:cpp-contrib +) +LICENSE( + Apache-2.0 WITH LLVM-exception AND + NCSA +) + +LICENSE_TEXTS(.yandex_meta/licenses.list.txt) + PEERDIR( contrib/libs/llvm12 contrib/libs/llvm12/include |