author | somov <somov@yandex-team.ru> | 2022-02-10 16:45:47 +0300
---|---|---
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:47 +0300
commit | a5950576e397b1909261050b8c7da16db58f10b1 (patch) |
tree | 7ba7677f6a4c3e19e2cefab34d16df2c8963b4d4 /contrib/libs/highwayhash |
parent | 81eddc8c0b55990194e112b02d127b87d54164a9 (diff) |
download | ydb-a5950576e397b1909261050b8c7da16db58f10b1.tar.gz |
Restoring authorship annotation for <somov@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/highwayhash')
64 files changed, 10383 insertions, 10383 deletions
diff --git a/contrib/libs/highwayhash/IMPORT b/contrib/libs/highwayhash/IMPORT
index c1b4084ec0..7b70d9300b 100644
--- a/contrib/libs/highwayhash/IMPORT
+++ b/contrib/libs/highwayhash/IMPORT
@@ -1 +1 @@
-Imported from https://github.com/google/highwayhash commit 2b666ae078292b01024453d01480f3b362a2a012 (master branch, 2017-05-08)
+Imported from https://github.com/google/highwayhash commit 2b666ae078292b01024453d01480f3b362a2a012 (master branch, 2017-05-08)
diff --git a/contrib/libs/highwayhash/LICENSE b/contrib/libs/highwayhash/LICENSE
index 6b0b1270ff..4d581db0a5 100644
--- a/contrib/libs/highwayhash/LICENSE
+++ b/contrib/libs/highwayhash/LICENSE
@@ -1,203 +1,203 @@

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.
   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
diff --git a/contrib/libs/highwayhash/README.md b/contrib/libs/highwayhash/README.md
index 2d311c66c1..4d120be2a2 100644
--- a/contrib/libs/highwayhash/README.md
+++ b/contrib/libs/highwayhash/README.md
@@ -1,350 +1,350 @@
Strong (well-distributed and unpredictable) hashes:

* Portable implementation of
  [SipHash](https://www.131002.net/siphash/siphash.pdf)
* HighwayHash, a 5x faster SIMD hash with [security
  claims](https://arxiv.org/abs/1612.06257)

## Quick Start

To build on a Linux or Mac platform, simply run `make`. For Windows, we provide
a Visual Studio 2015 project in the `msvc` subdirectory.

Run `benchmark` for speed measurements. `sip_hash_test` and `highwayhash_test`
ensure the implementations return known-good values for a given set of inputs.
64-bit SipHash for any CPU:

    #include "highwayhash/sip_hash.h"
    using namespace highwayhash;
    const HH_U64 key2[2] HH_ALIGNAS(16) = {1234, 5678};
    char in[8] = {1};
    return SipHash(key2, in, 8);

64, 128 or 256 bit HighwayHash for the CPU determined by compiler flags:

    #include "highwayhash/highwayhash.h"
    using namespace highwayhash;
    const HHKey key HH_ALIGNAS(32) = {1, 2, 3, 4};
    char in[8] = {1};
    HHResult64 result;  // or HHResult128 or HHResult256
    HHStateT<HH_TARGET> state(key);
    HighwayHashT(&state, in, 8, &result);

64, 128 or 256 bit HighwayHash for the CPU on which we're currently running:

    #include "highwayhash/highwayhash_target.h"
    #include "highwayhash/instruction_sets.h"
    using namespace highwayhash;
    const HHKey key HH_ALIGNAS(32) = {1, 2, 3, 4};
    char in[8] = {1};
    HHResult64 result;  // or HHResult128 or HHResult256
    InstructionSets::Run<HighwayHash>(key, in, 8, &result);

C-callable 64-bit HighwayHash for the CPU on which we're currently running:

    #include "highwayhash/c_bindings.h"
    const uint64_t key[4] = {1, 2, 3, 4};
    char in[8] = {1};
    return HighwayHash64(key, in, 8);

## Introduction

Hash functions are widely used, so it is desirable to increase their speed and
security. This package provides two 'strong' (well-distributed and
unpredictable) hash functions: a faster version of SipHash, and an even faster
algorithm we call HighwayHash.

SipHash is a fast but 'cryptographically strong' pseudo-random function by
Aumasson and Bernstein [https://www.131002.net/siphash/siphash.pdf].

HighwayHash is a new way of mixing inputs which may inspire new
cryptographically strong hashes. Large inputs are processed at a rate of 0.24
cycles per byte, and latency remains low even for small inputs. HighwayHash is
faster than SipHash for all input sizes, with 5 times higher throughput at 1
KiB. We discuss design choices and provide statistical analysis and preliminary
cryptanalysis in https://arxiv.org/abs/1612.06257.

## Applications

Unlike prior strong hashes, these functions are fast enough to be recommended
as safer replacements for weak hashes in many applications. The additional CPU
cost appears affordable, based on profiling data indicating C++ hash functions
account for less than 0.25% of CPU usage.

Hash-based selection of random subsets is useful for A/B experiments and
similar applications. Such random generators are idempotent (repeatable and
deterministic), which is helpful for parallel algorithms and testing. To avoid
bias, it is important that the hash function be unpredictable and
indistinguishable from a uniform random generator. We have verified the bit
distribution and avalanche properties of SipHash and HighwayHash.

64-bit hashes are also useful for authenticating short-lived messages such as
network/RPC packets. This requires that the hash function withstand
differential, length extension and other attacks. We have published a formal
security analysis for HighwayHash. New cryptanalysis tools may still need to be
developed for further analysis.

Strong hashes are also important parts of methods for protecting hash tables
against unacceptable worst-case behavior and denial of service attacks
(see "hash flooding" below).

## SipHash

Our SipHash implementation is a fast and portable drop-in replacement for
the reference C code. Outputs are identical for the given test cases (messages
between 0 and 63 bytes).

Interestingly, it is about twice as fast as a SIMD implementation using SSE4.1
(https://goo.gl/80GBSD). This is presumably due to the lack of SIMD bit rotate
instructions.

SipHash13 is a faster but weaker variant with one mixing round per update and
three during finalization.

We also provide a data-parallel 'tree hash' variant that enables efficient SIMD
while retaining safety guarantees. This is about twice as fast as SipHash, but
does not return the same results.

## HighwayHash

We have devised a new way of mixing inputs with AVX2 multiply and permute
instructions. The multiplications are 32x32 -> 64 bits and therefore infeasible
to reverse. Permuting equalizes the distribution of the resulting bytes.

The internal state occupies four 256-bit AVX2 registers. Due to limitations of
the instruction set, the registers are partitioned into two 512-bit halves that
remain independent until the reduce phase. The algorithm outputs 64 bit digests
or up to 256 bits at no extra cost.

In addition to high throughput, the algorithm is designed for low finalization
cost. The result is more than twice as fast as SipTreeHash.

For older CPUs, we also provide an SSE4.1 version (80% as fast for large inputs
and 95% as fast for short inputs) and a portable version (10% as fast).

Statistical analyses and preliminary cryptanalysis are given in
https://arxiv.org/abs/1612.06257.

## Versioning and stability

SipHash and HighwayHash 1.0 are 'fingerprint functions' whose input -> hash
mapping will not change. This is important for applications that write hashes
to persistent storage.

HighwayHash has not yet reached 1.0 and may still change in the near future. We
will announce when it is frozen.

## Speed measurements

To measure the CPU cost of a hash function, we can either create an artificial
'microbenchmark' (easier to control, but probably not representative of the
actual runtime), or insert instrumentation directly into an application (risks
influencing the results through observer overhead). We provide novel variants
of both approaches that mitigate their respective disadvantages.

profiler.h uses software write-combining to stream program traces to memory
with minimal overhead. These can be analyzed offline, or when memory is full,
to learn how much time was spent in each (possibly nested) zone.

nanobenchmark.h enables cycle-accurate measurements of very short functions.
It uses CPU fences and robust statistics to minimize variability, and also
avoids unrealistic branch prediction effects.

We compile the C++ implementations with a patched GCC 4.9 and run on a single
core of a Xeon E5-2690 v3 clocked at 2.6 GHz. CPU cost is measured as cycles
per byte for various input sizes:

Algorithm        | 8     | 31   | 32   | 63   | 64   | 1024
---------------- | ----- | ---- | ---- | ---- | ---- | ----
HighwayHashAVX2  | 7.34  | 1.81 | 1.71 | 1.04 | 0.95 | 0.24
HighwayHashSSE41 | 8.00  | 2.11 | 1.75 | 1.13 | 0.96 | 0.30
SipTreeHash      | 16.51 | 4.57 | 4.09 | 2.22 | 2.29 | 0.57
SipTreeHash13    | 12.33 | 3.47 | 3.06 | 1.68 | 1.63 | 0.33
SipHash          | 8.13  | 2.58 | 2.73 | 1.87 | 1.93 | 1.26
SipHash13        | 6.96  | 2.09 | 2.12 | 1.32 | 1.33 | 0.68

SipTreeHash is slower than SipHash for small inputs because it processes blocks
of 32 bytes. AVX2 and SSE4.1 HighwayHash are faster than SipHash for all input
sizes due to their highly optimized handling of partial vectors.

Note that previous measurements included the initialization of their input,
which dramatically increased timings especially for small inputs.

## CPU requirements

SipTreeHash[13] requires an AVX2-capable CPU (e.g. Haswell). HighwayHash
includes a dispatcher that chooses the best available (AVX2, SSE4.1 or
portable) implementation at runtime, as well as a directly callable function
template that can only run on the CPU for which it was built. SipHash[13] and
ScalarSipTreeHash[13] have no particular CPU requirements.

Our implementations use custom AVX2 vector classes with overloaded operators
(e.g. `const V4x64U a = b + c`) for type-safety and improved readability vs.
compiler intrinsics (e.g. `const __m256i a = _mm256_add_epi64(b, c)`).

We intend to port HighwayHash to other SIMD-capable platforms, especially ARM.

Our instruction_sets dispatcher avoids running newer instructions on older CPUs
that do not support them. However, intrinsics, and therefore also any vector
classes that use them, require a compiler flag that also enables the compiler
to generate code for that CPU. This means the intrinsics must be placed in
separate translation units that are compiled with the required flags. It is
important that these source files and their headers not define any inline
functions, because that might break the one definition rule and cause crashes.

To minimize dispatch overhead when hashes are computed often (e.g. in a loop),
we can inline the hash function into its caller using templates. The dispatch
overhead will only be paid once (e.g. before the loop). The template mechanism
also avoids duplicating code in each CPU-specific implementation.
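As a concrete illustration of this pattern, here is a minimal sketch (our own,
not part of the package), assuming the `TargetBits`-templated conventions shown
in the Quick Start and in instruction_sets.h; the `HashMany` functor and its
signature are hypothetical:

    #include <stddef.h>
    #include "highwayhash/highwayhash.h"
    using namespace highwayhash;

    // Hypothetical functor: instantiated once per CPU target. After a single
    // runtime dispatch, the whole loop runs with the hash inlined. Each
    // instantiation must live in a translation unit compiled with the
    // matching flags, per the preceding paragraph.
    template <TargetBits Target>
    struct HashMany {
      void operator()(const HHKey& key, const char* items, const size_t count,
                      const size_t item_size, HHResult64* results) const {
        for (size_t i = 0; i < count; ++i) {
          HHStateT<Target> state(key);  // cheap re-initialization per item
          HighwayHashT(&state, items + i * item_size, item_size, &results[i]);
        }
      }
    };

A caller would then presumably pay the dispatch cost once, e.g. via
`InstructionSets::Run<HashMany>(key, items, count, item_size, results)`, rather
than once per hashed item.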
## Defending against hash flooding

To mitigate hash flooding attacks, we need to take both the hash function and
the data structure into account.

We wish to defend (web) services that utilize hash sets/maps against
denial-of-service attacks. Such data structures assign attacker-controlled
input messages `m` to a hash table bin `b` by computing the hash `H(s, m)`
using a hash function `H` seeded by `s`, and mapping it to a bin with some
narrowing function `b = R(h)`, discussed below.

Attackers may attempt to trigger 'flooding' (excessive work in insertions or
lookups) by finding multiple `m` that map to the same bin. If the attacker has
local access, they can do far worse, so we assume the attacker can only issue
remote requests. If the attacker is able to send large numbers of requests,
they can already deny service, so we need only ensure the attacker's cost is
sufficiently large compared to the service's provisioning.

If the hash function is 'weak', attackers can easily generate 'hash collisions'
(inputs mapping to the same hash values) that are independent of the seed. In
other words, certain input messages will cause collisions regardless of the
seed value. The author of SipHash has published C++ programs to generate such
'universal (key-independent) multicollisions' for CityHash and Murmur. Similar
'differential' attacks are likely possible for any hash function consisting
only of reversible operations (e.g. addition/multiplication/rotation) with a
constant operand. `n` requests with such inputs cause `n^2` work for an
unprotected hash table, which is unacceptable.

By contrast, 'strong' hashes such as SipHash or HighwayHash require infeasible
attacker effort to find a hash collision (an expected 2^32 guesses of `m` per
the birthday paradox) or recover the seed (2^63 requests). These security
claims assume the seed is secret. It is reasonable to suppose `s` is initially
unknown to attackers, e.g. generated on startup or even per-connection. A
timing attack by Wool/Bar-Yosef recovers 13-bit seeds by testing all 8K
possibilities using millions of requests, which takes several days (even
assuming unrealistic 150 us round-trip times). It appears infeasible to recover
64-bit seeds in this way.

However, attackers are only looking for multiple `m` mapping to the same bin
rather than identical hash values. We assume they know or are able to discover
the hash table size `p`. It is common to choose `p = 2^i` to enable an
efficient `R(h) := h & (p - 1)`, which simply retains the lower hash bits. It
may be easier for attackers to compute partial collisions where only the lower
`i` bits match. This can be prevented by choosing a prime `p` so that
`R(h) := h % p` incorporates all hash bits. The costly modulo operation can be
avoided by multiplying with the inverse (https://goo.gl/l7ASm8). An interesting
alternative suggested by Kyoung Jae Seo chooses a random subset of the `h`
bits. Such an `R` function can be computed in just 3 cycles using PEXT from the
BMI2 instruction set. This is expected to defend against SAT-solver attacks on
the hash bits at a slightly lower cost than the multiplicative inverse method,
and still allows power-of-two table sizes.
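The three narrowing functions discussed above can be sketched as follows (a
minimal illustration with our own function names; the PEXT variant assumes a
BMI2-capable CPU and compilation with `-mbmi2`):

    #include <stdint.h>
    #include <immintrin.h>  // _pext_u64 (BMI2)

    // Power-of-two table size: keeps only the lower i bits; cheap, but
    // attackers may target partial collisions in exactly those bits.
    inline uint64_t BinMask(uint64_t h, uint64_t p_pow2) {
      return h & (p_pow2 - 1);
    }

    // Prime table size: incorporates all hash bits; the division can be
    // replaced by the multiplicative-inverse trick mentioned above.
    inline uint64_t BinPrime(uint64_t h, uint64_t p_prime) {
      return h % p_prime;
    }

    // Random-subset variant: 'mask' is a secret with i bits set; PEXT
    // gathers those bits into the low positions, so power-of-two table
    // sizes still work.
    inline uint64_t BinSubset(uint64_t h, uint64_t mask) {
      return _pext_u64(h, mask);
    }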
Summary thus far: given a strong hash function and secret seed, it appears
infeasible for attackers to generate hash collisions because `s` and/or `R` are
unknown. However, they can still observe the timings of data structure
operations for various `m`. With typical table sizes of 2^10 to 2^17 entries,
attackers can detect some 'bin collisions' (inputs mapping to the same bin).
Although this will be costly for the attacker, they can then send many
instances of such inputs, so we need to limit the resulting work for our data
structure.

Hash tables with separate chaining typically store bin entries in a linked
list, so worst-case inputs lead to unacceptable linear-time lookup cost. We
instead seek optimal asymptotic worst-case complexity for each operation
(insertion, deletion and lookups), which is a constant factor times the
logarithm of the data structure size. This naturally leads to a tree-like data
structure for each bin. The Java8 HashMap only replaces its linked list with
trees when needed. This leads to additional cost and complexity for deciding
whether a bin is a list or tree.

Our first proposal (suggested by Github user funny-falcon) avoids this overhead
by always storing one tree per bin. It may also be worthwhile to store the
first entry directly in the bin, which avoids allocating any tree nodes in the
common case where bins are sparsely populated. What kind of tree should be
used? Scapegoat and splay trees only offer amortized complexity guarantees,
whereas treaps require an entropy source and have higher constant factors in
practice. Self-balancing structures such as 2-3 or red-black trees require
additional bookkeeping information. We can hope to reduce rebalancing cost by
realizing that the output bits of strong `H` functions are uniformly
distributed. When using them as keys instead of the original message `m`,
recent relaxed balancing schemes such as left-leaning red-black or weak AVL
trees may require fewer tree rotations to maintain their invariants. Note that
`H` already determines the bin, so we should only use the remaining bits.
64-bit hashes are likely sufficient for this purpose, and HighwayHash generates
up to 256 bits. It seems unlikely that attackers can craft inputs resulting in
worst cases for both the bin index and tree key without being able to generate
hash collisions, which would contradict the security claims of strong hashes.
Even if they succeed, the relaxed tree balancing still guarantees an upper
bound on height and therefore the worst-case operation cost. For the AVL
variant, the constant factors are slightly lower than for red-black trees.

The second proposed approach uses augmented/de-amortized cuckoo hash tables
(https://goo.gl/PFwwkx). These guarantee worst-case `log n` bounds for all
operations, but only if the hash function is 'indistinguishable from random'
(uniformly distributed regardless of the input distribution), which is claimed
for SipHash and HighwayHash but certainly not for weak hashes.

Both alternatives retain good average case performance and defend against
flooding by limiting the amount of extra work an attacker can cause. The first
approach guarantees an upper bound of `log n` additional work even if the hash
function is compromised.

In summary, a strong hash function is not, by itself, sufficient to protect a
chained hash table from flooding attacks. However, strong hash functions are
important parts of two schemes for preventing denial of service. Using weak
hash functions can slightly accelerate the best-case and average-case
performance of a service, but at the risk of greatly reduced attack costs and
worst-case performance.

## Third-party implementations / bindings

Thanks to Damian Gryski for making us aware of these third-party
implementations or bindings. Please feel free to get in touch or
raise an issue and we'll add yours as well.

By | Language | URL
--- | --- | ---
Damian Gryski | Go and SSE | https://github.com/dgryski/go-highway/
Lovell Fuller | node.js bindings | https://github.com/lovell/highwayhash
Vinzent Steinberg | Rust bindings | https://github.com/vks/highwayhash-rs

## Modules

### Hashes

* c_bindings.h declares C-callable versions of SipHash/HighwayHash.
* sip_hash.cc is the compatible implementation of SipHash, and also provides
  the final reduction for sip_tree_hash.
* sip_tree_hash.cc is the faster but incompatible SIMD j-lanes tree hash.
* scalar_sip_tree_hash.cc is a non-SIMD version.
* state_helpers.h simplifies the implementation of the SipHash variants.
* highwayhash.h is our new, fast hash function.
* hh_avx2.h, hh_sse41.h and hh_portable.h are its various implementations.
* highwayhash_target.h chooses the best available implementation at runtime.

### Infrastructure

* arch_specific.h offers byte swapping and CPUID detection.
* compiler_specific.h defines some compiler-dependent language extensions.
* data_parallel.h provides a C++11 ThreadPool and PerThread (similar to
  OpenMP).
* instruction_sets.h and targets.h enable efficient CPU-specific dispatching.
* nanobenchmark.h measures elapsed times with < 1 cycle variability.
* os_specific.h sets thread affinity and priority for benchmarking.
* profiler.h is a low-overhead, deterministic hierarchical profiler.
* tsc_timer.h obtains high-resolution timestamps without CPU reordering.
* vector256.h and vector128.h contain wrapper classes for AVX2 and SSE4.1.

By Jan Wassenberg <jan.wassenberg@gmail.com> and Jyrki Alakuijala
<jyrki.alakuijala@gmail.com>, updated 2017-02-07

This is not an official Google product.
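To make the tree-per-bin proposal from the README's flooding section concrete,
here is a hedged sketch (entirely our own construction, not part of this
package): the low hash bits choose the bin, and each bin stores a balanced tree
keyed by the remaining bits, with `std::multimap` standing in for the
relaxed-balancing trees discussed there:

    #include <stddef.h>
    #include <stdint.h>
    #include <map>
    #include <string>
    #include <vector>

    class FloodResistantSet {
     public:
      explicit FloodResistantSet(size_t log2_bins)
          : bins_(size_t{1} << log2_bins), log2_bins_(log2_bins) {}

      void Insert(uint64_t hash, const std::string& m) {
        // Low bits pick the bin; the tree key uses only the remaining bits
        // because the bin index already consumed the low ones.
        bins_[hash & (bins_.size() - 1)].emplace(hash >> log2_bins_, m);
      }

      bool Contains(uint64_t hash, const std::string& m) const {
        const auto& bin = bins_[hash & (bins_.size() - 1)];
        auto range = bin.equal_range(hash >> log2_bins_);
        for (auto it = range.first; it != range.second; ++it) {
          if (it->second == m) return true;  // tolerate full-hash collisions
        }
        return false;
      }

     private:
      std::vector<std::multimap<uint64_t, std::string>> bins_;
      size_t log2_bins_;
    };

Even if an attacker finds many inputs landing in one bin, lookups in that bin
stay logarithmic in its size, which is the bound the README argues for.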
+* compiler_specific.h defines some compiler-dependent language extensions. +* data_parallel.h provides a C++11 ThreadPool and PerThread (similar to + OpenMP). +* instruction_sets.h and targets.h enable efficient CPU-specific dispatching. +* nanobenchmark.h measures elapsed times with < 1 cycle variability. +* os_specific.h sets thread affinity and priority for benchmarking. +* profiler.h is a low-overhead, deterministic hierarchical profiler. +* tsc_timer.h obtains high-resolution timestamps without CPU reordering. +* vector256.h and vector128.h contain wrapper classes for AVX2 and SSE4.1. + +By Jan Wassenberg <jan.wassenberg@gmail.com> and Jyrki Alakuijala +<jyrki.alakuijala@gmail.com>, updated 2017-02-07 + +This is not an official Google product. diff --git a/contrib/libs/highwayhash/arch/avx2/ya.make b/contrib/libs/highwayhash/arch/avx2/ya.make index 3084a352d8..df09ac249e 100644 --- a/contrib/libs/highwayhash/arch/avx2/ya.make +++ b/contrib/libs/highwayhash/arch/avx2/ya.make @@ -1,22 +1,22 @@ -LIBRARY() - +LIBRARY() + WITHOUT_LICENSE_TEXTS() LICENSE(Apache-2.0) OWNER(somov) - -ADDINCL(contrib/libs/highwayhash) - -SRCDIR(contrib/libs/highwayhash/highwayhash) - -CFLAGS(-mavx2) - -NO_COMPILER_WARNINGS() - -SRCS( - sip_tree_hash.cc - hh_avx2.cc -) - -END() + +ADDINCL(contrib/libs/highwayhash) + +SRCDIR(contrib/libs/highwayhash/highwayhash) + +CFLAGS(-mavx2) + +NO_COMPILER_WARNINGS() + +SRCS( + sip_tree_hash.cc + hh_avx2.cc +) + +END() diff --git a/contrib/libs/highwayhash/arch/sse41/ya.make b/contrib/libs/highwayhash/arch/sse41/ya.make index d94ad97038..e56731ef9a 100644 --- a/contrib/libs/highwayhash/arch/sse41/ya.make +++ b/contrib/libs/highwayhash/arch/sse41/ya.make @@ -1,21 +1,21 @@ -LIBRARY() - +LIBRARY() + WITHOUT_LICENSE_TEXTS() LICENSE(Apache-2.0) OWNER(somov) - -ADDINCL(contrib/libs/highwayhash) - -SRCDIR(contrib/libs/highwayhash/highwayhash) - -CFLAGS(-msse4.1) - -NO_COMPILER_WARNINGS() - -SRCS( - hh_sse41.cc -) - -END() + +ADDINCL(contrib/libs/highwayhash) + +SRCDIR(contrib/libs/highwayhash/highwayhash) + +CFLAGS(-msse4.1) + +NO_COMPILER_WARNINGS() + +SRCS( + hh_sse41.cc +) + +END() diff --git a/contrib/libs/highwayhash/highwayhash/arch_specific.cc b/contrib/libs/highwayhash/highwayhash/arch_specific.cc index 1ab839f58b..b8048e46ee 100644 --- a/contrib/libs/highwayhash/highwayhash/arch_specific.cc +++ b/contrib/libs/highwayhash/highwayhash/arch_specific.cc @@ -1,118 +1,118 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "highwayhash/arch_specific.h" - -#include <stdint.h> - -#if HH_ARCH_X64 && !HH_MSC_VERSION -# include <cpuid.h> -#endif - -#include <string.h> // memcpy -#include <string> - -namespace highwayhash { - -const char* TargetName(const TargetBits target_bit) { - switch (target_bit) { - case HH_TARGET_Portable: - return "Portable"; - case HH_TARGET_SSE41: - return "SSE41"; - case HH_TARGET_AVX2: - return "AVX2"; - default: - return nullptr; // zero, multiple, or unknown bits - } -} - -#if HH_ARCH_X64 - -void Cpuid(const uint32_t level, const uint32_t count, - uint32_t* HH_RESTRICT abcd) { -#if HH_MSC_VERSION - int regs[4]; - __cpuidex(regs, level, count); - for (int i = 0; i < 4; ++i) { - abcd[i] = regs[i]; - } -#else - uint32_t a, b, c, d; - __cpuid_count(level, count, a, b, c, d); - abcd[0] = a; - abcd[1] = b; - abcd[2] = c; - abcd[3] = d; -#endif -} - -uint32_t ApicId() { - uint32_t abcd[4]; - Cpuid(1, 0, abcd); - return abcd[1] >> 24; // ebx -} - -namespace { - -std::string BrandString() { - char brand_string[49]; - uint32_t abcd[4]; - - // Check if brand string is supported (it is on all reasonable Intel/AMD) - Cpuid(0x80000000U, 0, abcd); - if (abcd[0] < 0x80000004U) { - return std::string(); - } - - for (int i = 0; i < 3; ++i) { - Cpuid(0x80000002U + i, 0, abcd); - memcpy(brand_string + i * 16, &abcd, sizeof(abcd)); - } - brand_string[48] = 0; - return brand_string; -} - -double DetectInvariantCyclesPerSecond() { - const std::string& brand_string = BrandString(); - // Brand strings include the maximum configured frequency. These prefixes are - // defined by Intel CPUID documentation. - const char* prefixes[3] = {"MHz", "GHz", "THz"}; - const double multipliers[3] = {1E6, 1E9, 1E12}; - for (size_t i = 0; i < 3; ++i) { - const size_t pos_prefix = brand_string.find(prefixes[i]); - if (pos_prefix != std::string::npos) { - const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); - if (pos_space != std::string::npos) { - const std::string digits = - brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); - return std::stod(digits) * multipliers[i]; - } - } - } - - return 0.0; -} - -} // namespace - -double InvariantCyclesPerSecond() { - // Thread-safe caching - this is called several times. - static const double cycles_per_second = DetectInvariantCyclesPerSecond(); - return cycles_per_second; -} - -#endif // HH_ARCH_X64 - -} // namespace highwayhash +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "highwayhash/arch_specific.h" + +#include <stdint.h> + +#if HH_ARCH_X64 && !HH_MSC_VERSION +# include <cpuid.h> +#endif + +#include <string.h> // memcpy +#include <string> + +namespace highwayhash { + +const char* TargetName(const TargetBits target_bit) { + switch (target_bit) { + case HH_TARGET_Portable: + return "Portable"; + case HH_TARGET_SSE41: + return "SSE41"; + case HH_TARGET_AVX2: + return "AVX2"; + default: + return nullptr; // zero, multiple, or unknown bits + } +} + +#if HH_ARCH_X64 + +void Cpuid(const uint32_t level, const uint32_t count, + uint32_t* HH_RESTRICT abcd) { +#if HH_MSC_VERSION + int regs[4]; + __cpuidex(regs, level, count); + for (int i = 0; i < 4; ++i) { + abcd[i] = regs[i]; + } +#else + uint32_t a, b, c, d; + __cpuid_count(level, count, a, b, c, d); + abcd[0] = a; + abcd[1] = b; + abcd[2] = c; + abcd[3] = d; +#endif +} + +uint32_t ApicId() { + uint32_t abcd[4]; + Cpuid(1, 0, abcd); + return abcd[1] >> 24; // ebx +} + +namespace { + +std::string BrandString() { + char brand_string[49]; + uint32_t abcd[4]; + + // Check if brand string is supported (it is on all reasonable Intel/AMD) + Cpuid(0x80000000U, 0, abcd); + if (abcd[0] < 0x80000004U) { + return std::string(); + } + + for (int i = 0; i < 3; ++i) { + Cpuid(0x80000002U + i, 0, abcd); + memcpy(brand_string + i * 16, &abcd, sizeof(abcd)); + } + brand_string[48] = 0; + return brand_string; +} + +double DetectInvariantCyclesPerSecond() { + const std::string& brand_string = BrandString(); + // Brand strings include the maximum configured frequency. These prefixes are + // defined by Intel CPUID documentation. + const char* prefixes[3] = {"MHz", "GHz", "THz"}; + const double multipliers[3] = {1E6, 1E9, 1E12}; + for (size_t i = 0; i < 3; ++i) { + const size_t pos_prefix = brand_string.find(prefixes[i]); + if (pos_prefix != std::string::npos) { + const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); + if (pos_space != std::string::npos) { + const std::string digits = + brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); + return std::stod(digits) * multipliers[i]; + } + } + } + + return 0.0; +} + +} // namespace + +double InvariantCyclesPerSecond() { + // Thread-safe caching - this is called several times. + static const double cycles_per_second = DetectInvariantCyclesPerSecond(); + return cycles_per_second; +} + +#endif // HH_ARCH_X64 + +} // namespace highwayhash diff --git a/contrib/libs/highwayhash/highwayhash/arch_specific.h b/contrib/libs/highwayhash/highwayhash/arch_specific.h index 7419d8ebbc..9fce08bd85 100644 --- a/contrib/libs/highwayhash/highwayhash/arch_specific.h +++ b/contrib/libs/highwayhash/highwayhash/arch_specific.h @@ -1,153 +1,153 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_ARCH_SPECIFIC_H_ -#define HIGHWAYHASH_ARCH_SPECIFIC_H_ - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. 
This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. -// -// Background: older GCC/Clang require flags such as -mavx2 before AVX2 SIMD -// intrinsics can be used. These intrinsics are only used within blocks that -// first verify CPU capabilities. However, the flag also allows the compiler to -// generate AVX2 code in other places. This can violate the One Definition Rule, -// which requires multiple instances of a function with external linkage -// (e.g. extern inline in a header) to be "equivalent". To prevent the resulting -// crashes on non-AVX2 CPUs, any header (transitively) included from a -// translation unit compiled with different flags is "restricted". This means -// all function definitions must have internal linkage (e.g. static inline), or -// reside in namespace HH_TARGET_NAME, which expands to a name unique to the -// current compiler flags. -// -// Most C system headers are safe to include, but C++ headers should generally -// be avoided because they often do not specify static linkage and cannot -// reliably be wrapped in a namespace. - -#include "highwayhash/compiler_specific.h" - -#include <stdint.h> - -#if HH_MSC_VERSION -#include <intrin.h> // _byteswap_* -#endif - -namespace highwayhash { - -#if defined(__x86_64__) || defined(_M_X64) -#define HH_ARCH_X64 1 -#else -#define HH_ARCH_X64 0 -#endif - -#ifdef __aarch64__ -#define HH_ARCH_AARCH64 1 -#else -#define HH_ARCH_AARCH64 0 -#endif - -#if defined(__powerpc64__) || defined(_M_PPC) -#define HH_ARCH_PPC 1 -#else -#define HH_ARCH_PPC 0 -#endif - -// Target := instruction set extension(s) such as SSE41. A translation unit can -// only provide a single target-specific implementation because they require -// different compiler flags. - -// Either the build system specifies the target by defining HH_TARGET_NAME -// (which is necessary for Portable on X64, and SSE41 on MSVC), or we'll choose -// the most efficient one that can be compiled given the current flags: -#ifndef HH_TARGET_NAME - -// To avoid excessive code size and dispatch overhead, we only support a few -// groups of extensions, e.g. FMA+BMI2+AVX+AVX2 =: "AVX2". These names must -// match the HH_TARGET_* suffixes below. -#ifdef __AVX2__ -#define HH_TARGET_NAME AVX2 -#elif defined(__SSE4_1__) -#define HH_TARGET_NAME SSE41 -#else -#define HH_TARGET_NAME Portable -#endif - -#endif // HH_TARGET_NAME - -#define HH_CONCAT(first, second) first##second -// Required due to macro expansion rules. -#define HH_EXPAND_CONCAT(first, second) HH_CONCAT(first, second) -// Appends HH_TARGET_NAME to "identifier_prefix". -#define HH_ADD_TARGET_SUFFIX(identifier_prefix) \ - HH_EXPAND_CONCAT(identifier_prefix, HH_TARGET_NAME) - -// HH_TARGET expands to an integer constant. Typical usage: HHStateT<HH_TARGET>. -// This ensures your code will work correctly when compiler flags are changed, -// and benefit from subsequently added targets/specializations. -#define HH_TARGET HH_ADD_TARGET_SUFFIX(HH_TARGET_) - -// Deprecated former name of HH_TARGET; please use HH_TARGET instead. -#define HH_TARGET_PREFERRED HH_TARGET - -// Associate targets with integer literals so the preprocessor can compare them -// with HH_TARGET. Do not instantiate templates with these values - use -// HH_TARGET instead. Must be unique powers of two, see TargetBits. Always -// defined even if unavailable on this HH_ARCH to allow calling TargetName. -// The suffixes must match the HH_TARGET_NAME identifiers. 
-#define HH_TARGET_Portable 1 -#define HH_TARGET_SSE41 2 -#define HH_TARGET_AVX2 4 - -// Bit array for one or more HH_TARGET_*. Used to indicate which target(s) are -// supported or were called by InstructionSets::RunAll. -using TargetBits = unsigned; - -namespace HH_TARGET_NAME { - -// Calls func(bit_value) for every nonzero bit in "bits". -template <class Func> -void ForeachTarget(TargetBits bits, const Func& func) { - while (bits != 0) { - const TargetBits lowest = bits & (~bits + 1); - func(lowest); - bits &= ~lowest; - } -} - -} // namespace HH_TARGET_NAME - -// Returns a brief human-readable string literal identifying one of the above -// bits, or nullptr if zero, multiple, or unknown bits are set. -const char* TargetName(const TargetBits target_bit); - -#if HH_ARCH_X64 - -// Calls CPUID instruction with eax=level and ecx=count and returns the result -// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd). -void Cpuid(const uint32_t level, const uint32_t count, - uint32_t* HH_RESTRICT abcd); - -// Returns the APIC ID of the CPU on which we're currently running. -uint32_t ApicId(); - -// Returns nominal CPU clock frequency for converting tsc_timer cycles to -// seconds. This is unaffected by CPU throttling ("invariant"). Thread-safe. -double InvariantCyclesPerSecond(); - -#endif // HH_ARCH_X64 - -} // namespace highwayhash - -#endif // HIGHWAYHASH_ARCH_SPECIFIC_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_ARCH_SPECIFIC_H_ +#define HIGHWAYHASH_ARCH_SPECIFIC_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. +// +// Background: older GCC/Clang require flags such as -mavx2 before AVX2 SIMD +// intrinsics can be used. These intrinsics are only used within blocks that +// first verify CPU capabilities. However, the flag also allows the compiler to +// generate AVX2 code in other places. This can violate the One Definition Rule, +// which requires multiple instances of a function with external linkage +// (e.g. extern inline in a header) to be "equivalent". To prevent the resulting +// crashes on non-AVX2 CPUs, any header (transitively) included from a +// translation unit compiled with different flags is "restricted". This means +// all function definitions must have internal linkage (e.g. static inline), or +// reside in namespace HH_TARGET_NAME, which expands to a name unique to the +// current compiler flags. +// +// Most C system headers are safe to include, but C++ headers should generally +// be avoided because they often do not specify static linkage and cannot +// reliably be wrapped in a namespace. 
+ +#include "highwayhash/compiler_specific.h" + +#include <stdint.h> + +#if HH_MSC_VERSION +#include <intrin.h> // _byteswap_* +#endif + +namespace highwayhash { + +#if defined(__x86_64__) || defined(_M_X64) +#define HH_ARCH_X64 1 +#else +#define HH_ARCH_X64 0 +#endif + +#ifdef __aarch64__ +#define HH_ARCH_AARCH64 1 +#else +#define HH_ARCH_AARCH64 0 +#endif + +#if defined(__powerpc64__) || defined(_M_PPC) +#define HH_ARCH_PPC 1 +#else +#define HH_ARCH_PPC 0 +#endif + +// Target := instruction set extension(s) such as SSE41. A translation unit can +// only provide a single target-specific implementation because they require +// different compiler flags. + +// Either the build system specifies the target by defining HH_TARGET_NAME +// (which is necessary for Portable on X64, and SSE41 on MSVC), or we'll choose +// the most efficient one that can be compiled given the current flags: +#ifndef HH_TARGET_NAME + +// To avoid excessive code size and dispatch overhead, we only support a few +// groups of extensions, e.g. FMA+BMI2+AVX+AVX2 =: "AVX2". These names must +// match the HH_TARGET_* suffixes below. +#ifdef __AVX2__ +#define HH_TARGET_NAME AVX2 +#elif defined(__SSE4_1__) +#define HH_TARGET_NAME SSE41 +#else +#define HH_TARGET_NAME Portable +#endif + +#endif // HH_TARGET_NAME + +#define HH_CONCAT(first, second) first##second +// Required due to macro expansion rules. +#define HH_EXPAND_CONCAT(first, second) HH_CONCAT(first, second) +// Appends HH_TARGET_NAME to "identifier_prefix". +#define HH_ADD_TARGET_SUFFIX(identifier_prefix) \ + HH_EXPAND_CONCAT(identifier_prefix, HH_TARGET_NAME) + +// HH_TARGET expands to an integer constant. Typical usage: HHStateT<HH_TARGET>. +// This ensures your code will work correctly when compiler flags are changed, +// and benefit from subsequently added targets/specializations. +#define HH_TARGET HH_ADD_TARGET_SUFFIX(HH_TARGET_) + +// Deprecated former name of HH_TARGET; please use HH_TARGET instead. +#define HH_TARGET_PREFERRED HH_TARGET + +// Associate targets with integer literals so the preprocessor can compare them +// with HH_TARGET. Do not instantiate templates with these values - use +// HH_TARGET instead. Must be unique powers of two, see TargetBits. Always +// defined even if unavailable on this HH_ARCH to allow calling TargetName. +// The suffixes must match the HH_TARGET_NAME identifiers. +#define HH_TARGET_Portable 1 +#define HH_TARGET_SSE41 2 +#define HH_TARGET_AVX2 4 + +// Bit array for one or more HH_TARGET_*. Used to indicate which target(s) are +// supported or were called by InstructionSets::RunAll. +using TargetBits = unsigned; + +namespace HH_TARGET_NAME { + +// Calls func(bit_value) for every nonzero bit in "bits". +template <class Func> +void ForeachTarget(TargetBits bits, const Func& func) { + while (bits != 0) { + const TargetBits lowest = bits & (~bits + 1); + func(lowest); + bits &= ~lowest; + } +} + +} // namespace HH_TARGET_NAME + +// Returns a brief human-readable string literal identifying one of the above +// bits, or nullptr if zero, multiple, or unknown bits are set. +const char* TargetName(const TargetBits target_bit); + +#if HH_ARCH_X64 + +// Calls CPUID instruction with eax=level and ecx=count and returns the result +// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd). +void Cpuid(const uint32_t level, const uint32_t count, + uint32_t* HH_RESTRICT abcd); + +// Returns the APIC ID of the CPU on which we're currently running. 
+uint32_t ApicId(); + +// Returns nominal CPU clock frequency for converting tsc_timer cycles to +// seconds. This is unaffected by CPU throttling ("invariant"). Thread-safe. +double InvariantCyclesPerSecond(); + +#endif // HH_ARCH_X64 + +} // namespace highwayhash + +#endif // HIGHWAYHASH_ARCH_SPECIFIC_H_ diff --git a/contrib/libs/highwayhash/highwayhash/benchmark.cc b/contrib/libs/highwayhash/highwayhash/benchmark.cc index 0422690872..7279b295b9 100644 --- a/contrib/libs/highwayhash/highwayhash/benchmark.cc +++ b/contrib/libs/highwayhash/highwayhash/benchmark.cc @@ -1,313 +1,313 @@ -// Copyright 2016 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Measures hash function throughput for various input sizes. - -#include <algorithm> -#include <cassert> -#include <cstddef> -#include <cstdio> -#include <cstdlib> -#include <map> -#include <string> -#include <utility> -#include <vector> - -#include "highwayhash/compiler_specific.h" -#include "highwayhash/instruction_sets.h" -#include "highwayhash/nanobenchmark.h" -#include "highwayhash/os_specific.h" -#include "highwayhash/robust_statistics.h" - -// Which functions to enable (includes check for compiler support) -#define BENCHMARK_SIP 0 -#define BENCHMARK_SIP_TREE 0 -#define BENCHMARK_HIGHWAY 1 -#define BENCHMARK_HIGHWAY_CAT 1 -#define BENCHMARK_FARM 0 - -#include "highwayhash/highwayhash_test_target.h" -#if BENCHMARK_SIP -#include "highwayhash/sip_hash.h" -#endif -#if BENCHMARK_SIP_TREE -#include "highwayhash/scalar_sip_tree_hash.h" -#include "highwayhash/sip_tree_hash.h" -#endif -#if BENCHMARK_FARM -#include "third_party/farmhash/src/farmhash.h" -#endif - -namespace highwayhash { -namespace { - -// Stores time measurements from benchmarks, with support for printing them -// as LaTeX figures or tables. -class Measurements { - public: - void Add(const char* caption, const size_t bytes, const double cycles) { - const float cpb = static_cast<float>(cycles / bytes); - results_.emplace_back(caption, static_cast<int>(bytes), cpb); - } - - // Prints results as a LaTeX table (only for in_sizes matching the - // desired values). - void PrintTable(const std::vector<size_t>& in_sizes) { - std::vector<size_t> unique = in_sizes; - std::sort(unique.begin(), unique.end()); - unique.erase(std::unique(unique.begin(), unique.end()), unique.end()); - - printf("\\begin{tabular}{"); - for (size_t i = 0; i < unique.size() + 1; ++i) { - printf("%s", i == 0 ? "r" : "|r"); - } - printf("}\n\\toprule\nAlgorithm"); - for (const size_t in_size : unique) { - printf(" & %zu", in_size); - } - printf("\\\\\n\\midrule\n"); - - const SpeedsForCaption cpb_for_caption = SortByCaptionFilterBySize(unique); - for (const auto& item : cpb_for_caption) { - printf("%22s", item.first.c_str()); - for (const float cpb : item.second) { - printf(" & %5.2f", cpb); - } - printf("\\\\\n"); - } - } - - // Prints results suitable for pgfplots. 
- void PrintPlots() { - const SpeedsForCaption cpb_for_caption = SortByCaption(); - assert(!cpb_for_caption.empty()); - const size_t num_sizes = cpb_for_caption.begin()->second.size(); - - printf("Size "); - // Flatten per-caption vectors into one iterator. - std::vector<std::vector<float>::const_iterator> iterators; - for (const auto& item : cpb_for_caption) { - printf("%21s ", item.first.c_str()); - assert(item.second.size() == num_sizes); - iterators.push_back(item.second.begin()); - } - printf("\n"); - - const std::vector<int>& sizes = UniqueSizes(); - assert(num_sizes == sizes.size()); - for (int i = 0; i < static_cast<int>(num_sizes); ++i) { - printf("%d ", sizes[i]); - for (auto& it : iterators) { - printf("%5.2f ", 1.0f / *it); // bytes per cycle - ++it; - } - printf("\n"); - } - } - - private: - struct Result { - Result(const char* caption, const int in_size, const float cpb) - : caption(caption), in_size(in_size), cpb(cpb) {} - - // Algorithm name. - std::string caption; - // Size of the input data [bytes]. - int in_size; - // Measured throughput [cycles per byte]. - float cpb; - }; - - // Returns set of all input sizes for the first column of a size/speed plot. - std::vector<int> UniqueSizes() { - std::vector<int> sizes; - sizes.reserve(results_.size()); - for (const Result& result : results_) { - sizes.push_back(result.in_size); - } - std::sort(sizes.begin(), sizes.end()); - sizes.erase(std::unique(sizes.begin(), sizes.end()), sizes.end()); - return sizes; - } - - using SpeedsForCaption = std::map<std::string, std::vector<float>>; - - SpeedsForCaption SortByCaption() const { - SpeedsForCaption cpb_for_caption; - for (const Result& result : results_) { - cpb_for_caption[result.caption].push_back(result.cpb); - } - return cpb_for_caption; - } - - // Only includes measurement results matching one of the given sizes. - SpeedsForCaption SortByCaptionFilterBySize( - const std::vector<size_t>& in_sizes) const { - SpeedsForCaption cpb_for_caption; - for (const Result& result : results_) { - for (const size_t in_size : in_sizes) { - if (result.in_size == static_cast<int>(in_size)) { - cpb_for_caption[result.caption].push_back(result.cpb); - } - } - } - return cpb_for_caption; - } - - std::vector<Result> results_; -}; - -void AddMeasurements(DurationsForInputs* input_map, const char* caption, - Measurements* measurements) { - for (size_t i = 0; i < input_map->num_items; ++i) { - const DurationsForInputs::Item& item = input_map->items[i]; - std::vector<float> durations(item.durations, - item.durations + item.num_durations); - const float median = Median(&durations); - const float variability = MedianAbsoluteDeviation(durations, median); - printf("%s %4zu: median=%6.1f cycles; median L1 norm =%4.1f cycles\n", - caption, item.input, median, variability); - measurements->Add(caption, item.input, median); - } - input_map->num_items = 0; -} - -#if BENCHMARK_SIP || BENCHMARK_FARM || (BENCHMARK_SIP_TREE && defined(__AVX2__)) - -void MeasureAndAdd(DurationsForInputs* input_map, const char* caption, - const Func func, Measurements* measurements) { - MeasureDurations(func, input_map); - AddMeasurements(input_map, caption, measurements); -} - -#endif - -// InstructionSets::RunAll callback. 
-void AddMeasurementsWithPrefix(const char* prefix, const char* target_name, - DurationsForInputs* input_map, void* context) { - std::string caption(prefix); - caption += target_name; - AddMeasurements(input_map, caption.c_str(), - static_cast<Measurements*>(context)); -} - -#if BENCHMARK_SIP - -uint64_t RunSip(const size_t size) { - const HH_U64 key2[2] HH_ALIGNAS(16) = {0, 1}; - char in[kMaxBenchmarkInputSize]; - memcpy(in, &size, sizeof(size)); - return SipHash(key2, in, size); -} - -uint64_t RunSip13(const size_t size) { - const HH_U64 key2[2] HH_ALIGNAS(16) = {0, 1}; - char in[kMaxBenchmarkInputSize]; - memcpy(in, &size, sizeof(size)); - return SipHash13(key2, in, size); -} - -#endif - -#if BENCHMARK_SIP_TREE - -uint64_t RunSipTree(const size_t size) { - const HH_U64 key4[4] HH_ALIGNAS(32) = {0, 1, 2, 3}; - char in[kMaxBenchmarkInputSize]; - memcpy(in, &size, sizeof(size)); - return SipTreeHash(key4, in, size); -} - -uint64_t RunSipTree13(const size_t size) { - const HH_U64 key4[4] HH_ALIGNAS(32) = {0, 1, 2, 3}; - char in[kMaxBenchmarkInputSize]; - memcpy(in, &size, sizeof(size)); - return SipTreeHash13(key4, in, size); -} - -#endif - -#if BENCHMARK_FARM - -uint64_t RunFarm(const size_t size) { - char in[kMaxBenchmarkInputSize]; - memcpy(in, &size, sizeof(size)); - return farmhash::Fingerprint64(reinterpret_cast<const char*>(in), size); -} - -#endif - -void AddMeasurements(const std::vector<size_t>& in_sizes, - Measurements* measurements) { - DurationsForInputs input_map(in_sizes.data(), in_sizes.size(), 40); -#if BENCHMARK_SIP - MeasureAndAdd(&input_map, "SipHash", RunSip, measurements); - MeasureAndAdd(&input_map, "SipHash13", RunSip13, measurements); -#endif - -#if BENCHMARK_SIP_TREE && defined(__AVX2__) - MeasureAndAdd(&input_map, "SipTreeHash", RunSipTree, measurements); - MeasureAndAdd(&input_map, "SipTreeHash13", RunSipTree13, measurements); -#endif - -#if BENCHMARK_FARM - MeasureAndAdd(&input_map, "Farm", &RunFarm, measurements); -#endif - -#if BENCHMARK_HIGHWAY - InstructionSets::RunAll<HighwayHashBenchmark>( - &input_map, &AddMeasurementsWithPrefix, measurements); -#endif - -#if BENCHMARK_HIGHWAY_CAT - InstructionSets::RunAll<HighwayHashCatBenchmark>( - &input_map, &AddMeasurementsWithPrefix, measurements); -#endif -} - -void PrintTable() { - const std::vector<size_t> in_sizes = { - 7, 8, 31, 32, 63, 64, kMaxBenchmarkInputSize}; - Measurements measurements; - AddMeasurements(in_sizes, &measurements); - measurements.PrintTable(in_sizes); -} - -void PrintPlots() { - std::vector<size_t> in_sizes; - for (int num_vectors = 0; num_vectors < 12; ++num_vectors) { - for (int remainder : {0, 9, 18, 27}) { - in_sizes.push_back(num_vectors * 32 + remainder); - assert(in_sizes.back() <= kMaxBenchmarkInputSize); - } - } - - Measurements measurements; - AddMeasurements(in_sizes, &measurements); - measurements.PrintPlots(); -} - -} // namespace -} // namespace highwayhash - -int main(int argc, char* argv[]) { - highwayhash::PinThreadToRandomCPU(); - // No argument or t => table - if (argc < 2 || argv[1][0] == 't') { - highwayhash::PrintTable(); - } else if (argv[1][0] == 'p') { - highwayhash::PrintPlots(); - } - return 0; -} +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Measures hash function throughput for various input sizes. + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdio> +#include <cstdlib> +#include <map> +#include <string> +#include <utility> +#include <vector> + +#include "highwayhash/compiler_specific.h" +#include "highwayhash/instruction_sets.h" +#include "highwayhash/nanobenchmark.h" +#include "highwayhash/os_specific.h" +#include "highwayhash/robust_statistics.h" + +// Which functions to enable (includes check for compiler support) +#define BENCHMARK_SIP 0 +#define BENCHMARK_SIP_TREE 0 +#define BENCHMARK_HIGHWAY 1 +#define BENCHMARK_HIGHWAY_CAT 1 +#define BENCHMARK_FARM 0 + +#include "highwayhash/highwayhash_test_target.h" +#if BENCHMARK_SIP +#include "highwayhash/sip_hash.h" +#endif +#if BENCHMARK_SIP_TREE +#include "highwayhash/scalar_sip_tree_hash.h" +#include "highwayhash/sip_tree_hash.h" +#endif +#if BENCHMARK_FARM +#include "third_party/farmhash/src/farmhash.h" +#endif + +namespace highwayhash { +namespace { + +// Stores time measurements from benchmarks, with support for printing them +// as LaTeX figures or tables. +class Measurements { + public: + void Add(const char* caption, const size_t bytes, const double cycles) { + const float cpb = static_cast<float>(cycles / bytes); + results_.emplace_back(caption, static_cast<int>(bytes), cpb); + } + + // Prints results as a LaTeX table (only for in_sizes matching the + // desired values). + void PrintTable(const std::vector<size_t>& in_sizes) { + std::vector<size_t> unique = in_sizes; + std::sort(unique.begin(), unique.end()); + unique.erase(std::unique(unique.begin(), unique.end()), unique.end()); + + printf("\\begin{tabular}{"); + for (size_t i = 0; i < unique.size() + 1; ++i) { + printf("%s", i == 0 ? "r" : "|r"); + } + printf("}\n\\toprule\nAlgorithm"); + for (const size_t in_size : unique) { + printf(" & %zu", in_size); + } + printf("\\\\\n\\midrule\n"); + + const SpeedsForCaption cpb_for_caption = SortByCaptionFilterBySize(unique); + for (const auto& item : cpb_for_caption) { + printf("%22s", item.first.c_str()); + for (const float cpb : item.second) { + printf(" & %5.2f", cpb); + } + printf("\\\\\n"); + } + } + + // Prints results suitable for pgfplots. + void PrintPlots() { + const SpeedsForCaption cpb_for_caption = SortByCaption(); + assert(!cpb_for_caption.empty()); + const size_t num_sizes = cpb_for_caption.begin()->second.size(); + + printf("Size "); + // Flatten per-caption vectors into one iterator. 
+ std::vector<std::vector<float>::const_iterator> iterators; + for (const auto& item : cpb_for_caption) { + printf("%21s ", item.first.c_str()); + assert(item.second.size() == num_sizes); + iterators.push_back(item.second.begin()); + } + printf("\n"); + + const std::vector<int>& sizes = UniqueSizes(); + assert(num_sizes == sizes.size()); + for (int i = 0; i < static_cast<int>(num_sizes); ++i) { + printf("%d ", sizes[i]); + for (auto& it : iterators) { + printf("%5.2f ", 1.0f / *it); // bytes per cycle + ++it; + } + printf("\n"); + } + } + + private: + struct Result { + Result(const char* caption, const int in_size, const float cpb) + : caption(caption), in_size(in_size), cpb(cpb) {} + + // Algorithm name. + std::string caption; + // Size of the input data [bytes]. + int in_size; + // Measured throughput [cycles per byte]. + float cpb; + }; + + // Returns set of all input sizes for the first column of a size/speed plot. + std::vector<int> UniqueSizes() { + std::vector<int> sizes; + sizes.reserve(results_.size()); + for (const Result& result : results_) { + sizes.push_back(result.in_size); + } + std::sort(sizes.begin(), sizes.end()); + sizes.erase(std::unique(sizes.begin(), sizes.end()), sizes.end()); + return sizes; + } + + using SpeedsForCaption = std::map<std::string, std::vector<float>>; + + SpeedsForCaption SortByCaption() const { + SpeedsForCaption cpb_for_caption; + for (const Result& result : results_) { + cpb_for_caption[result.caption].push_back(result.cpb); + } + return cpb_for_caption; + } + + // Only includes measurement results matching one of the given sizes. + SpeedsForCaption SortByCaptionFilterBySize( + const std::vector<size_t>& in_sizes) const { + SpeedsForCaption cpb_for_caption; + for (const Result& result : results_) { + for (const size_t in_size : in_sizes) { + if (result.in_size == static_cast<int>(in_size)) { + cpb_for_caption[result.caption].push_back(result.cpb); + } + } + } + return cpb_for_caption; + } + + std::vector<Result> results_; +}; + +void AddMeasurements(DurationsForInputs* input_map, const char* caption, + Measurements* measurements) { + for (size_t i = 0; i < input_map->num_items; ++i) { + const DurationsForInputs::Item& item = input_map->items[i]; + std::vector<float> durations(item.durations, + item.durations + item.num_durations); + const float median = Median(&durations); + const float variability = MedianAbsoluteDeviation(durations, median); + printf("%s %4zu: median=%6.1f cycles; median L1 norm =%4.1f cycles\n", + caption, item.input, median, variability); + measurements->Add(caption, item.input, median); + } + input_map->num_items = 0; +} + +#if BENCHMARK_SIP || BENCHMARK_FARM || (BENCHMARK_SIP_TREE && defined(__AVX2__)) + +void MeasureAndAdd(DurationsForInputs* input_map, const char* caption, + const Func func, Measurements* measurements) { + MeasureDurations(func, input_map); + AddMeasurements(input_map, caption, measurements); +} + +#endif + +// InstructionSets::RunAll callback. 
+void AddMeasurementsWithPrefix(const char* prefix, const char* target_name, + DurationsForInputs* input_map, void* context) { + std::string caption(prefix); + caption += target_name; + AddMeasurements(input_map, caption.c_str(), + static_cast<Measurements*>(context)); +} + +#if BENCHMARK_SIP + +uint64_t RunSip(const size_t size) { + const HH_U64 key2[2] HH_ALIGNAS(16) = {0, 1}; + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return SipHash(key2, in, size); +} + +uint64_t RunSip13(const size_t size) { + const HH_U64 key2[2] HH_ALIGNAS(16) = {0, 1}; + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return SipHash13(key2, in, size); +} + +#endif + +#if BENCHMARK_SIP_TREE + +uint64_t RunSipTree(const size_t size) { + const HH_U64 key4[4] HH_ALIGNAS(32) = {0, 1, 2, 3}; + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return SipTreeHash(key4, in, size); +} + +uint64_t RunSipTree13(const size_t size) { + const HH_U64 key4[4] HH_ALIGNAS(32) = {0, 1, 2, 3}; + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return SipTreeHash13(key4, in, size); +} + +#endif + +#if BENCHMARK_FARM + +uint64_t RunFarm(const size_t size) { + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return farmhash::Fingerprint64(reinterpret_cast<const char*>(in), size); +} + +#endif + +void AddMeasurements(const std::vector<size_t>& in_sizes, + Measurements* measurements) { + DurationsForInputs input_map(in_sizes.data(), in_sizes.size(), 40); +#if BENCHMARK_SIP + MeasureAndAdd(&input_map, "SipHash", RunSip, measurements); + MeasureAndAdd(&input_map, "SipHash13", RunSip13, measurements); +#endif + +#if BENCHMARK_SIP_TREE && defined(__AVX2__) + MeasureAndAdd(&input_map, "SipTreeHash", RunSipTree, measurements); + MeasureAndAdd(&input_map, "SipTreeHash13", RunSipTree13, measurements); +#endif + +#if BENCHMARK_FARM + MeasureAndAdd(&input_map, "Farm", &RunFarm, measurements); +#endif + +#if BENCHMARK_HIGHWAY + InstructionSets::RunAll<HighwayHashBenchmark>( + &input_map, &AddMeasurementsWithPrefix, measurements); +#endif + +#if BENCHMARK_HIGHWAY_CAT + InstructionSets::RunAll<HighwayHashCatBenchmark>( + &input_map, &AddMeasurementsWithPrefix, measurements); +#endif +} + +void PrintTable() { + const std::vector<size_t> in_sizes = { + 7, 8, 31, 32, 63, 64, kMaxBenchmarkInputSize}; + Measurements measurements; + AddMeasurements(in_sizes, &measurements); + measurements.PrintTable(in_sizes); +} + +void PrintPlots() { + std::vector<size_t> in_sizes; + for (int num_vectors = 0; num_vectors < 12; ++num_vectors) { + for (int remainder : {0, 9, 18, 27}) { + in_sizes.push_back(num_vectors * 32 + remainder); + assert(in_sizes.back() <= kMaxBenchmarkInputSize); + } + } + + Measurements measurements; + AddMeasurements(in_sizes, &measurements); + measurements.PrintPlots(); +} + +} // namespace +} // namespace highwayhash + +int main(int argc, char* argv[]) { + highwayhash::PinThreadToRandomCPU(); + // No argument or t => table + if (argc < 2 || argv[1][0] == 't') { + highwayhash::PrintTable(); + } else if (argv[1][0] == 'p') { + highwayhash::PrintPlots(); + } + return 0; +} diff --git a/contrib/libs/highwayhash/highwayhash/c_bindings.cc b/contrib/libs/highwayhash/highwayhash/c_bindings.cc index 7e0488fb46..21d5c3652d 100644 --- a/contrib/libs/highwayhash/highwayhash/c_bindings.cc +++ b/contrib/libs/highwayhash/highwayhash/c_bindings.cc @@ -1,35 +1,35 @@ -// Copyright 2017 Google Inc. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "highwayhash/c_bindings.h" - -#include "highwayhash/highwayhash_target.h" -#include "highwayhash/instruction_sets.h" - -using highwayhash::InstructionSets; -using highwayhash::HighwayHash; - -extern "C" { - -// Ideally this would reside in highwayhash_target.cc, but that file is -// compiled multiple times and we must only define this function once. -uint64_t HighwayHash64(const HHKey key, const char* bytes, - const uint64_t size) { - HHResult64 result; - InstructionSets::Run<HighwayHash>(*reinterpret_cast<const HHKey*>(key), bytes, - size, &result); - return result; -} - -} // extern "C" +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "highwayhash/c_bindings.h" + +#include "highwayhash/highwayhash_target.h" +#include "highwayhash/instruction_sets.h" + +using highwayhash::InstructionSets; +using highwayhash::HighwayHash; + +extern "C" { + +// Ideally this would reside in highwayhash_target.cc, but that file is +// compiled multiple times and we must only define this function once. +uint64_t HighwayHash64(const HHKey key, const char* bytes, + const uint64_t size) { + HHResult64 result; + InstructionSets::Run<HighwayHash>(*reinterpret_cast<const HHKey*>(key), bytes, + size, &result); + return result; +} + +} // extern "C" diff --git a/contrib/libs/highwayhash/highwayhash/c_bindings.h b/contrib/libs/highwayhash/highwayhash/c_bindings.h index 7d52de7d75..dd24019041 100644 --- a/contrib/libs/highwayhash/highwayhash/c_bindings.h +++ b/contrib/libs/highwayhash/highwayhash/c_bindings.h @@ -1,55 +1,55 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_HIGHWAYHASH_C_BINDINGS_H_ -#define HIGHWAYHASH_HIGHWAYHASH_C_BINDINGS_H_ - -// C-callable function prototypes, documented in the other header files. 
- -#include <stdint.h> - -#include "hh_types.h" - -#ifdef __cplusplus -extern "C" { - -// Bring the symbols out of the namespace. -using highwayhash::HHKey; -using highwayhash::HHPacket; -using highwayhash::HHResult64; -using highwayhash::HHResult128; -using highwayhash::HHResult256; -#endif - -uint64_t SipHashC(const uint64_t* key, const char* bytes, const uint64_t size); -uint64_t SipHash13C(const uint64_t* key, const char* bytes, - const uint64_t size); - -// Uses the best implementation of HighwayHash for the current CPU and -// calculates 64-bit hash of given data. -uint64_t HighwayHash64(const HHKey key, const char* bytes, const uint64_t size); - -// Defined by highwayhash_target.cc, which requires a _Target* suffix. -uint64_t HighwayHash64_TargetPortable(const HHKey key, const char* bytes, - const uint64_t size); -uint64_t HighwayHash64_TargetSSE41(const HHKey key, const char* bytes, - const uint64_t size); -uint64_t HighwayHash64_TargetAVX2(const HHKey key, const char* bytes, - const uint64_t size); - -#ifdef __cplusplus -} -#endif - -#endif // HIGHWAYHASH_HIGHWAYHASH_C_BINDINGS_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_HIGHWAYHASH_C_BINDINGS_H_ +#define HIGHWAYHASH_HIGHWAYHASH_C_BINDINGS_H_ + +// C-callable function prototypes, documented in the other header files. + +#include <stdint.h> + +#include "hh_types.h" + +#ifdef __cplusplus +extern "C" { + +// Bring the symbols out of the namespace. +using highwayhash::HHKey; +using highwayhash::HHPacket; +using highwayhash::HHResult64; +using highwayhash::HHResult128; +using highwayhash::HHResult256; +#endif + +uint64_t SipHashC(const uint64_t* key, const char* bytes, const uint64_t size); +uint64_t SipHash13C(const uint64_t* key, const char* bytes, + const uint64_t size); + +// Uses the best implementation of HighwayHash for the current CPU and +// calculates 64-bit hash of given data. +uint64_t HighwayHash64(const HHKey key, const char* bytes, const uint64_t size); + +// Defined by highwayhash_target.cc, which requires a _Target* suffix. +uint64_t HighwayHash64_TargetPortable(const HHKey key, const char* bytes, + const uint64_t size); +uint64_t HighwayHash64_TargetSSE41(const HHKey key, const char* bytes, + const uint64_t size); +uint64_t HighwayHash64_TargetAVX2(const HHKey key, const char* bytes, + const uint64_t size); + +#ifdef __cplusplus +} +#endif + +#endif // HIGHWAYHASH_HIGHWAYHASH_C_BINDINGS_H_ diff --git a/contrib/libs/highwayhash/highwayhash/compiler_specific.h b/contrib/libs/highwayhash/highwayhash/compiler_specific.h index 4789f9a610..958cb6849e 100644 --- a/contrib/libs/highwayhash/highwayhash/compiler_specific.h +++ b/contrib/libs/highwayhash/highwayhash/compiler_specific.h @@ -1,90 +1,90 @@ -// Copyright 2015 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_COMPILER_SPECIFIC_H_ -#define HIGHWAYHASH_COMPILER_SPECIFIC_H_ - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -// Compiler - -// #if is shorter and safer than #ifdef. *_VERSION are zero if not detected, -// otherwise 100 * major + minor version. Note that other packages check for -// #ifdef COMPILER_MSVC, so we cannot use that same name. - -#ifdef _MSC_VER -#define HH_MSC_VERSION _MSC_VER -#else -#define HH_MSC_VERSION 0 -#endif - -#ifdef __GNUC__ -#define HH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) -#else -#define HH_GCC_VERSION 0 -#endif - -#ifdef __clang__ -#define HH_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__) -#else -#define HH_CLANG_VERSION 0 -#endif - -//----------------------------------------------------------------------------- - -#if HH_GCC_VERSION && HH_GCC_VERSION < 408 -#define HH_ALIGNAS(multiple) __attribute__((aligned(multiple))) -#else -#define HH_ALIGNAS(multiple) alignas(multiple) // C++11 -#endif - -#if HH_MSC_VERSION -#define HH_RESTRICT __restrict -#elif HH_GCC_VERSION -#define HH_RESTRICT __restrict__ -#else -#define HH_RESTRICT -#endif - -#if HH_MSC_VERSION -#define HH_INLINE __forceinline -#define HH_NOINLINE __declspec(noinline) -#else -#define HH_INLINE inline -#define HH_NOINLINE __attribute__((noinline)) -#endif - -#if HH_MSC_VERSION -// Unsupported, __assume is not the same. -#define HH_LIKELY(expr) expr -#define HH_UNLIKELY(expr) expr -#else -#define HH_LIKELY(expr) __builtin_expect(!!(expr), 1) -#define HH_UNLIKELY(expr) __builtin_expect(!!(expr), 0) -#endif - -#if HH_MSC_VERSION -#include <intrin.h> -#pragma intrinsic(_ReadWriteBarrier) -#define HH_COMPILER_FENCE _ReadWriteBarrier() -#elif HH_GCC_VERSION -#define HH_COMPILER_FENCE asm volatile("" : : : "memory") -#else -#define HH_COMPILER_FENCE -#endif - -#endif // HIGHWAYHASH_COMPILER_SPECIFIC_H_ +// Copyright 2015 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_COMPILER_SPECIFIC_H_ +#define HIGHWAYHASH_COMPILER_SPECIFIC_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. 
+ +// Compiler + +// #if is shorter and safer than #ifdef. *_VERSION are zero if not detected, +// otherwise 100 * major + minor version. Note that other packages check for +// #ifdef COMPILER_MSVC, so we cannot use that same name. + +#ifdef _MSC_VER +#define HH_MSC_VERSION _MSC_VER +#else +#define HH_MSC_VERSION 0 +#endif + +#ifdef __GNUC__ +#define HH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +#else +#define HH_GCC_VERSION 0 +#endif + +#ifdef __clang__ +#define HH_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__) +#else +#define HH_CLANG_VERSION 0 +#endif + +//----------------------------------------------------------------------------- + +#if HH_GCC_VERSION && HH_GCC_VERSION < 408 +#define HH_ALIGNAS(multiple) __attribute__((aligned(multiple))) +#else +#define HH_ALIGNAS(multiple) alignas(multiple) // C++11 +#endif + +#if HH_MSC_VERSION +#define HH_RESTRICT __restrict +#elif HH_GCC_VERSION +#define HH_RESTRICT __restrict__ +#else +#define HH_RESTRICT +#endif + +#if HH_MSC_VERSION +#define HH_INLINE __forceinline +#define HH_NOINLINE __declspec(noinline) +#else +#define HH_INLINE inline +#define HH_NOINLINE __attribute__((noinline)) +#endif + +#if HH_MSC_VERSION +// Unsupported, __assume is not the same. +#define HH_LIKELY(expr) expr +#define HH_UNLIKELY(expr) expr +#else +#define HH_LIKELY(expr) __builtin_expect(!!(expr), 1) +#define HH_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#endif + +#if HH_MSC_VERSION +#include <intrin.h> +#pragma intrinsic(_ReadWriteBarrier) +#define HH_COMPILER_FENCE _ReadWriteBarrier() +#elif HH_GCC_VERSION +#define HH_COMPILER_FENCE asm volatile("" : : : "memory") +#else +#define HH_COMPILER_FENCE +#endif + +#endif // HIGHWAYHASH_COMPILER_SPECIFIC_H_ diff --git a/contrib/libs/highwayhash/highwayhash/data_parallel.h b/contrib/libs/highwayhash/highwayhash/data_parallel.h index d72afc953e..72d6a47e24 100644 --- a/contrib/libs/highwayhash/highwayhash/data_parallel.h +++ b/contrib/libs/highwayhash/highwayhash/data_parallel.h @@ -1,341 +1,341 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_DATA_PARALLEL_H_ -#define HIGHWAYHASH_DATA_PARALLEL_H_ - -// Portable C++11 alternative to OpenMP for data-parallel computations: -// provides low-overhead ThreadPool, plus PerThread with support for reduction. - -#include <stdio.h> -#include <algorithm> // find_if -#include <atomic> -#include <condition_variable> //NOLINT -#include <cstdint> -#include <cstdlib> -#include <functional> -#include <memory> -#include <mutex> //NOLINT -#include <thread> //NOLINT -#include <utility> -#include <vector> - -#define DATA_PARALLEL_CHECK(condition) \ - while (!(condition)) { \ - printf("data_parallel check failed at line %d\n", __LINE__); \ - abort(); \ - } - -namespace highwayhash { - -// Highly scalable thread pool, especially suitable for data-parallel -// computations in the fork-join model, where clients need to know when all -// tasks have completed. 
-// -// Thread pools usually store small numbers of heterogeneous tasks in a queue. -// When tasks are identical or differ only by an integer input parameter, it is -// much faster to store just one function of an integer parameter and call it -// for each value. -// -// This thread pool can efficiently load-balance millions of tasks using an -// atomic counter, thus avoiding per-task syscalls. With 48 hyperthreads and -// 1M tasks that add to an atomic counter, overall runtime is 10-20x higher -// when using std::async, and up to 200x for a queue-based ThreadPool. -// -// Usage: -// ThreadPool pool; -// pool.Run(0, 1000000, [](const int i) { Func1(i); }); -// // When Run returns, all of its tasks have finished. -// -// pool.RunTasks({Func2, Func3, Func4}); -// // The destructor waits until all worker threads have exited cleanly. -class ThreadPool { - public: - // Starts the given number of worker threads and blocks until they are ready. - // "num_threads" defaults to one per hyperthread. - explicit ThreadPool( - const int num_threads = std::thread::hardware_concurrency()) - : num_threads_(num_threads) { - DATA_PARALLEL_CHECK(num_threads_ > 0); - threads_.reserve(num_threads_); - for (int i = 0; i < num_threads_; ++i) { - threads_.emplace_back(ThreadFunc, this); - } - - padding_[0] = 0; // avoid unused member warning. - - WorkersReadyBarrier(); - } - - ThreadPool(const ThreadPool&) = delete; - ThreadPool& operator=(const ThreadPool&) = delete; - - // Waits for all threads to exit. - ~ThreadPool() { - StartWorkers(kWorkerExit); - - for (std::thread& thread : threads_) { - thread.join(); - } - } - - // Runs func(i) on worker thread(s) for every i in [begin, end). - // Not thread-safe - no two calls to Run and RunTasks may overlap. - // Subsequent calls will reuse the same threads. - // - // Precondition: 0 <= begin <= end. - template <class Func> - void Run(const int begin, const int end, const Func& func) { - DATA_PARALLEL_CHECK(0 <= begin && begin <= end); - if (begin == end) { - return; - } - const WorkerCommand worker_command = (WorkerCommand(end) << 32) + begin; - // Ensure the inputs do not result in a reserved command. - DATA_PARALLEL_CHECK(worker_command != kWorkerWait); - DATA_PARALLEL_CHECK(worker_command != kWorkerExit); - - // If Func is large (many captures), this will allocate memory, but it is - // still slower to use a std::ref wrapper. - task_ = func; - num_reserved_.store(0); - - StartWorkers(worker_command); - WorkersReadyBarrier(); - } - - // Runs each task (closure, typically a lambda function) on worker thread(s). - // Not thread-safe - no two calls to Run and RunTasks may overlap. - // Subsequent calls will reuse the same threads. - // - // This is a more conventional interface for heterogeneous tasks that may be - // independent/unrelated. - void RunTasks(const std::vector<std::function<void(void)>>& tasks) { - Run(0, static_cast<int>(tasks.size()), - [&tasks](const int i) { tasks[i](); }); - } - - // Statically (and deterministically) splits [begin, end) into ranges and - // calls "func" for each of them. Useful when "func" involves some overhead - // (e.g. for PerThread::Get or random seeding) that should be amortized over - // a range of values. "func" is void(int chunk, uint32_t begin, uint32_t end). - template <class Func> - void RunRanges(const uint32_t begin, const uint32_t end, const Func& func) { - const uint32_t length = end - begin; - - // Use constant rather than num_threads_ for machine-independent splitting.
- const uint32_t chunk = std::max(1U, (length + 127) / 128); - std::vector<std::pair<uint32_t, uint32_t>> ranges; // begin/end - ranges.reserve(length / chunk + 1); - for (uint32_t i = 0; i < length; i += chunk) { - ranges.emplace_back(begin + i, begin + std::min(i + chunk, length)); - } - - Run(0, static_cast<int>(ranges.size()), [&ranges, func](const int i) { - func(i, ranges[i].first, ranges[i].second); - }); - } - - private: - // After construction and between calls to Run, workers are "ready", i.e. - // waiting on worker_start_cv_. They are "started" by sending a "command" - // and notifying all worker_start_cv_ waiters. (That is why all workers - // must be ready/waiting - otherwise, the notification will not reach all of - // them and the main thread waits in vain for them to report readiness.) - using WorkerCommand = uint64_t; - - // Special values; all others encode the begin/end parameters. - static constexpr WorkerCommand kWorkerWait = 0; - static constexpr WorkerCommand kWorkerExit = ~0ULL; - - void WorkersReadyBarrier() { - std::unique_lock<std::mutex> lock(mutex_); - workers_ready_cv_.wait(lock, - [this]() { return workers_ready_ == num_threads_; }); - workers_ready_ = 0; - } - - // Precondition: all workers are ready. - void StartWorkers(const WorkerCommand worker_command) { - std::unique_lock<std::mutex> lock(mutex_); - worker_start_command_ = worker_command; - // Workers will need this lock, so release it before they wake up. - lock.unlock(); - worker_start_cv_.notify_all(); - } - - // Attempts to reserve and perform some work from the global range of tasks, - // which is encoded within "command". Returns after all tasks are reserved. - static void RunRange(ThreadPool* self, const WorkerCommand command) { - const int begin = command & 0xFFFFFFFF; - const int end = command >> 32; - const int num_tasks = end - begin; - - // OpenMP introduced several "schedule" strategies: - // "single" (static assignment of exactly one chunk per thread): slower. - // "dynamic" (allocates k tasks at a time): competitive for well-chosen k. - // "guided" (allocates k tasks, decreases k): computing k = remaining/n - // is faster than halving k each iteration. We prefer this strategy - // because it avoids user-specified parameters. - - for (;;) { - const int num_reserved = self->num_reserved_.load(); - const int num_remaining = num_tasks - num_reserved; - const int my_size = std::max(num_remaining / (self->num_threads_ * 2), 1); - const int my_begin = begin + self->num_reserved_.fetch_add(my_size); - const int my_end = std::min(my_begin + my_size, begin + num_tasks); - // Another thread already reserved the last task. - if (my_begin >= my_end) { - break; - } - for (int i = my_begin; i < my_end; ++i) { - self->task_(i); - } - } - } - - static void ThreadFunc(ThreadPool* self) { - // Until kWorkerExit command received: - for (;;) { - std::unique_lock<std::mutex> lock(self->mutex_); - // Notify main thread that this thread is ready. - if (++self->workers_ready_ == self->num_threads_) { - self->workers_ready_cv_.notify_one(); - } - RESUME_WAIT: - // Wait for a command. - self->worker_start_cv_.wait(lock); - const WorkerCommand command = self->worker_start_command_; - switch (command) { - case kWorkerWait: // spurious wakeup: - goto RESUME_WAIT; // lock still held, avoid incrementing ready. - case kWorkerExit: - return; // exits thread - } - - lock.unlock(); - RunRange(self, command); - } - } - - const int num_threads_; - - // Unmodified after ctor, but cannot be const because we call thread::join(). 
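// A trace of the chunk sizes the guided strategy in RunRange above hands out
// for 1000 tasks and 4 threads, assuming reservations happen strictly in turn
// (illustrative only; the real interleaving is nondeterministic):
#include <algorithm>
#include <cstdio>

int main() {
  const int num_tasks = 1000;
  const int num_threads = 4;
  int reserved = 0;
  while (reserved < num_tasks) {
    const int my_size =
        std::max((num_tasks - reserved) / (num_threads * 2), 1);
    printf("%d ", my_size);  // 125 109 95 ... 2 1 1: large first, tiny tail.
    reserved += my_size;
  }
  printf("\n");
  return 0;
}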
- std::vector<std::thread> threads_; - - std::mutex mutex_; // guards both cv and their variables. - std::condition_variable workers_ready_cv_; - int workers_ready_ = 0; - std::condition_variable worker_start_cv_; - WorkerCommand worker_start_command_; - - // Written by main thread, read by workers (after mutex lock/unlock). - std::function<void(int)> task_; - - // Updated by workers; alignment/padding avoids false sharing. - alignas(64) std::atomic<int> num_reserved_{0}; - int padding_[15]; -}; - -// Thread-local storage with support for reduction (combining into one result). -// The "T" type must be unique to the call site because the list of threads' -// copies is a static member. (With knowledge of the underlying threads, we -// could eliminate this list and T allocations, but that is difficult to -// arrange and we prefer this to be usable independently of ThreadPool.) -// -// Usage: -// for (int i = 0; i < N; ++i) { -// // in each thread: -// T& my_copy = PerThread<T>::Get(); -// my_copy.Modify(); -// -// // single-threaded: -// T& combined = PerThread<T>::Reduce(); -// Use(combined); -// PerThread<T>::Destroy(); -// } -// -// T is duck-typed and implements the following interface: -// -// // Returns true if T is default-initialized or Destroy was called without -// // any subsequent re-initialization. -// bool IsNull() const; -// -// // Releases any resources. Postcondition: IsNull() == true. -// void Destroy(); -// -// // Merges in data from "victim". Precondition: !IsNull() && !victim.IsNull(). -// void Assimilate(const T& victim); -template <class T> -class PerThread { - public: - // Returns reference to this thread's T instance (dynamically allocated, - // so its address is unique). Callers are responsible for any initialization - // beyond the default ctor. - static T& Get() { - static thread_local T* t; - if (t == nullptr) { - t = new T; - static std::mutex mutex; - std::lock_guard<std::mutex> lock(mutex); - Threads().push_back(t); - } - return *t; - } - - // Returns vector of all per-thread T. Used inside Reduce() or by clients - // that require direct access to T instead of Assimilating them. - // Function wrapper avoids separate static member variable definition. - static std::vector<T*>& Threads() { - static std::vector<T*> threads; - return threads; - } - - // Returns the first non-null T after assimilating all other threads' T - // into it. Precondition: at least one non-null T exists (caller must have - // called Get() and initialized the result). - static T& Reduce() { - std::vector<T*>& threads = Threads(); - - // Find first non-null T - const auto it = std::find_if(threads.begin(), threads.end(), - [](const T* t) { return !t->IsNull(); }); - if (it == threads.end()) { - abort(); - } - T* const first = *it; - - for (const T* t : threads) { - if (t != first && !t->IsNull()) { - first->Assimilate(*t); - } - } - return *first; - } - - // Calls each thread's T::Destroy to release resources and/or prepare for - // reuse by the same threads/ThreadPool. Note that all T remain allocated - // (we need thread-independent pointers for iterating over each thread's T, - // and deleting them would leave dangling pointers in each thread, which is - // unacceptable because the same thread may call Get() again later.) - static void Destroy() { - for (T* t : Threads()) { - t->Destroy(); - } - } -}; - -} // namespace highwayhash - -#endif // HIGHWAYHASH_DATA_PARALLEL_H_ +// Copyright 2017 Google Inc. All Rights Reserved. 
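// A payload sketch satisfying the duck-typed interface PerThread<T> documents
// above (ThreadSum is an illustrative name, not part of the library): each
// worker accumulates into its own copy via Get(), and Reduce() merges the
// copies by calling Assimilate.
#include <cstdint>

struct ThreadSum {
  bool IsNull() const { return !initialized; }
  void Destroy() {
    sum = 0;
    initialized = false;
  }
  void Assimilate(const ThreadSum& victim) { sum += victim.sum; }

  void Add(const uint64_t x) {
    initialized = true;
    sum += x;
  }

  uint64_t sum = 0;
  bool initialized = false;
};
// Usage inside pool.Run: PerThread<ThreadSum>::Get().Add(value); afterwards,
// single-threaded: const uint64_t total = PerThread<ThreadSum>::Reduce().sum;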
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_DATA_PARALLEL_H_ +#define HIGHWAYHASH_DATA_PARALLEL_H_ + +// Portable C++11 alternative to OpenMP for data-parallel computations: +// provides low-overhead ThreadPool, plus PerThread with support for reduction. + +#include <stdio.h> +#include <algorithm> // find_if +#include <atomic> +#include <condition_variable> //NOLINT +#include <cstdint> +#include <cstdlib> +#include <functional> +#include <memory> +#include <mutex> //NOLINT +#include <thread> //NOLINT +#include <utility> +#include <vector> + +#define DATA_PARALLEL_CHECK(condition) \ + while (!(condition)) { \ + printf("data_parallel check failed at line %d\n", __LINE__); \ + abort(); \ + } + +namespace highwayhash { + +// Highly scalable thread pool, especially suitable for data-parallel +// computations in the fork-join model, where clients need to know when all +// tasks have completed. +// +// Thread pools usually store small numbers of heterogeneous tasks in a queue. +// When tasks are identical or differ only by an integer input parameter, it is +// much faster to store just one function of an integer parameter and call it +// for each value. +// +// This thread pool can efficiently load-balance millions of tasks using an +// atomic counter, thus avoiding per-task syscalls. With 48 hyperthreads and +// 1M tasks that add to an atomic counter, overall runtime is 10-20x higher +// when using std::async, and up to 200x for a queue-based ThreadPool. +// +// Usage: +// ThreadPool pool; +// pool.Run(0, 1000000, [](const int i) { Func1(i); }); +// // When Run returns, all of its tasks have finished. +// +// pool.RunTasks({Func2, Func3, Func4}); +// // The destructor waits until all worker threads have exited cleanly. +class ThreadPool { + public: + // Starts the given number of worker threads and blocks until they are ready. + // "num_threads" defaults to one per hyperthread. + explicit ThreadPool( + const int num_threads = std::thread::hardware_concurrency()) + : num_threads_(num_threads) { + DATA_PARALLEL_CHECK(num_threads_ > 0); + threads_.reserve(num_threads_); + for (int i = 0; i < num_threads_; ++i) { + threads_.emplace_back(ThreadFunc, this); + } + + padding_[0] = 0; // avoid unused member warning. + + WorkersReadyBarrier(); + } + + ThreadPool(const ThreadPool&) = delete; + ThreadPool& operator&(const ThreadPool&) = delete; + + // Waits for all threads to exit. + ~ThreadPool() { + StartWorkers(kWorkerExit); + + for (std::thread& thread : threads_) { + thread.join(); + } + } + + // Runs func(i) on worker thread(s) for every i in [begin, end). + // Not thread-safe - no two calls to Run and RunTasks may overlap. + // Subsequent calls will reuse the same threads. + // + // Precondition: 0 <= begin <= end. 
+ template <class Func> + void Run(const int begin, const int end, const Func& func) { + DATA_PARALLEL_CHECK(0 <= begin && begin <= end); + if (begin == end) { + return; + } + const WorkerCommand worker_command = (WorkerCommand(end) << 32) + begin; + // Ensure the inputs do not result in a reserved command. + DATA_PARALLEL_CHECK(worker_command != kWorkerWait); + DATA_PARALLEL_CHECK(worker_command != kWorkerExit); + + // If Func is large (many captures), this will allocate memory, but it is + // still slower to use a std::ref wrapper. + task_ = func; + num_reserved_.store(0); + + StartWorkers(worker_command); + WorkersReadyBarrier(); + } + + // Runs each task (closure, typically a lambda function) on worker thread(s). + // Not thread-safe - no two calls to Run and RunTasks may overlap. + // Subsequent calls will reuse the same threads. + // + // This is a more conventional interface for heterogeneous tasks that may be + // independent/unrelated. + void RunTasks(const std::vector<std::function<void(void)>>& tasks) { + Run(0, static_cast<int>(tasks.size()), + [&tasks](const int i) { tasks[i](); }); + } + + // Statically (and deterministically) splits [begin, end) into ranges and + // calls "func" for each of them. Useful when "func" involves some overhead + // (e.g. for PerThread::Get or random seeding) that should be amortized over + // a range of values. "func" is void(int chunk, uint32_t begin, uint32_t end). + template <class Func> + void RunRanges(const uint32_t begin, const uint32_t end, const Func& func) { + const uint32_t length = end - begin; + + // Use constant rather than num_threads_ for machine-independent splitting. + const uint32_t chunk = std::max(1U, (length + 127) / 128); + std::vector<std::pair<uint32_t, uint32_t>> ranges; // begin/end + ranges.reserve(length / chunk + 1); + for (uint32_t i = 0; i < length; i += chunk) { + ranges.emplace_back(begin + i, begin + std::min(i + chunk, length)); + } + + Run(0, static_cast<int>(ranges.size()), [&ranges, func](const int i) { + func(i, ranges[i].first, ranges[i].second); + }); + } + + private: + // After construction and between calls to Run, workers are "ready", i.e. + // waiting on worker_start_cv_. They are "started" by sending a "command" + // and notifying all worker_start_cv_ waiters. (That is why all workers + // must be ready/waiting - otherwise, the notification will not reach all of + // them and the main thread waits in vain for them to report readiness.) + using WorkerCommand = uint64_t; + + // Special values; all others encode the begin/end parameters. + static constexpr WorkerCommand kWorkerWait = 0; + static constexpr WorkerCommand kWorkerExit = ~0ULL; + + void WorkersReadyBarrier() { + std::unique_lock<std::mutex> lock(mutex_); + workers_ready_cv_.wait(lock, + [this]() { return workers_ready_ == num_threads_; }); + workers_ready_ = 0; + } + + // Precondition: all workers are ready. + void StartWorkers(const WorkerCommand worker_command) { + std::unique_lock<std::mutex> lock(mutex_); + worker_start_command_ = worker_command; + // Workers will need this lock, so release it before they wake up. + lock.unlock(); + worker_start_cv_.notify_all(); + } + + // Attempts to reserve and perform some work from the global range of tasks, + // which is encoded within "command". Returns after all tasks are reserved. 
+ static void RunRange(ThreadPool* self, const WorkerCommand command) { + const int begin = command & 0xFFFFFFFF; + const int end = command >> 32; + const int num_tasks = end - begin; + + // OpenMP introduced several "schedule" strategies: + // "single" (static assignment of exactly one chunk per thread): slower. + // "dynamic" (allocates k tasks at a time): competitive for well-chosen k. + // "guided" (allocates k tasks, decreases k): computing k = remaining/n + // is faster than halving k each iteration. We prefer this strategy + // because it avoids user-specified parameters. + + for (;;) { + const int num_reserved = self->num_reserved_.load(); + const int num_remaining = num_tasks - num_reserved; + const int my_size = std::max(num_remaining / (self->num_threads_ * 2), 1); + const int my_begin = begin + self->num_reserved_.fetch_add(my_size); + const int my_end = std::min(my_begin + my_size, begin + num_tasks); + // Another thread already reserved the last task. + if (my_begin >= my_end) { + break; + } + for (int i = my_begin; i < my_end; ++i) { + self->task_(i); + } + } + } + + static void ThreadFunc(ThreadPool* self) { + // Until kWorkerExit command received: + for (;;) { + std::unique_lock<std::mutex> lock(self->mutex_); + // Notify main thread that this thread is ready. + if (++self->workers_ready_ == self->num_threads_) { + self->workers_ready_cv_.notify_one(); + } + RESUME_WAIT: + // Wait for a command. + self->worker_start_cv_.wait(lock); + const WorkerCommand command = self->worker_start_command_; + switch (command) { + case kWorkerWait: // spurious wakeup: + goto RESUME_WAIT; // lock still held, avoid incrementing ready. + case kWorkerExit: + return; // exits thread + } + + lock.unlock(); + RunRange(self, command); + } + } + + const int num_threads_; + + // Unmodified after ctor, but cannot be const because we call thread::join(). + std::vector<std::thread> threads_; + + std::mutex mutex_; // guards both cv and their variables. + std::condition_variable workers_ready_cv_; + int workers_ready_ = 0; + std::condition_variable worker_start_cv_; + WorkerCommand worker_start_command_; + + // Written by main thread, read by workers (after mutex lock/unlock). + std::function<void(int)> task_; + + // Updated by workers; alignment/padding avoids false sharing. + alignas(64) std::atomic<int> num_reserved_{0}; + int padding_[15]; +}; + +// Thread-local storage with support for reduction (combining into one result). +// The "T" type must be unique to the call site because the list of threads' +// copies is a static member. (With knowledge of the underlying threads, we +// could eliminate this list and T allocations, but that is difficult to +// arrange and we prefer this to be usable independently of ThreadPool.) +// +// Usage: +// for (int i = 0; i < N; ++i) { +// // in each thread: +// T& my_copy = PerThread<T>::Get(); +// my_copy.Modify(); +// +// // single-threaded: +// T& combined = PerThread<T>::Reduce(); +// Use(combined); +// PerThread<T>::Destroy(); +// } +// +// T is duck-typed and implements the following interface: +// +// // Returns true if T is default-initialized or Destroy was called without +// // any subsequent re-initialization. +// bool IsNull() const; +// +// // Releases any resources. Postcondition: IsNull() == true. +// void Destroy(); +// +// // Merges in data from "victim". Precondition: !IsNull() && !victim.IsNull(). 
+// void Assimilate(const T& victim); +template <class T> +class PerThread { + public: + // Returns reference to this thread's T instance (dynamically allocated, + // so its address is unique). Callers are responsible for any initialization + // beyond the default ctor. + static T& Get() { + static thread_local T* t; + if (t == nullptr) { + t = new T; + static std::mutex mutex; + std::lock_guard<std::mutex> lock(mutex); + Threads().push_back(t); + } + return *t; + } + + // Returns vector of all per-thread T. Used inside Reduce() or by clients + // that require direct access to T instead of Assimilating them. + // Function wrapper avoids separate static member variable definition. + static std::vector<T*>& Threads() { + static std::vector<T*> threads; + return threads; + } + + // Returns the first non-null T after assimilating all other threads' T + // into it. Precondition: at least one non-null T exists (caller must have + // called Get() and initialized the result). + static T& Reduce() { + std::vector<T*>& threads = Threads(); + + // Find first non-null T + const auto it = std::find_if(threads.begin(), threads.end(), + [](const T* t) { return !t->IsNull(); }); + if (it == threads.end()) { + abort(); + } + T* const first = *it; + + for (const T* t : threads) { + if (t != first && !t->IsNull()) { + first->Assimilate(*t); + } + } + return *first; + } + + // Calls each thread's T::Destroy to release resources and/or prepare for + // reuse by the same threads/ThreadPool. Note that all T remain allocated + // (we need thread-independent pointers for iterating over each thread's T, + // and deleting them would leave dangling pointers in each thread, which is + // unacceptable because the same thread may call Get() again later.) + static void Destroy() { + for (T* t : Threads()) { + t->Destroy(); + } + } +}; + +} // namespace highwayhash + +#endif // HIGHWAYHASH_DATA_PARALLEL_H_ diff --git a/contrib/libs/highwayhash/highwayhash/data_parallel_benchmark.cc b/contrib/libs/highwayhash/highwayhash/data_parallel_benchmark.cc index ddc88b067f..fafdd93dbd 100644 --- a/contrib/libs/highwayhash/highwayhash/data_parallel_benchmark.cc +++ b/contrib/libs/highwayhash/highwayhash/data_parallel_benchmark.cc @@ -1,151 +1,151 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <cmath> -#include <cstdio> -#include <future> //NOLINT -#include <set> -#include "testing/base/public/gunit.h" -#include "highwayhash/arch_specific.h" -#include "highwayhash/data_parallel.h" -#include "thread/threadpool.h" - -namespace highwayhash { -namespace { - -constexpr int kBenchmarkTasks = 1000000; - -// Returns elapsed time [nanoseconds] for std::async. 
-double BenchmarkAsync(uint64_t* total) { - const base::Time t0 = base::Now(); - std::atomic<uint64_t> sum1{0}; - std::atomic<uint64_t> sum2{0}; - - std::vector<std::future<void>> futures; - futures.reserve(kBenchmarkTasks); - for (int i = 0; i < kBenchmarkTasks; ++i) { - futures.push_back(std::async( - [&sum1, &sum2](const int i) { - sum1.fetch_add(i); - sum2.fetch_add(1); - }, - i)); - } - - for (auto& future : futures) { - future.get(); - } - - const base::Time t1 = base::Now(); - *total = sum1.load() + sum2.load(); - return base::ToDoubleNanoseconds(t1 - t0); -} - -// Returns elapsed time [nanoseconds] for (atomic) ThreadPool. -double BenchmarkPoolA(uint64_t* total) { - const base::Time t0 = base::Now(); - std::atomic<uint64_t> sum1{0}; - std::atomic<uint64_t> sum2{0}; - - ThreadPool pool; - pool.Run(0, kBenchmarkTasks, [&sum1, &sum2](const int i) { - sum1.fetch_add(i); - sum2.fetch_add(1); - }); - - const base::Time t1 = base::Now(); - *total = sum1.load() + sum2.load(); - return base::ToDoubleNanoseconds(t1 - t0); -} - -// Returns elapsed time [nanoseconds] for ::ThreadPool. -double BenchmarkPoolG(uint64_t* total) { - const base::Time t0 = base::Now(); - std::atomic<uint64_t> sum1{0}; - std::atomic<uint64_t> sum2{0}; - - { - ::ThreadPool pool(std::thread::hardware_concurrency()); - pool.StartWorkers(); - for (int i = 0; i < kBenchmarkTasks; ++i) { - pool.Schedule([&sum1, &sum2, i]() { - sum1.fetch_add(i); - sum2.fetch_add(1); - }); - } - } - - const base::Time t1 = base::Now(); - *total = sum1.load() + sum2.load(); - return base::ToDoubleNanoseconds(t1 - t0); -} - -// Compares ThreadPool speed to std::async and ::ThreadPool. -TEST(DataParallelTest, Benchmarks) { - uint64_t sum1, sum2, sum3; - const double async_ns = BenchmarkAsync(&sum1); - const double poolA_ns = BenchmarkPoolA(&sum2); - const double poolG_ns = BenchmarkPoolG(&sum3); - - printf("Async %11.0f ns\nPoolA %11.0f ns\nPoolG %11.0f ns\n", async_ns, - poolA_ns, poolG_ns); - // baseline 20x, 10x with asan or msan, 5x with tsan - EXPECT_GT(async_ns, poolA_ns * 4); - // baseline 200x, 180x with asan, 70x with msan, 50x with tsan. - EXPECT_GT(poolG_ns, poolA_ns * 20); - - // Should reach same result. - EXPECT_EQ(sum1, sum2); - EXPECT_EQ(sum2, sum3); -} - -// Ensures multiple hardware threads are used (decided by the OS scheduler). -TEST(DataParallelTest, TestApicIds) { - for (int num_threads = 1; num_threads <= std::thread::hardware_concurrency(); - ++num_threads) { - ThreadPool pool(num_threads); - - std::mutex mutex; - std::set<unsigned> ids; - double total = 0.0; - pool.Run(0, 2 * num_threads, [&mutex, &ids, &total](const int i) { - // Useless computations to keep the processor busy so that threads - // can't just reuse the same processor. - double sum = 0.0; - for (int rep = 0; rep < 900 * (i + 30); ++rep) { - sum += pow(rep, 0.5); - } - - mutex.lock(); - ids.insert(ApicId()); - total += sum; - mutex.unlock(); - }); - - // No core ID / APIC ID available - if (num_threads > 1 && ids.size() == 1) { - EXPECT_EQ(0, *ids.begin()); - } else { - // (The Linux scheduler doesn't use all available HTs, but the - // computations should at least keep most cores busy.) - EXPECT_GT(ids.size() + 2, num_threads / 4); - } - - // (Ensure the busy-work is not elided.) - EXPECT_GT(total, 1E4); - } -} - -} // namespace -} // namespace highwayhash +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cmath> +#include <cstdio> +#include <future> //NOLINT +#include <set> +#include "testing/base/public/gunit.h" +#include "highwayhash/arch_specific.h" +#include "highwayhash/data_parallel.h" +#include "thread/threadpool.h" + +namespace highwayhash { +namespace { + +constexpr int kBenchmarkTasks = 1000000; + +// Returns elapsed time [nanoseconds] for std::async. +double BenchmarkAsync(uint64_t* total) { + const base::Time t0 = base::Now(); + std::atomic<uint64_t> sum1{0}; + std::atomic<uint64_t> sum2{0}; + + std::vector<std::future<void>> futures; + futures.reserve(kBenchmarkTasks); + for (int i = 0; i < kBenchmarkTasks; ++i) { + futures.push_back(std::async( + [&sum1, &sum2](const int i) { + sum1.fetch_add(i); + sum2.fetch_add(1); + }, + i)); + } + + for (auto& future : futures) { + future.get(); + } + + const base::Time t1 = base::Now(); + *total = sum1.load() + sum2.load(); + return base::ToDoubleNanoseconds(t1 - t0); +} + +// Returns elapsed time [nanoseconds] for (atomic) ThreadPool. +double BenchmarkPoolA(uint64_t* total) { + const base::Time t0 = base::Now(); + std::atomic<uint64_t> sum1{0}; + std::atomic<uint64_t> sum2{0}; + + ThreadPool pool; + pool.Run(0, kBenchmarkTasks, [&sum1, &sum2](const int i) { + sum1.fetch_add(i); + sum2.fetch_add(1); + }); + + const base::Time t1 = base::Now(); + *total = sum1.load() + sum2.load(); + return base::ToDoubleNanoseconds(t1 - t0); +} + +// Returns elapsed time [nanoseconds] for ::ThreadPool. +double BenchmarkPoolG(uint64_t* total) { + const base::Time t0 = base::Now(); + std::atomic<uint64_t> sum1{0}; + std::atomic<uint64_t> sum2{0}; + + { + ::ThreadPool pool(std::thread::hardware_concurrency()); + pool.StartWorkers(); + for (int i = 0; i < kBenchmarkTasks; ++i) { + pool.Schedule([&sum1, &sum2, i]() { + sum1.fetch_add(i); + sum2.fetch_add(1); + }); + } + } + + const base::Time t1 = base::Now(); + *total = sum1.load() + sum2.load(); + return base::ToDoubleNanoseconds(t1 - t0); +} + +// Compares ThreadPool speed to std::async and ::ThreadPool. +TEST(DataParallelTest, Benchmarks) { + uint64_t sum1, sum2, sum3; + const double async_ns = BenchmarkAsync(&sum1); + const double poolA_ns = BenchmarkPoolA(&sum2); + const double poolG_ns = BenchmarkPoolG(&sum3); + + printf("Async %11.0f ns\nPoolA %11.0f ns\nPoolG %11.0f ns\n", async_ns, + poolA_ns, poolG_ns); + // baseline 20x, 10x with asan or msan, 5x with tsan + EXPECT_GT(async_ns, poolA_ns * 4); + // baseline 200x, 180x with asan, 70x with msan, 50x with tsan. + EXPECT_GT(poolG_ns, poolA_ns * 20); + + // Should reach same result. + EXPECT_EQ(sum1, sum2); + EXPECT_EQ(sum2, sum3); +} + +// Ensures multiple hardware threads are used (decided by the OS scheduler). +TEST(DataParallelTest, TestApicIds) { + for (int num_threads = 1; num_threads <= std::thread::hardware_concurrency(); + ++num_threads) { + ThreadPool pool(num_threads); + + std::mutex mutex; + std::set<unsigned> ids; + double total = 0.0; + pool.Run(0, 2 * num_threads, [&mutex, &ids, &total](const int i) { + // Useless computations to keep the processor busy so that threads + // can't just reuse the same processor. 
+ double sum = 0.0; + for (int rep = 0; rep < 900 * (i + 30); ++rep) { + sum += pow(rep, 0.5); + } + + mutex.lock(); + ids.insert(ApicId()); + total += sum; + mutex.unlock(); + }); + + // No core ID / APIC ID available + if (num_threads > 1 && ids.size() == 1) { + EXPECT_EQ(0, *ids.begin()); + } else { + // (The Linux scheduler doesn't use all available HTs, but the + // computations should at least keep most cores busy.) + EXPECT_GT(ids.size() + 2, num_threads / 4); + } + + // (Ensure the busy-work is not elided.) + EXPECT_GT(total, 1E4); + } +} + +} // namespace +} // namespace highwayhash diff --git a/contrib/libs/highwayhash/highwayhash/data_parallel_test.cc b/contrib/libs/highwayhash/highwayhash/data_parallel_test.cc index 2728b7d3ad..d733620099 100644 --- a/contrib/libs/highwayhash/highwayhash/data_parallel_test.cc +++ b/contrib/libs/highwayhash/highwayhash/data_parallel_test.cc @@ -1,175 +1,175 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <unistd.h> -#include <cstdint> - -#include "testing/base/public/gunit.h" -#include "highwayhash/data_parallel.h" - -namespace highwayhash { -namespace { - -int PopulationCount(uint64_t bits) { - int num_set = 0; - while (bits != 0) { - num_set += bits & 1; - bits >>= 1; - } - return num_set; -} - -std::atomic<int> func_counts{0}; - -void Func2() { - usleep(200000); - func_counts.fetch_add(4); -} - -void Func3() { - usleep(300000); - func_counts.fetch_add(16); -} - -void Func4() { - usleep(400000); - func_counts.fetch_add(256); -} - -// Exercises the RunTasks feature (running arbitrary tasks/closures) -TEST(DataParallelTest, TestRunTasks) { - ThreadPool pool(4); - pool.RunTasks({Func2, Func3, Func4}); - EXPECT_EQ(276, func_counts.load()); -} - -// Ensures task parameter is in bounds, every parameter is reached, -// pool can be reused (multiple consecutive Run calls), pool can be destroyed -// (joining with its threads). -TEST(DataParallelTest, TestPool) { - for (int num_threads = 1; num_threads <= 18; ++num_threads) { - ThreadPool pool(num_threads); - for (int num_tasks = 0; num_tasks < 32; ++num_tasks) { - std::vector<int> mementos(num_tasks, 0); - for (int begin = 0; begin < 32; ++begin) { - std::fill(mementos.begin(), mementos.end(), 0); - pool.Run(begin, begin + num_tasks, - [begin, num_tasks, &mementos](const int i) { - // Parameter is in the given range - EXPECT_GE(i, begin); - EXPECT_LT(i, begin + num_tasks); - - // Store mementos to be sure we visited each i. 
- mementos.at(i - begin) = 1000 + i; - }); - for (int i = begin; i < begin + num_tasks; ++i) { - EXPECT_EQ(1000 + i, mementos.at(i - begin)); - } - } - } - } -} - -TEST(DataParallelTest, TestRunRanges) { - for (int num_threads = 1; num_threads <= 18; ++num_threads) { - ThreadPool pool(num_threads); - for (int num_tasks = 0; num_tasks < 32; ++num_tasks) { - std::vector<int> mementos(num_tasks, 0); - for (int begin = 0; begin < 32; ++begin) { - std::fill(mementos.begin(), mementos.end(), 0); - pool.RunRanges(begin, begin + num_tasks, - [begin, num_tasks, &mementos](const int chunk, - const uint32_t my_begin, - const uint32_t my_end) { - for (uint32_t i = my_begin; i < my_end; ++i) { - // Parameter is in the given range - EXPECT_GE(i, begin); - EXPECT_LT(i, begin + num_tasks); - - // Store mementos to be sure we visited each i. - mementos.at(i - begin) = 1000 + i; - } - }); - for (int i = begin; i < begin + num_tasks; ++i) { - EXPECT_EQ(1000 + i, mementos.at(i - begin)); - } - } - } - } -} - -// Ensures each of N threads processes exactly 1 of N tasks, i.e. the -// work distribution is perfectly fair for small counts. -TEST(DataParallelTest, TestSmallAssignments) { - for (int num_threads = 1; num_threads <= 64; ++num_threads) { - ThreadPool pool(num_threads); - - std::atomic<int> counter{0}; - // (Avoid mutex because it may perturb the worker thread scheduling) - std::atomic<uint64_t> id_bits{0}; - - pool.Run(0, num_threads, [&counter, num_threads, &id_bits](const int i) { - const int id = counter.fetch_add(1); - EXPECT_LT(id, num_threads); - uint64_t bits = id_bits.load(std::memory_order_relaxed); - while (!id_bits.compare_exchange_weak(bits, bits | (1ULL << id))) { - } - }); - - const int num_participants = PopulationCount(id_bits.load()); - EXPECT_EQ(num_threads, num_participants); - } -} - -// Test payload for PerThread. -struct CheckUniqueIDs { - bool IsNull() const { return false; } - void Destroy() { id_bits = 0; } - void Assimilate(const CheckUniqueIDs& victim) { - // Cannot overlap because each PerThread has unique bits. - EXPECT_EQ(0, id_bits & victim.id_bits); - id_bits |= victim.id_bits; - } - - uint64_t id_bits = 0; -}; - -// Ensures each thread has a PerThread instance, that they are successfully -// combined/reduced into a single result, and that reuse is possible after -// Destroy(). -TEST(DataParallelTest, TestPerThread) { - // We use a uint64_t bit array for convenience => no more than 64 threads. - const int max_threads = std::min(64U, std::thread::hardware_concurrency()); - for (int num_threads = 1; num_threads <= max_threads; ++num_threads) { - ThreadPool pool(num_threads); - - std::atomic<int> counter{0}; - pool.Run(0, num_threads, [&counter, num_threads](const int i) { - const int id = counter.fetch_add(1); - EXPECT_LT(id, num_threads); - PerThread<CheckUniqueIDs>::Get().id_bits |= 1ULL << id; - }); - - // Verify each thread's bit is set. - const uint64_t all_bits = PerThread<CheckUniqueIDs>::Reduce().id_bits; - // Avoid shifting by 64 (undefined). - const uint64_t expected = - num_threads == 64 ? ~0ULL : (1ULL << num_threads) - 1; - EXPECT_EQ(expected, all_bits); - PerThread<CheckUniqueIDs>::Destroy(); - } -} - -} // namespace -} // namespace highwayhash +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <unistd.h> +#include <cstdint> + +#include "testing/base/public/gunit.h" +#include "highwayhash/data_parallel.h" + +namespace highwayhash { +namespace { + +int PopulationCount(uint64_t bits) { + int num_set = 0; + while (bits != 0) { + num_set += bits & 1; + bits >>= 1; + } + return num_set; +} + +std::atomic<int> func_counts{0}; + +void Func2() { + usleep(200000); + func_counts.fetch_add(4); +} + +void Func3() { + usleep(300000); + func_counts.fetch_add(16); +} + +void Func4() { + usleep(400000); + func_counts.fetch_add(256); +} + +// Exercises the RunTasks feature (running arbitrary tasks/closures) +TEST(DataParallelTest, TestRunTasks) { + ThreadPool pool(4); + pool.RunTasks({Func2, Func3, Func4}); + EXPECT_EQ(276, func_counts.load()); +} + +// Ensures task parameter is in bounds, every parameter is reached, +// pool can be reused (multiple consecutive Run calls), pool can be destroyed +// (joining with its threads). +TEST(DataParallelTest, TestPool) { + for (int num_threads = 1; num_threads <= 18; ++num_threads) { + ThreadPool pool(num_threads); + for (int num_tasks = 0; num_tasks < 32; ++num_tasks) { + std::vector<int> mementos(num_tasks, 0); + for (int begin = 0; begin < 32; ++begin) { + std::fill(mementos.begin(), mementos.end(), 0); + pool.Run(begin, begin + num_tasks, + [begin, num_tasks, &mementos](const int i) { + // Parameter is in the given range + EXPECT_GE(i, begin); + EXPECT_LT(i, begin + num_tasks); + + // Store mementos to be sure we visited each i. + mementos.at(i - begin) = 1000 + i; + }); + for (int i = begin; i < begin + num_tasks; ++i) { + EXPECT_EQ(1000 + i, mementos.at(i - begin)); + } + } + } + } +} + +TEST(DataParallelTest, TestRunRanges) { + for (int num_threads = 1; num_threads <= 18; ++num_threads) { + ThreadPool pool(num_threads); + for (int num_tasks = 0; num_tasks < 32; ++num_tasks) { + std::vector<int> mementos(num_tasks, 0); + for (int begin = 0; begin < 32; ++begin) { + std::fill(mementos.begin(), mementos.end(), 0); + pool.RunRanges(begin, begin + num_tasks, + [begin, num_tasks, &mementos](const int chunk, + const uint32_t my_begin, + const uint32_t my_end) { + for (uint32_t i = my_begin; i < my_end; ++i) { + // Parameter is in the given range + EXPECT_GE(i, begin); + EXPECT_LT(i, begin + num_tasks); + + // Store mementos to be sure we visited each i. + mementos.at(i - begin) = 1000 + i; + } + }); + for (int i = begin; i < begin + num_tasks; ++i) { + EXPECT_EQ(1000 + i, mementos.at(i - begin)); + } + } + } + } +} + +// Ensures each of N threads processes exactly 1 of N tasks, i.e. the +// work distribution is perfectly fair for small counts. 
+TEST(DataParallelTest, TestSmallAssignments) { + for (int num_threads = 1; num_threads <= 64; ++num_threads) { + ThreadPool pool(num_threads); + + std::atomic<int> counter{0}; + // (Avoid mutex because it may perturb the worker thread scheduling) + std::atomic<uint64_t> id_bits{0}; + + pool.Run(0, num_threads, [&counter, num_threads, &id_bits](const int i) { + const int id = counter.fetch_add(1); + EXPECT_LT(id, num_threads); + uint64_t bits = id_bits.load(std::memory_order_relaxed); + while (!id_bits.compare_exchange_weak(bits, bits | (1ULL << id))) { + } + }); + + const int num_participants = PopulationCount(id_bits.load()); + EXPECT_EQ(num_threads, num_participants); + } +} + +// Test payload for PerThread. +struct CheckUniqueIDs { + bool IsNull() const { return false; } + void Destroy() { id_bits = 0; } + void Assimilate(const CheckUniqueIDs& victim) { + // Cannot overlap because each PerThread has unique bits. + EXPECT_EQ(0, id_bits & victim.id_bits); + id_bits |= victim.id_bits; + } + + uint64_t id_bits = 0; +}; + +// Ensures each thread has a PerThread instance, that they are successfully +// combined/reduced into a single result, and that reuse is possible after +// Destroy(). +TEST(DataParallelTest, TestPerThread) { + // We use a uint64_t bit array for convenience => no more than 64 threads. + const int max_threads = std::min(64U, std::thread::hardware_concurrency()); + for (int num_threads = 1; num_threads <= max_threads; ++num_threads) { + ThreadPool pool(num_threads); + + std::atomic<int> counter{0}; + pool.Run(0, num_threads, [&counter, num_threads](const int i) { + const int id = counter.fetch_add(1); + EXPECT_LT(id, num_threads); + PerThread<CheckUniqueIDs>::Get().id_bits |= 1ULL << id; + }); + + // Verify each thread's bit is set. + const uint64_t all_bits = PerThread<CheckUniqueIDs>::Reduce().id_bits; + // Avoid shifting by 64 (undefined). + const uint64_t expected = + num_threads == 64 ? ~0ULL : (1ULL << num_threads) - 1; + EXPECT_EQ(expected, all_bits); + PerThread<CheckUniqueIDs>::Destroy(); + } +} + +} // namespace +} // namespace highwayhash diff --git a/contrib/libs/highwayhash/highwayhash/endianess.h b/contrib/libs/highwayhash/highwayhash/endianess.h index 776a02fa21..6c82d6e50c 100644 --- a/contrib/libs/highwayhash/highwayhash/endianess.h +++ b/contrib/libs/highwayhash/highwayhash/endianess.h @@ -1,108 +1,108 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_ENDIANESS_H_ -#define HIGHWAYHASH_ENDIANESS_H_ - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include <stdint.h> - -#if defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN) - - /* Someone has already included <endian.h> or equivalent. 
*/ - -#elif defined(__LITTLE_ENDIAN__) - -# define HH_IS_LITTLE_ENDIAN 1 -# define HH_IS_BIG_ENDIAN 0 -# ifdef __BIG_ENDIAN__ -# error "Platform is both little and big endian?" -# endif - -#elif defined(__BIG_ENDIAN__) - -# define HH_IS_LITTLE_ENDIAN 0 -# define HH_IS_BIG_ENDIAN 1 - -#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \ - defined(__ORDER_LITTLE_ENDIAN__) - -# define HH_IS_LITTLE_ENDIAN (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -# define HH_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) - -#elif defined(__linux__) || defined(__CYGWIN__) || defined( __GNUC__ ) || \ - defined( __GNU_LIBRARY__ ) - -# include <endian.h> - -#elif defined(__OpenBSD__) || defined(__NetBSD__) || defined(__FreeBSD__) || \ - defined(__DragonFly__) - -# include <sys/endian.h> - -#elif defined(_WIN32) - -#define HH_IS_LITTLE_ENDIAN 1 -#define HH_IS_BIG_ENDIAN 0 - -#else - -# error "Unsupported platform. Cannot determine byte order." - -#endif - - -#ifndef HH_IS_LITTLE_ENDIAN -# define HH_IS_LITTLE_ENDIAN (BYTE_ORDER == LITTLE_ENDIAN) -# define HH_IS_BIG_ENDIAN (BYTE_ORDER == BIG_ENDIAN) -#endif - - -namespace highwayhash { - -#if HH_IS_LITTLE_ENDIAN - -static inline uint32_t le32_from_host(uint32_t x) { return x; } -static inline uint32_t host_from_le32(uint32_t x) { return x; } -static inline uint64_t le64_from_host(uint64_t x) { return x; } -static inline uint64_t host_from_le64(uint64_t x) { return x; } - -#elif !HH_IS_BIG_ENDIAN - -# error "Unsupported byte order." - -#elif defined(_WIN16) || defined(_WIN32) || defined(_WIN64) - -#include <intrin.h> -static inline uint32_t host_from_le32(uint32_t x) { return _byteswap_ulong(x); } -static inline uint32_t le32_from_host(uint32_t x) { return _byteswap_ulong(x); } -static inline uint64_t host_from_le64(uint64_t x) { return _byteswap_uint64(x);} -static inline uint64_t le64_from_host(uint64_t x) { return _byteswap_uint64(x);} - -#else - -static inline uint32_t host_from_le32(uint32_t x) {return __builtin_bswap32(x);} -static inline uint32_t le32_from_host(uint32_t x) {return __builtin_bswap32(x);} -static inline uint64_t host_from_le64(uint64_t x) {return __builtin_bswap64(x);} -static inline uint64_t le64_from_host(uint64_t x) {return __builtin_bswap64(x);} - -#endif - -} // namespace highwayhash - -#endif // HIGHWAYHASH_ENDIANESS_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_ENDIANESS_H_ +#define HIGHWAYHASH_ENDIANESS_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include <stdint.h> + +#if defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN) + + /* Someone has already included <endian.h> or equivalent. 
*/ + +#elif defined(__LITTLE_ENDIAN__) + +# define HH_IS_LITTLE_ENDIAN 1 +# define HH_IS_BIG_ENDIAN 0 +# ifdef __BIG_ENDIAN__ +# error "Platform is both little and big endian?" +# endif + +#elif defined(__BIG_ENDIAN__) + +# define HH_IS_LITTLE_ENDIAN 0 +# define HH_IS_BIG_ENDIAN 1 + +#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \ + defined(__ORDER_LITTLE_ENDIAN__) + +# define HH_IS_LITTLE_ENDIAN (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define HH_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + +#elif defined(__linux__) || defined(__CYGWIN__) || defined( __GNUC__ ) || \ + defined( __GNU_LIBRARY__ ) + +# include <endian.h> + +#elif defined(__OpenBSD__) || defined(__NetBSD__) || defined(__FreeBSD__) || \ + defined(__DragonFly__) + +# include <sys/endian.h> + +#elif defined(_WIN32) + +#define HH_IS_LITTLE_ENDIAN 1 +#define HH_IS_BIG_ENDIAN 0 + +#else + +# error "Unsupported platform. Cannot determine byte order." + +#endif + + +#ifndef HH_IS_LITTLE_ENDIAN +# define HH_IS_LITTLE_ENDIAN (BYTE_ORDER == LITTLE_ENDIAN) +# define HH_IS_BIG_ENDIAN (BYTE_ORDER == BIG_ENDIAN) +#endif + + +namespace highwayhash { + +#if HH_IS_LITTLE_ENDIAN + +static inline uint32_t le32_from_host(uint32_t x) { return x; } +static inline uint32_t host_from_le32(uint32_t x) { return x; } +static inline uint64_t le64_from_host(uint64_t x) { return x; } +static inline uint64_t host_from_le64(uint64_t x) { return x; } + +#elif !HH_IS_BIG_ENDIAN + +# error "Unsupported byte order." + +#elif defined(_WIN16) || defined(_WIN32) || defined(_WIN64) + +#include <intrin.h> +static inline uint32_t host_from_le32(uint32_t x) { return _byteswap_ulong(x); } +static inline uint32_t le32_from_host(uint32_t x) { return _byteswap_ulong(x); } +static inline uint64_t host_from_le64(uint64_t x) { return _byteswap_uint64(x);} +static inline uint64_t le64_from_host(uint64_t x) { return _byteswap_uint64(x);} + +#else + +static inline uint32_t host_from_le32(uint32_t x) {return __builtin_bswap32(x);} +static inline uint32_t le32_from_host(uint32_t x) {return __builtin_bswap32(x);} +static inline uint64_t host_from_le64(uint64_t x) {return __builtin_bswap64(x);} +static inline uint64_t le64_from_host(uint64_t x) {return __builtin_bswap64(x);} + +#endif + +} // namespace highwayhash + +#endif // HIGHWAYHASH_ENDIANESS_H_ diff --git a/contrib/libs/highwayhash/highwayhash/example.cc b/contrib/libs/highwayhash/highwayhash/example.cc index 587e3c5985..ed7c6a3173 100644 --- a/contrib/libs/highwayhash/highwayhash/example.cc +++ b/contrib/libs/highwayhash/highwayhash/example.cc @@ -1,30 +1,30 @@ -// Minimal usage example: prints a hash. Tested on x86, ppc, arm. - -#include "highwayhash/highwayhash.h" - -#include <algorithm> -#include <iostream> - -using namespace highwayhash; - -int main(int argc, char* argv[]) { - // Please use a different key to ensure your hashes aren't identical. - const HHKey key HH_ALIGNAS(32) = {1, 2, 3, 4}; - // Aligning inputs to 32 bytes may help but is not required. - const char in[] = "bytes_to_hash"; - // Type determines the hash size; can also be HHResult128 or HHResult256. - HHResult64 result; - // HH_TARGET_PREFERRED expands to the best specialization available for the - // CPU detected via compiler flags (e.g. AVX2 #ifdef __AVX2__). - HHStateT<HH_TARGET_PREFERRED> state(key); - // Using argc prevents the compiler from eliding the hash computations. 
- const size_t size = std::min(sizeof(in), static_cast<size_t>(argc)); - HighwayHashT(&state, in, size, &result); - std::cout << "Hash : " << result << std::endl; - - HighwayHashCatT<HH_TARGET_PREFERRED> cat(key); - cat.Append(in, size); - cat.Finalize(&result); - std::cout << "HashCat: " << result << std::endl; - return 0; -} +// Minimal usage example: prints a hash. Tested on x86, ppc, arm. + +#include "highwayhash/highwayhash.h" + +#include <algorithm> +#include <iostream> + +using namespace highwayhash; + +int main(int argc, char* argv[]) { + // Please use a different key to ensure your hashes aren't identical. + const HHKey key HH_ALIGNAS(32) = {1, 2, 3, 4}; + // Aligning inputs to 32 bytes may help but is not required. + const char in[] = "bytes_to_hash"; + // Type determines the hash size; can also be HHResult128 or HHResult256. + HHResult64 result; + // HH_TARGET_PREFERRED expands to the best specialization available for the + // CPU detected via compiler flags (e.g. AVX2 #ifdef __AVX2__). + HHStateT<HH_TARGET_PREFERRED> state(key); + // Using argc prevents the compiler from eliding the hash computations. + const size_t size = std::min(sizeof(in), static_cast<size_t>(argc)); + HighwayHashT(&state, in, size, &result); + std::cout << "Hash : " << result << std::endl; + + HighwayHashCatT<HH_TARGET_PREFERRED> cat(key); + cat.Append(in, size); + cat.Finalize(&result); + std::cout << "HashCat: " << result << std::endl; + return 0; +} diff --git a/contrib/libs/highwayhash/highwayhash/hh_avx2.cc b/contrib/libs/highwayhash/highwayhash/hh_avx2.cc index 7e3ddff0d4..b4477ad2e2 100644 --- a/contrib/libs/highwayhash/highwayhash/hh_avx2.cc +++ b/contrib/libs/highwayhash/highwayhash/hh_avx2.cc @@ -1,19 +1,19 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// WARNING: this is a "restricted" source file; avoid including any headers -// unless they are also restricted. See arch_specific.h for details. - -#define HH_TARGET_NAME AVX2 -#include "highwayhash/highwayhash_target.cc" +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. 
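// The file pattern shown here: each SIMD target is a tiny translation unit
// that names the target and textually includes the shared implementation,
// which the build compiles with matching flags (-mavx2 for the two AVX2 lines
// that follow). A sibling target would take the same shape, much like the
// library's other hh_*.cc targets (a sketch, assuming SSE4.1 flags):
#define HH_TARGET_NAME SSE41
#include "highwayhash/highwayhash_target.cc"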
+ +#define HH_TARGET_NAME AVX2 +#include "highwayhash/highwayhash_target.cc" diff --git a/contrib/libs/highwayhash/highwayhash/hh_avx2.h b/contrib/libs/highwayhash/highwayhash/hh_avx2.h index 2912a31b11..dfb85ab32a 100644 --- a/contrib/libs/highwayhash/highwayhash/hh_avx2.h +++ b/contrib/libs/highwayhash/highwayhash/hh_avx2.h @@ -1,383 +1,383 @@ -// Copyright 2015-2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_HH_AVX2_H_ -#define HIGHWAYHASH_HH_AVX2_H_ - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" -#include "highwayhash/hh_buffer.h" -#include "highwayhash/hh_types.h" -#include "highwayhash/load3.h" -#include "highwayhash/vector128.h" -#include "highwayhash/vector256.h" - -// For auto-dependency generation, we need to include all headers but not their -// contents (otherwise compilation fails because -mavx2 is not specified). -#ifndef HH_DISABLE_TARGET_SPECIFIC - -namespace highwayhash { -// See vector128.h for why this namespace is necessary; matching it here makes -// it easier use the vector128 symbols, but requires textual inclusion. -namespace HH_TARGET_NAME { - -class HHStateAVX2 { - public: - explicit HH_INLINE HHStateAVX2(const HHKey key_lanes) { Reset(key_lanes); } - - HH_INLINE void Reset(const HHKey key_lanes) { - // "Nothing up my sleeve" numbers, concatenated hex digits of Pi from - // http://www.numberworld.org/digits/Pi/, retrieved Feb 22, 2016. - // - // We use this python code to generate the fourth number to have - // more even mixture of bits: - /* -def x(a,b,c): - retval = 0 - for i in range(64): - count = ((a >> i) & 1) + ((b >> i) & 1) + ((c >> i) & 1) - if (count <= 1): - retval |= 1 << i - return retval - */ - const V4x64U init0(0x243f6a8885a308d3ull, 0x13198a2e03707344ull, - 0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full); - const V4x64U init1(0x452821e638d01377ull, 0xbe5466cf34e90c6cull, - 0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull); - const V4x64U key = LoadUnaligned<V4x64U>(key_lanes); - v0 = key ^ init0; - v1 = Rotate64By32(key) ^ init1; - mul0 = init0; - mul1 = init1; - } - - HH_INLINE void Update(const HHPacket& packet_bytes) { - const uint64_t* HH_RESTRICT packet = - reinterpret_cast<const uint64_t * HH_RESTRICT>(packet_bytes); - Update(LoadUnaligned<V4x64U>(packet)); - } - - HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) { - // 'Length padding' differentiates zero-valued inputs that have the same - // size/32. mod32 is sufficient because each Update behaves as if a - // counter were injected, because the state is large and mixed thoroughly. 
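// A scalar model of the length padding above (a sketch only; the real code
// operates on all vector lanes at once): size_mod32 is injected into v0 and
// each 32-bit lane of v1 is rotated left by size_mod32 bits, so inputs whose
// sizes differ mod 32 diverge immediately even when their bytes match.
#include <cstdint>

static inline uint32_t Rotl32(const uint32_t x, const uint32_t count) {
  // count is size_mod32, i.e. 0..31, so both shifts below are well defined.
  return count == 0 ? x : (x << count) | (x >> (32 - count));
}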
- const V8x32U size256( - _mm256_broadcastd_epi32(_mm_cvtsi64_si128(size_mod32))); - // Equivalent to storing size_mod32 in packet. - v0 += V4x64U(size256); - // Boosts the avalanche effect of mod32. - v1 = Rotate32By(v1, size256); - - const char* remainder = bytes + (size_mod32 & ~3); - const size_t size_mod4 = size_mod32 & 3; - - const V4x32U size(_mm256_castsi256_si128(size256)); - - // (Branching is faster than a single _mm256_maskload_epi32.) - if (HH_UNLIKELY(size_mod32 & 16)) { // 16..31 bytes left - const V4x32U packetL = - LoadUnaligned<V4x32U>(reinterpret_cast<const uint32_t*>(bytes)); - - const V4x32U int_mask = IntMask<16>()(size); - const V4x32U int_lanes = MaskedLoadInt(bytes + 16, int_mask); - const uint32_t last4 = - Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4); - - // The upper four bytes of packetH are zero, so insert there. - const V4x32U packetH(_mm_insert_epi32(int_lanes, last4, 3)); - Update(packetH, packetL); - } else { // size_mod32 < 16 - const V4x32U int_mask = IntMask<0>()(size); - const V4x32U packetL = MaskedLoadInt(bytes, int_mask); - const uint64_t last3 = - Load3()(Load3::AllowUnordered(), remainder, size_mod4); - - // Rather than insert into packetL[3], it is faster to initialize - // the otherwise empty packetH. - const V4x32U packetH(_mm_cvtsi64_si128(last3)); - Update(packetH, packetL); - } - } - - HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) { - // Mix together all lanes. It is slightly better to permute v0 than v1; - // it will be added to v1. - Update(Permute(v0)); - Update(Permute(v0)); - Update(Permute(v0)); - Update(Permute(v0)); - - const V2x64U sum0(_mm256_castsi256_si128(v0 + mul0)); - const V2x64U sum1(_mm256_castsi256_si128(v1 + mul1)); - const V2x64U hash = sum0 + sum1; - // Each lane is sufficiently mixed, so just truncate to 64 bits. - _mm_storel_epi64(reinterpret_cast<__m128i*>(result), hash); - } - - HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) { - Update(Permute(v0)); - Update(Permute(v0)); - Update(Permute(v0)); - Update(Permute(v0)); - - const V2x64U sum0(_mm256_castsi256_si128(v0 + mul0)); - const V2x64U sum1(_mm256_extracti128_si256(v1 + mul1, 1)); - const V2x64U hash = sum0 + sum1; - _mm_storeu_si128(reinterpret_cast<__m128i*>(result), hash); - } - - HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) { - Update(Permute(v0)); - Update(Permute(v0)); - Update(Permute(v0)); - Update(Permute(v0)); - - const V4x64U sum0 = v0 + mul0; - const V4x64U sum1 = v1 + mul1; - const V4x64U hash = ModularReduction(sum1, sum0); - StoreUnaligned(hash, &(*result)[0]); - } - - // "buffer" must be 32-byte aligned. - static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer) { - const __m256i zero = _mm256_setzero_si256(); - _mm256_store_si256(reinterpret_cast<__m256i*>(buffer), zero); - } - - // "buffer" must be 32-byte aligned. 
- static HH_INLINE void CopyPartial(const char* HH_RESTRICT from, - const size_t size_mod32, - char* HH_RESTRICT buffer) { - const V4x32U size(size_mod32); - const uint32_t* const HH_RESTRICT from_u32 = - reinterpret_cast<const uint32_t * HH_RESTRICT>(from); - uint32_t* const HH_RESTRICT buffer_u32 = - reinterpret_cast<uint32_t * HH_RESTRICT>(buffer); - if (HH_UNLIKELY(size_mod32 & 16)) { // Copying 16..31 bytes - const V4x32U inL = LoadUnaligned<V4x32U>(from_u32); - Store(inL, buffer_u32); - const V4x32U inH = Load0To16<16, Load3::AllowReadBefore>( - from + 16, size_mod32 - 16, size); - Store(inH, buffer_u32 + V4x32U::N); - } else { // Copying 0..15 bytes - const V4x32U inL = Load0To16<>(from, size_mod32, size); - Store(inL, buffer_u32); - // No need to change upper 16 bytes of buffer. - } - } - - // "buffer" must be 32-byte aligned. - static HH_INLINE void AppendPartial(const char* HH_RESTRICT from, - const size_t size_mod32, - char* HH_RESTRICT buffer, - const size_t buffer_valid) { - const V4x32U size(size_mod32); - uint32_t* const HH_RESTRICT buffer_u32 = - reinterpret_cast<uint32_t * HH_RESTRICT>(buffer); - // buffer_valid + size <= 32 => appending 0..16 bytes inside upper 16 bytes. - if (HH_UNLIKELY(buffer_valid & 16)) { - const V4x32U suffix = Load0To16<>(from, size_mod32, size); - const V4x32U bufferH = Load<V4x32U>(buffer_u32 + V4x32U::N); - const V4x32U outH = Concatenate(bufferH, buffer_valid - 16, suffix); - Store(outH, buffer_u32 + V4x32U::N); - } else { // Appending 0..32 bytes starting at offset 0..15. - const V4x32U bufferL = Load<V4x32U>(buffer_u32); - const V4x32U suffixL = Load0To16<>(from, size_mod32, size); - const V4x32U outL = Concatenate(bufferL, buffer_valid, suffixL); - Store(outL, buffer_u32); - const size_t offsetH = sizeof(V4x32U) - buffer_valid; - // Do we have enough input to start filling the upper 16 buffer bytes? - if (size_mod32 > offsetH) { - const size_t sizeH = size_mod32 - offsetH; - const V4x32U outH = Load0To16<>(from + offsetH, sizeH, V4x32U(sizeH)); - Store(outH, buffer_u32 + V4x32U::N); - } - } - } - - // "buffer" must be 32-byte aligned. - HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from, - const size_t size_mod32, - const char* HH_RESTRICT buffer, - const size_t buffer_valid) { - const V4x32U size(size_mod32); - const uint32_t* const HH_RESTRICT buffer_u32 = - reinterpret_cast<const uint32_t * HH_RESTRICT>(buffer); - // buffer_valid + size <= 32 => appending 0..16 bytes inside upper 16 bytes. - if (HH_UNLIKELY(buffer_valid & 16)) { - const V4x32U suffix = Load0To16<>(from, size_mod32, size); - const V4x32U packetL = Load<V4x32U>(buffer_u32); - const V4x32U bufferH = Load<V4x32U>(buffer_u32 + V4x32U::N); - const V4x32U packetH = Concatenate(bufferH, buffer_valid - 16, suffix); - Update(packetH, packetL); - } else { // Appending 0..32 bytes starting at offset 0..15. - const V4x32U bufferL = Load<V4x32U>(buffer_u32); - const V4x32U suffixL = Load0To16<>(from, size_mod32, size); - const V4x32U packetL = Concatenate(bufferL, buffer_valid, suffixL); - const size_t offsetH = sizeof(V4x32U) - buffer_valid; - V4x32U packetH = packetL - packetL; - // Do we have enough input to start filling the upper 16 packet bytes? 
- if (size_mod32 > offsetH) { - const size_t sizeH = size_mod32 - offsetH; - packetH = Load0To16<>(from + offsetH, sizeH, V4x32U(sizeH)); - } - - Update(packetH, packetL); - } - } - - private: - static HH_INLINE V4x32U MaskedLoadInt(const char* from, - const V4x32U& int_mask) { - // No faults will be raised when reading n=0..3 ints from "from" provided - // int_mask[n] = 0. - const int* HH_RESTRICT int_from = reinterpret_cast<const int*>(from); - return V4x32U(_mm_maskload_epi32(int_from, int_mask)); - } - - // Loads <= 16 bytes without accessing any byte outside [from, from + size). - // from[i] is loaded into lane i; from[i >= size] is undefined. - template <uint32_t kSizeOffset = 0, class Load3Policy = Load3::AllowNone> - static HH_INLINE V4x32U Load0To16(const char* from, const size_t size_mod32, - const V4x32U& size) { - const char* remainder = from + (size_mod32 & ~3); - const uint64_t last3 = Load3()(Load3Policy(), remainder, size_mod32 & 3); - const V4x32U int_mask = IntMask<kSizeOffset>()(size); - const V4x32U int_lanes = MaskedLoadInt(from, int_mask); - return Insert4AboveMask(last3, int_mask, int_lanes); - } - - static HH_INLINE V4x64U Rotate64By32(const V4x64U& v) { - return V4x64U(_mm256_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1))); - } - - // Rotates 32-bit lanes by "count" bits. - static HH_INLINE V4x64U Rotate32By(const V4x64U& v, const V8x32U& count) { - // Use variable shifts because sll_epi32 has 4 cycle latency (presumably - // to broadcast the shift count). - const V4x64U shifted_left(_mm256_sllv_epi32(v, count)); - const V4x64U shifted_right(_mm256_srlv_epi32(v, V8x32U(32) - count)); - return shifted_left | shifted_right; - } - - static HH_INLINE V4x64U Permute(const V4x64U& v) { - // For complete mixing, we need to swap the upper and lower 128-bit halves; - // we also swap all 32-bit halves. This is faster than extracti128 plus - // inserti128 followed by Rotate64By32. - const V4x64U indices(0x0000000200000003ull, 0x0000000000000001ull, - 0x0000000600000007ull, 0x0000000400000005ull); - return V4x64U(_mm256_permutevar8x32_epi32(v, indices)); - } - - static HH_INLINE V4x64U MulLow32(const V4x64U& a, const V4x64U& b) { - return V4x64U(_mm256_mul_epu32(a, b)); - } - - static HH_INLINE V4x64U ZipperMerge(const V4x64U& v) { - // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to - // varying degrees. In descending order of goodness, bytes - // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. - // As expected, the upper and lower bytes are much worse. - // For each 64-bit lane, our objectives are: - // 1) maximizing and equalizing total goodness across the four lanes. - // 2) mixing with bytes from the neighboring lane (AVX-2 makes it difficult - // to cross the 128-bit wall, but PermuteAndUpdate takes care of that); - // 3) placing the worst bytes in the upper 32 bits because those will not - // be used in the next 32x32 multiplication. - const uint64_t hi = 0x070806090D0A040Bull; - const uint64_t lo = 0x000F010E05020C03ull; - return V4x64U(_mm256_shuffle_epi8(v, V4x64U(hi, lo, hi, lo))); - } - - // Updates four hash lanes in parallel by injecting four 64-bit packets. 
- HH_INLINE void Update(const V4x64U& packet) { - v1 += packet; - v1 += mul0; - mul0 ^= MulLow32(v1, v0 >> 32); - HH_COMPILER_FENCE; - v0 += mul1; - mul1 ^= MulLow32(v0, v1 >> 32); - HH_COMPILER_FENCE; - v0 += ZipperMerge(v1); - v1 += ZipperMerge(v0); - } - - HH_INLINE void Update(const V4x32U& packetH, const V4x32U& packetL) { - const __m256i packetL256 = _mm256_castsi128_si256(packetL); - Update(V4x64U(_mm256_inserti128_si256(packetL256, packetH, 1))); - } - - // XORs a << 1 and a << 2 into *out after clearing the upper two bits of a. - // Also does the same for the upper 128 bit lane "b". Bit shifts are only - // possible on independent 64-bit lanes. We therefore insert the upper bits - // of a[0] that were lost into a[1]. Thanks to D. Lemire for helpful comments! - static HH_INLINE void XorByShift128Left12(const V4x64U& ba, - V4x64U* HH_RESTRICT out) { - const V4x64U zero = ba ^ ba; - const V4x64U top_bits2 = ba >> (64 - 2); - const V4x64U ones = ba == ba; // FF .. FF - const V4x64U shifted1_unmasked = ba + ba; // (avoids needing port0) - HH_COMPILER_FENCE; - - // Only the lower halves of top_bits1's 128 bit lanes will be used, so we - // can compute it before clearing the upper two bits of ba. - const V4x64U top_bits1 = ba >> (64 - 1); - const V4x64U upper_8bytes(_mm256_slli_si256(ones, 8)); // F 0 F 0 - const V4x64U shifted2 = shifted1_unmasked + shifted1_unmasked; - HH_COMPILER_FENCE; - - const V4x64U upper_bit_of_128 = upper_8bytes << 63; // 80..00 80..00 - const V4x64U new_low_bits2(_mm256_unpacklo_epi64(zero, top_bits2)); - *out ^= shifted2; - HH_COMPILER_FENCE; - - // The result must be as if the upper two bits of the input had been clear, - // otherwise we're no longer computing a reduction. - const V4x64U shifted1 = AndNot(upper_bit_of_128, shifted1_unmasked); - *out ^= new_low_bits2; - HH_COMPILER_FENCE; - - const V4x64U new_low_bits1(_mm256_unpacklo_epi64(zero, top_bits1)); - *out ^= shifted1; - - *out ^= new_low_bits1; - } - - // Modular reduction by the irreducible polynomial (x^128 + x^2 + x). - // Input: two 256-bit numbers a3210 and b3210, interleaved in 2 vectors. - // The upper and lower 128-bit halves are processed independently. - static HH_INLINE V4x64U ModularReduction(const V4x64U& b32a32, - const V4x64U& b10a10) { - // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf. - V4x64U out = b10a10; - XorByShift128Left12(b32a32, &out); - return out; - } - - V4x64U v0; - V4x64U v1; - V4x64U mul0; - V4x64U mul1; -}; - -} // namespace HH_TARGET_NAME -} // namespace highwayhash - -#endif // HH_DISABLE_TARGET_SPECIFIC -#endif // HIGHWAYHASH_HH_AVX2_H_ +// Copyright 2015-2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_HH_AVX2_H_ +#define HIGHWAYHASH_HH_AVX2_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. 
This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_buffer.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/load3.h"
+#include "highwayhash/vector128.h"
+#include "highwayhash/vector256.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents (otherwise compilation fails because -mavx2 is not specified).
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+namespace highwayhash {
+// See vector128.h for why this namespace is necessary; matching it here makes
+// it easier to use the vector128 symbols, but requires textual inclusion.
+namespace HH_TARGET_NAME {
+
+class HHStateAVX2 {
+ public:
+  explicit HH_INLINE HHStateAVX2(const HHKey key_lanes) { Reset(key_lanes); }
+
+  HH_INLINE void Reset(const HHKey key_lanes) {
+    // "Nothing up my sleeve" numbers, concatenated hex digits of Pi from
+    // http://www.numberworld.org/digits/Pi/, retrieved Feb 22, 2016.
+    //
+    // We used this Python code to generate the fourth number, which has a
+    // more even mixture of bits:
+    /*
+def x(a,b,c):
+  retval = 0
+  for i in range(64):
+    count = ((a >> i) & 1) + ((b >> i) & 1) + ((c >> i) & 1)
+    if (count <= 1):
+      retval |= 1 << i
+  return retval
+    */
+    const V4x64U init0(0x243f6a8885a308d3ull, 0x13198a2e03707344ull,
+                       0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full);
+    const V4x64U init1(0x452821e638d01377ull, 0xbe5466cf34e90c6cull,
+                       0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull);
+    const V4x64U key = LoadUnaligned<V4x64U>(key_lanes);
+    v0 = key ^ init0;
+    v1 = Rotate64By32(key) ^ init1;
+    mul0 = init0;
+    mul1 = init1;
+  }
+
+  HH_INLINE void Update(const HHPacket& packet_bytes) {
+    const uint64_t* HH_RESTRICT packet =
+        reinterpret_cast<const uint64_t * HH_RESTRICT>(packet_bytes);
+    Update(LoadUnaligned<V4x64U>(packet));
+  }
+
+  HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+    // 'Length padding' differentiates zero-valued inputs that have the same
+    // size/32. mod32 is sufficient because each Update behaves as if a
+    // counter were injected, because the state is large and mixed thoroughly.
+    const V8x32U size256(
+        _mm256_broadcastd_epi32(_mm_cvtsi64_si128(size_mod32)));
+    // Equivalent to storing size_mod32 in packet.
+    v0 += V4x64U(size256);
+    // Boosts the avalanche effect of mod32.
+    v1 = Rotate32By(v1, size256);
+
+    const char* remainder = bytes + (size_mod32 & ~3);
+    const size_t size_mod4 = size_mod32 & 3;
+
+    const V4x32U size(_mm256_castsi256_si128(size256));
+
+    // (Branching is faster than a single _mm256_maskload_epi32.)
+    if (HH_UNLIKELY(size_mod32 & 16)) {  // 16..31 bytes left
+      const V4x32U packetL =
+          LoadUnaligned<V4x32U>(reinterpret_cast<const uint32_t*>(bytes));
+
+      const V4x32U int_mask = IntMask<16>()(size);
+      const V4x32U int_lanes = MaskedLoadInt(bytes + 16, int_mask);
+      const uint32_t last4 =
+          Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+
+      // The upper four bytes of packetH are zero, so insert there.
+ const V4x32U packetH(_mm_insert_epi32(int_lanes, last4, 3)); + Update(packetH, packetL); + } else { // size_mod32 < 16 + const V4x32U int_mask = IntMask<0>()(size); + const V4x32U packetL = MaskedLoadInt(bytes, int_mask); + const uint64_t last3 = + Load3()(Load3::AllowUnordered(), remainder, size_mod4); + + // Rather than insert into packetL[3], it is faster to initialize + // the otherwise empty packetH. + const V4x32U packetH(_mm_cvtsi64_si128(last3)); + Update(packetH, packetL); + } + } + + HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) { + // Mix together all lanes. It is slightly better to permute v0 than v1; + // it will be added to v1. + Update(Permute(v0)); + Update(Permute(v0)); + Update(Permute(v0)); + Update(Permute(v0)); + + const V2x64U sum0(_mm256_castsi256_si128(v0 + mul0)); + const V2x64U sum1(_mm256_castsi256_si128(v1 + mul1)); + const V2x64U hash = sum0 + sum1; + // Each lane is sufficiently mixed, so just truncate to 64 bits. + _mm_storel_epi64(reinterpret_cast<__m128i*>(result), hash); + } + + HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) { + Update(Permute(v0)); + Update(Permute(v0)); + Update(Permute(v0)); + Update(Permute(v0)); + + const V2x64U sum0(_mm256_castsi256_si128(v0 + mul0)); + const V2x64U sum1(_mm256_extracti128_si256(v1 + mul1, 1)); + const V2x64U hash = sum0 + sum1; + _mm_storeu_si128(reinterpret_cast<__m128i*>(result), hash); + } + + HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) { + Update(Permute(v0)); + Update(Permute(v0)); + Update(Permute(v0)); + Update(Permute(v0)); + + const V4x64U sum0 = v0 + mul0; + const V4x64U sum1 = v1 + mul1; + const V4x64U hash = ModularReduction(sum1, sum0); + StoreUnaligned(hash, &(*result)[0]); + } + + // "buffer" must be 32-byte aligned. + static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_store_si256(reinterpret_cast<__m256i*>(buffer), zero); + } + + // "buffer" must be 32-byte aligned. + static HH_INLINE void CopyPartial(const char* HH_RESTRICT from, + const size_t size_mod32, + char* HH_RESTRICT buffer) { + const V4x32U size(size_mod32); + const uint32_t* const HH_RESTRICT from_u32 = + reinterpret_cast<const uint32_t * HH_RESTRICT>(from); + uint32_t* const HH_RESTRICT buffer_u32 = + reinterpret_cast<uint32_t * HH_RESTRICT>(buffer); + if (HH_UNLIKELY(size_mod32 & 16)) { // Copying 16..31 bytes + const V4x32U inL = LoadUnaligned<V4x32U>(from_u32); + Store(inL, buffer_u32); + const V4x32U inH = Load0To16<16, Load3::AllowReadBefore>( + from + 16, size_mod32 - 16, size); + Store(inH, buffer_u32 + V4x32U::N); + } else { // Copying 0..15 bytes + const V4x32U inL = Load0To16<>(from, size_mod32, size); + Store(inL, buffer_u32); + // No need to change upper 16 bytes of buffer. + } + } + + // "buffer" must be 32-byte aligned. + static HH_INLINE void AppendPartial(const char* HH_RESTRICT from, + const size_t size_mod32, + char* HH_RESTRICT buffer, + const size_t buffer_valid) { + const V4x32U size(size_mod32); + uint32_t* const HH_RESTRICT buffer_u32 = + reinterpret_cast<uint32_t * HH_RESTRICT>(buffer); + // buffer_valid + size <= 32 => appending 0..16 bytes inside upper 16 bytes. + if (HH_UNLIKELY(buffer_valid & 16)) { + const V4x32U suffix = Load0To16<>(from, size_mod32, size); + const V4x32U bufferH = Load<V4x32U>(buffer_u32 + V4x32U::N); + const V4x32U outH = Concatenate(bufferH, buffer_valid - 16, suffix); + Store(outH, buffer_u32 + V4x32U::N); + } else { // Appending 0..32 bytes starting at offset 0..15. 
+ const V4x32U bufferL = Load<V4x32U>(buffer_u32); + const V4x32U suffixL = Load0To16<>(from, size_mod32, size); + const V4x32U outL = Concatenate(bufferL, buffer_valid, suffixL); + Store(outL, buffer_u32); + const size_t offsetH = sizeof(V4x32U) - buffer_valid; + // Do we have enough input to start filling the upper 16 buffer bytes? + if (size_mod32 > offsetH) { + const size_t sizeH = size_mod32 - offsetH; + const V4x32U outH = Load0To16<>(from + offsetH, sizeH, V4x32U(sizeH)); + Store(outH, buffer_u32 + V4x32U::N); + } + } + } + + // "buffer" must be 32-byte aligned. + HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from, + const size_t size_mod32, + const char* HH_RESTRICT buffer, + const size_t buffer_valid) { + const V4x32U size(size_mod32); + const uint32_t* const HH_RESTRICT buffer_u32 = + reinterpret_cast<const uint32_t * HH_RESTRICT>(buffer); + // buffer_valid + size <= 32 => appending 0..16 bytes inside upper 16 bytes. + if (HH_UNLIKELY(buffer_valid & 16)) { + const V4x32U suffix = Load0To16<>(from, size_mod32, size); + const V4x32U packetL = Load<V4x32U>(buffer_u32); + const V4x32U bufferH = Load<V4x32U>(buffer_u32 + V4x32U::N); + const V4x32U packetH = Concatenate(bufferH, buffer_valid - 16, suffix); + Update(packetH, packetL); + } else { // Appending 0..32 bytes starting at offset 0..15. + const V4x32U bufferL = Load<V4x32U>(buffer_u32); + const V4x32U suffixL = Load0To16<>(from, size_mod32, size); + const V4x32U packetL = Concatenate(bufferL, buffer_valid, suffixL); + const size_t offsetH = sizeof(V4x32U) - buffer_valid; + V4x32U packetH = packetL - packetL; + // Do we have enough input to start filling the upper 16 packet bytes? + if (size_mod32 > offsetH) { + const size_t sizeH = size_mod32 - offsetH; + packetH = Load0To16<>(from + offsetH, sizeH, V4x32U(sizeH)); + } + + Update(packetH, packetL); + } + } + + private: + static HH_INLINE V4x32U MaskedLoadInt(const char* from, + const V4x32U& int_mask) { + // No faults will be raised when reading n=0..3 ints from "from" provided + // int_mask[n] = 0. + const int* HH_RESTRICT int_from = reinterpret_cast<const int*>(from); + return V4x32U(_mm_maskload_epi32(int_from, int_mask)); + } + + // Loads <= 16 bytes without accessing any byte outside [from, from + size). + // from[i] is loaded into lane i; from[i >= size] is undefined. + template <uint32_t kSizeOffset = 0, class Load3Policy = Load3::AllowNone> + static HH_INLINE V4x32U Load0To16(const char* from, const size_t size_mod32, + const V4x32U& size) { + const char* remainder = from + (size_mod32 & ~3); + const uint64_t last3 = Load3()(Load3Policy(), remainder, size_mod32 & 3); + const V4x32U int_mask = IntMask<kSizeOffset>()(size); + const V4x32U int_lanes = MaskedLoadInt(from, int_mask); + return Insert4AboveMask(last3, int_mask, int_lanes); + } + + static HH_INLINE V4x64U Rotate64By32(const V4x64U& v) { + return V4x64U(_mm256_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1))); + } + + // Rotates 32-bit lanes by "count" bits. + static HH_INLINE V4x64U Rotate32By(const V4x64U& v, const V8x32U& count) { + // Use variable shifts because sll_epi32 has 4 cycle latency (presumably + // to broadcast the shift count). + const V4x64U shifted_left(_mm256_sllv_epi32(v, count)); + const V4x64U shifted_right(_mm256_srlv_epi32(v, V8x32U(32) - count)); + return shifted_left | shifted_right; + } + + static HH_INLINE V4x64U Permute(const V4x64U& v) { + // For complete mixing, we need to swap the upper and lower 128-bit halves; + // we also swap all 32-bit halves. 
This is faster than extracti128 plus + // inserti128 followed by Rotate64By32. + const V4x64U indices(0x0000000200000003ull, 0x0000000000000001ull, + 0x0000000600000007ull, 0x0000000400000005ull); + return V4x64U(_mm256_permutevar8x32_epi32(v, indices)); + } + + static HH_INLINE V4x64U MulLow32(const V4x64U& a, const V4x64U& b) { + return V4x64U(_mm256_mul_epu32(a, b)); + } + + static HH_INLINE V4x64U ZipperMerge(const V4x64U& v) { + // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + // varying degrees. In descending order of goodness, bytes + // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + // As expected, the upper and lower bytes are much worse. + // For each 64-bit lane, our objectives are: + // 1) maximizing and equalizing total goodness across the four lanes. + // 2) mixing with bytes from the neighboring lane (AVX-2 makes it difficult + // to cross the 128-bit wall, but PermuteAndUpdate takes care of that); + // 3) placing the worst bytes in the upper 32 bits because those will not + // be used in the next 32x32 multiplication. + const uint64_t hi = 0x070806090D0A040Bull; + const uint64_t lo = 0x000F010E05020C03ull; + return V4x64U(_mm256_shuffle_epi8(v, V4x64U(hi, lo, hi, lo))); + } + + // Updates four hash lanes in parallel by injecting four 64-bit packets. + HH_INLINE void Update(const V4x64U& packet) { + v1 += packet; + v1 += mul0; + mul0 ^= MulLow32(v1, v0 >> 32); + HH_COMPILER_FENCE; + v0 += mul1; + mul1 ^= MulLow32(v0, v1 >> 32); + HH_COMPILER_FENCE; + v0 += ZipperMerge(v1); + v1 += ZipperMerge(v0); + } + + HH_INLINE void Update(const V4x32U& packetH, const V4x32U& packetL) { + const __m256i packetL256 = _mm256_castsi128_si256(packetL); + Update(V4x64U(_mm256_inserti128_si256(packetL256, packetH, 1))); + } + + // XORs a << 1 and a << 2 into *out after clearing the upper two bits of a. + // Also does the same for the upper 128 bit lane "b". Bit shifts are only + // possible on independent 64-bit lanes. We therefore insert the upper bits + // of a[0] that were lost into a[1]. Thanks to D. Lemire for helpful comments! + static HH_INLINE void XorByShift128Left12(const V4x64U& ba, + V4x64U* HH_RESTRICT out) { + const V4x64U zero = ba ^ ba; + const V4x64U top_bits2 = ba >> (64 - 2); + const V4x64U ones = ba == ba; // FF .. FF + const V4x64U shifted1_unmasked = ba + ba; // (avoids needing port0) + HH_COMPILER_FENCE; + + // Only the lower halves of top_bits1's 128 bit lanes will be used, so we + // can compute it before clearing the upper two bits of ba. + const V4x64U top_bits1 = ba >> (64 - 1); + const V4x64U upper_8bytes(_mm256_slli_si256(ones, 8)); // F 0 F 0 + const V4x64U shifted2 = shifted1_unmasked + shifted1_unmasked; + HH_COMPILER_FENCE; + + const V4x64U upper_bit_of_128 = upper_8bytes << 63; // 80..00 80..00 + const V4x64U new_low_bits2(_mm256_unpacklo_epi64(zero, top_bits2)); + *out ^= shifted2; + HH_COMPILER_FENCE; + + // The result must be as if the upper two bits of the input had been clear, + // otherwise we're no longer computing a reduction. + const V4x64U shifted1 = AndNot(upper_bit_of_128, shifted1_unmasked); + *out ^= new_low_bits2; + HH_COMPILER_FENCE; + + const V4x64U new_low_bits1(_mm256_unpacklo_epi64(zero, top_bits1)); + *out ^= shifted1; + + *out ^= new_low_bits1; + } + + // Modular reduction by the irreducible polynomial (x^128 + x^2 + x). + // Input: two 256-bit numbers a3210 and b3210, interleaved in 2 vectors. + // The upper and lower 128-bit halves are processed independently. 
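[Editor's aside, not part of the commit: a quick sanity check of this
reduction. Over GF(2), x^128 mod (x^128 + x^2 + x) = x^2 + x, so reducing
a 256-bit half whose upper 128 bits equal 1 and lower bits equal 0 must
give lo ^ (hi << 1) ^ (hi << 2) = 0 ^ 2 ^ 4 = 6, i.e. the polynomial
x^2 + x, which is exactly the pair of shifted XORs that
XorByShift128Left12 applies.]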
+ static HH_INLINE V4x64U ModularReduction(const V4x64U& b32a32, + const V4x64U& b10a10) { + // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf. + V4x64U out = b10a10; + XorByShift128Left12(b32a32, &out); + return out; + } + + V4x64U v0; + V4x64U v1; + V4x64U mul0; + V4x64U mul1; +}; + +} // namespace HH_TARGET_NAME +} // namespace highwayhash + +#endif // HH_DISABLE_TARGET_SPECIFIC +#endif // HIGHWAYHASH_HH_AVX2_H_ diff --git a/contrib/libs/highwayhash/highwayhash/hh_buffer.h b/contrib/libs/highwayhash/highwayhash/hh_buffer.h index 5b1c83f95b..83b0fa6b8e 100644 --- a/contrib/libs/highwayhash/highwayhash/hh_buffer.h +++ b/contrib/libs/highwayhash/highwayhash/hh_buffer.h @@ -1,103 +1,103 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_HH_BUFFER_H_ -#define HIGHWAYHASH_HH_BUFFER_H_ - -// Helper functions used by hh_avx2 and hh_sse41. - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include "highwayhash/vector128.h" - -// For auto-dependency generation, we need to include all headers but not their -// contents (otherwise compilation fails because -msse4.1 is not specified). -#ifndef HH_DISABLE_TARGET_SPECIFIC - -namespace highwayhash { -// To prevent ODR violations when including this from multiple translation -// units (TU) that are compiled with different flags, the contents must reside -// in a namespace whose name is unique to the TU. NOTE: this behavior is -// incompatible with precompiled modules and requires textual inclusion instead. -namespace HH_TARGET_NAME { - -template <uint32_t kSizeOffset> -struct IntMask {}; // primary template - -template <> -struct IntMask<0> { - // Returns 32-bit lanes : ~0U if that lane can be loaded given "size" bytes. - // Typical case: size = 0..16, nothing deducted. - HH_INLINE V4x32U operator()(const V4x32U& size) const { - // Lane n is valid if size >= (n + 1) * 4; subtract one because we only have - // greater-than comparisons and don't want a negated mask. - return V4x32U(_mm_cmpgt_epi32(size, V4x32U(15, 11, 7, 3))); - } -}; - -template <> -struct IntMask<16> { - // "size" is 16..31; this is for loading the upper half of a packet, so - // effectively deduct 16 from size by changing the comparands. - HH_INLINE V4x32U operator()(const V4x32U& size) const { - return V4x32U(_mm_cmpgt_epi32(size, V4x32U(31, 27, 23, 19))); - } -}; - -// Inserts "bytes4" into "prev" at the lowest i such that mask[i] = 0. -// Assumes prev[j] == 0 if mask[j] = 0. -HH_INLINE V4x32U Insert4AboveMask(const uint32_t bytes4, const V4x32U& mask, - const V4x32U& prev) { - // There is no 128-bit shift by a variable count. Using shuffle_epi8 with a - // control mask requires a table lookup. 
We know the shift count is a - // multiple of 4 bytes, so we can broadcastd_epi32 and clear all lanes except - // those where mask != 0. This works because any upper output lanes need not - // be zero. - return prev | AndNot(mask, V4x32U(bytes4)); -} - -// Shifts "suffix" left by "prefix_len" = 0..15 bytes, clears upper bytes of -// "prefix", and returns the merged/concatenated bytes. -HH_INLINE V4x32U Concatenate(const V4x32U& prefix, const size_t prefix_len, - const V4x32U& suffix) { - static const uint64_t table[V16x8U::N][V2x64U::N] = { - {0x0706050403020100ull, 0x0F0E0D0C0B0A0908ull}, - {0x06050403020100FFull, 0x0E0D0C0B0A090807ull}, - {0x050403020100FFFFull, 0x0D0C0B0A09080706ull}, - {0x0403020100FFFFFFull, 0x0C0B0A0908070605ull}, - {0x03020100FFFFFFFFull, 0x0B0A090807060504ull}, - {0x020100FFFFFFFFFFull, 0x0A09080706050403ull}, - {0x0100FFFFFFFFFFFFull, 0x0908070605040302ull}, - {0x00FFFFFFFFFFFFFFull, 0x0807060504030201ull}, - {0xFFFFFFFFFFFFFFFFull, 0x0706050403020100ull}, - {0xFFFFFFFFFFFFFFFFull, 0x06050403020100FFull}, - {0xFFFFFFFFFFFFFFFFull, 0x050403020100FFFFull}, - {0xFFFFFFFFFFFFFFFFull, 0x0403020100FFFFFFull}, - {0xFFFFFFFFFFFFFFFFull, 0x03020100FFFFFFFFull}, - {0xFFFFFFFFFFFFFFFFull, 0x020100FFFFFFFFFFull}, - {0xFFFFFFFFFFFFFFFFull, 0x0100FFFFFFFFFFFFull}, - {0xFFFFFFFFFFFFFFFFull, 0x00FFFFFFFFFFFFFFull}}; - const V2x64U control = Load<V2x64U>(&table[prefix_len][0]); - const V2x64U shifted_suffix(_mm_shuffle_epi8(suffix, control)); - return V4x32U(_mm_blendv_epi8(shifted_suffix, prefix, control)); -} - -} // namespace HH_TARGET_NAME -} // namespace highwayhash - -#endif // HH_DISABLE_TARGET_SPECIFIC -#endif // HIGHWAYHASH_HH_BUFFER_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_HH_BUFFER_H_ +#define HIGHWAYHASH_HH_BUFFER_H_ + +// Helper functions used by hh_avx2 and hh_sse41. + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include "highwayhash/vector128.h" + +// For auto-dependency generation, we need to include all headers but not their +// contents (otherwise compilation fails because -msse4.1 is not specified). +#ifndef HH_DISABLE_TARGET_SPECIFIC + +namespace highwayhash { +// To prevent ODR violations when including this from multiple translation +// units (TU) that are compiled with different flags, the contents must reside +// in a namespace whose name is unique to the TU. NOTE: this behavior is +// incompatible with precompiled modules and requires textual inclusion instead. +namespace HH_TARGET_NAME { + +template <uint32_t kSizeOffset> +struct IntMask {}; // primary template + +template <> +struct IntMask<0> { + // Returns 32-bit lanes : ~0U if that lane can be loaded given "size" bytes. 
+ // Typical case: size = 0..16, nothing deducted. + HH_INLINE V4x32U operator()(const V4x32U& size) const { + // Lane n is valid if size >= (n + 1) * 4; subtract one because we only have + // greater-than comparisons and don't want a negated mask. + return V4x32U(_mm_cmpgt_epi32(size, V4x32U(15, 11, 7, 3))); + } +}; + +template <> +struct IntMask<16> { + // "size" is 16..31; this is for loading the upper half of a packet, so + // effectively deduct 16 from size by changing the comparands. + HH_INLINE V4x32U operator()(const V4x32U& size) const { + return V4x32U(_mm_cmpgt_epi32(size, V4x32U(31, 27, 23, 19))); + } +}; + +// Inserts "bytes4" into "prev" at the lowest i such that mask[i] = 0. +// Assumes prev[j] == 0 if mask[j] = 0. +HH_INLINE V4x32U Insert4AboveMask(const uint32_t bytes4, const V4x32U& mask, + const V4x32U& prev) { + // There is no 128-bit shift by a variable count. Using shuffle_epi8 with a + // control mask requires a table lookup. We know the shift count is a + // multiple of 4 bytes, so we can broadcastd_epi32 and clear all lanes except + // those where mask != 0. This works because any upper output lanes need not + // be zero. + return prev | AndNot(mask, V4x32U(bytes4)); +} + +// Shifts "suffix" left by "prefix_len" = 0..15 bytes, clears upper bytes of +// "prefix", and returns the merged/concatenated bytes. +HH_INLINE V4x32U Concatenate(const V4x32U& prefix, const size_t prefix_len, + const V4x32U& suffix) { + static const uint64_t table[V16x8U::N][V2x64U::N] = { + {0x0706050403020100ull, 0x0F0E0D0C0B0A0908ull}, + {0x06050403020100FFull, 0x0E0D0C0B0A090807ull}, + {0x050403020100FFFFull, 0x0D0C0B0A09080706ull}, + {0x0403020100FFFFFFull, 0x0C0B0A0908070605ull}, + {0x03020100FFFFFFFFull, 0x0B0A090807060504ull}, + {0x020100FFFFFFFFFFull, 0x0A09080706050403ull}, + {0x0100FFFFFFFFFFFFull, 0x0908070605040302ull}, + {0x00FFFFFFFFFFFFFFull, 0x0807060504030201ull}, + {0xFFFFFFFFFFFFFFFFull, 0x0706050403020100ull}, + {0xFFFFFFFFFFFFFFFFull, 0x06050403020100FFull}, + {0xFFFFFFFFFFFFFFFFull, 0x050403020100FFFFull}, + {0xFFFFFFFFFFFFFFFFull, 0x0403020100FFFFFFull}, + {0xFFFFFFFFFFFFFFFFull, 0x03020100FFFFFFFFull}, + {0xFFFFFFFFFFFFFFFFull, 0x020100FFFFFFFFFFull}, + {0xFFFFFFFFFFFFFFFFull, 0x0100FFFFFFFFFFFFull}, + {0xFFFFFFFFFFFFFFFFull, 0x00FFFFFFFFFFFFFFull}}; + const V2x64U control = Load<V2x64U>(&table[prefix_len][0]); + const V2x64U shifted_suffix(_mm_shuffle_epi8(suffix, control)); + return V4x32U(_mm_blendv_epi8(shifted_suffix, prefix, control)); +} + +} // namespace HH_TARGET_NAME +} // namespace highwayhash + +#endif // HH_DISABLE_TARGET_SPECIFIC +#endif // HIGHWAYHASH_HH_BUFFER_H_ diff --git a/contrib/libs/highwayhash/highwayhash/hh_portable.cc b/contrib/libs/highwayhash/highwayhash/hh_portable.cc index 3e0de9ed9c..1c8072aebe 100644 --- a/contrib/libs/highwayhash/highwayhash/hh_portable.cc +++ b/contrib/libs/highwayhash/highwayhash/hh_portable.cc @@ -1,19 +1,19 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// WARNING: this is a "restricted" source file; avoid including any headers -// unless they are also restricted. See arch_specific.h for details. - -#define HH_TARGET_NAME Portable -#include "highwayhash/highwayhash_target.cc" +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#define HH_TARGET_NAME Portable +#include "highwayhash/highwayhash_target.cc" diff --git a/contrib/libs/highwayhash/highwayhash/hh_portable.h b/contrib/libs/highwayhash/highwayhash/hh_portable.h index 11284deae8..150ecdee7c 100644 --- a/contrib/libs/highwayhash/highwayhash/hh_portable.h +++ b/contrib/libs/highwayhash/highwayhash/hh_portable.h @@ -1,301 +1,301 @@ -// Copyright 2015-2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_HH_PORTABLE_H_ -#define HIGHWAYHASH_HH_PORTABLE_H_ - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" -#include "highwayhash/endianess.h" -#include "highwayhash/hh_types.h" -#include "highwayhash/load3.h" - -namespace highwayhash { -// See vector128.h for why this namespace is necessary; we match it here for -// consistency. As a result, this header requires textual inclusion. 
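[Editor's aside: a minimal sketch of the per-target namespace pattern these
restricted headers rely on; the file and class names here are hypothetical.
Each .cc defines HH_TARGET_NAME before textually including the shared
implementation, so identical code compiles to distinct symbols per target
and cannot violate the ODR:

    // hh_example.cc (hypothetical)
    #define HH_TARGET_NAME Portable
    #include "highwayhash/hh_example_impl.h"

    // hh_example_impl.h (hypothetical, textually included)
    namespace highwayhash {
    namespace HH_TARGET_NAME {  // expands to Portable, SSE41 or AVX2
    class HHStateExample { /* same source, per-target symbols */ };
    }  // namespace HH_TARGET_NAME
    }  // namespace highwayhash
]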
-namespace HH_TARGET_NAME { - -class HHStatePortable { - public: - static const int kNumLanes = 4; - using Lanes = uint64_t[kNumLanes]; - - explicit HH_INLINE HHStatePortable(const HHKey keys) { Reset(keys); } - - HH_INLINE void Reset(const HHKey keys) { - static const Lanes init0 = {0xdbe6d5d5fe4cce2full, 0xa4093822299f31d0ull, - 0x13198a2e03707344ull, 0x243f6a8885a308d3ull}; - static const Lanes init1 = {0x3bd39e10cb0ef593ull, 0xc0acf169b5f18a8cull, - 0xbe5466cf34e90c6cull, 0x452821e638d01377ull}; - Lanes rotated_keys; - Rotate64By32(keys, &rotated_keys); - Copy(init0, &mul0); - Copy(init1, &mul1); - Xor(init0, keys, &v0); - Xor(init1, rotated_keys, &v1); - } - - HH_INLINE void Update(const HHPacket& packet) { - Lanes packet_lanes; - CopyPartial(&packet[0], sizeof(HHPacket), - reinterpret_cast<char*>(&packet_lanes)); - for (int lane = 0; lane < kNumLanes; ++lane) { - packet_lanes[lane] = host_from_le64(packet_lanes[lane]); - } - Update(packet_lanes); - } - - HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) { - // 'Length padding' differentiates zero-valued inputs that have the same - // size/32. mod32 is sufficient because each Update behaves as if a - // counter were injected, because the state is large and mixed thoroughly. - const uint64_t mod32_pair = (size_mod32 << 32) + size_mod32; - for (int lane = 0; lane < kNumLanes; ++lane) { - v0[lane] += mod32_pair; - } - Rotate32By(reinterpret_cast<uint32_t*>(&v1), size_mod32); - - const size_t size_mod4 = size_mod32 & 3; - const char* remainder = bytes + (size_mod32 & ~3); - - HHPacket packet HH_ALIGNAS(32) = {0}; - CopyPartial(bytes, remainder - bytes, &packet[0]); - - if (size_mod32 & 16) { // 16..31 bytes left - // Read the last 0..3 bytes and previous 1..4 into the upper bits. - // Insert into the upper four bytes of packet, which are zero. - uint32_t last4 = - Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4); - CopyPartial(reinterpret_cast<const char*>(&last4), 4, &packet[28]); - } else { // size_mod32 < 16 - uint64_t last4 = Load3()(Load3::AllowUnordered(), remainder, size_mod4); - - // Rather than insert at packet + 28, it is faster to initialize - // the otherwise empty packet + 16 with up to 64 bits of padding. 
- CopyPartial(reinterpret_cast<const char*>(&last4), sizeof(last4), - &packet[16]); - } - Update(packet); - } - - HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) { - PermuteAndUpdate(); - PermuteAndUpdate(); - PermuteAndUpdate(); - PermuteAndUpdate(); - - *result = v0[0] + v1[0] + mul0[0] + mul1[0]; - } - - HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) { - PermuteAndUpdate(); - PermuteAndUpdate(); - PermuteAndUpdate(); - PermuteAndUpdate(); - - (*result)[0] = v0[0] + mul0[0] + v1[2] + mul1[2]; - (*result)[1] = v0[1] + mul0[1] + v1[3] + mul1[3]; - } - - HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) { - PermuteAndUpdate(); - PermuteAndUpdate(); - PermuteAndUpdate(); - PermuteAndUpdate(); - - ModularReduction(v1[1] + mul1[1], v1[0] + mul1[0], v0[1] + mul0[1], - v0[0] + mul0[0], &(*result)[1], &(*result)[0]); - ModularReduction(v1[3] + mul1[3], v1[2] + mul1[2], v0[3] + mul0[3], - v0[2] + mul0[2], &(*result)[3], &(*result)[2]); - } - - static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer) { - for (size_t i = 0; i < sizeof(HHPacket); ++i) { - buffer[i] = 0; - } - } - - static HH_INLINE void CopyPartial(const char* HH_RESTRICT from, - const size_t size_mod32, - char* HH_RESTRICT buffer) { - for (size_t i = 0; i < size_mod32; ++i) { - buffer[i] = from[i]; - } - } - - static HH_INLINE void AppendPartial(const char* HH_RESTRICT from, - const size_t size_mod32, - char* HH_RESTRICT buffer, - const size_t buffer_valid) { - for (size_t i = 0; i < size_mod32; ++i) { - buffer[buffer_valid + i] = from[i]; - } - } - - HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from, - const size_t size_mod32, - const char* HH_RESTRICT buffer, - const size_t buffer_valid) { - HHPacket tmp HH_ALIGNAS(32); - for (size_t i = 0; i < buffer_valid; ++i) { - tmp[i] = buffer[i]; - } - for (size_t i = 0; i < size_mod32; ++i) { - tmp[buffer_valid + i] = from[i]; - } - Update(tmp); - } - - private: - static HH_INLINE void Copy(const Lanes& source, Lanes* HH_RESTRICT dest) { - for (int lane = 0; lane < kNumLanes; ++lane) { - (*dest)[lane] = source[lane]; - } - } - - static HH_INLINE void Add(const Lanes& source, Lanes* HH_RESTRICT dest) { - for (int lane = 0; lane < kNumLanes; ++lane) { - (*dest)[lane] += source[lane]; - } - } - - template <typename LanesOrPointer> - static HH_INLINE void Xor(const Lanes& op1, const LanesOrPointer& op2, - Lanes* HH_RESTRICT dest) { - for (int lane = 0; lane < kNumLanes; ++lane) { - (*dest)[lane] = op1[lane] ^ op2[lane]; - } - } - -// Clears all bits except one byte at the given offset. -#define MASK(v, bytes) ((v) & (0xFFull << ((bytes)*8))) - - // 16-byte permutation; shifting is about 10% faster than byte loads. - // Adds zipper-merge result to add*. - static HH_INLINE void ZipperMergeAndAdd(const uint64_t v1, const uint64_t v0, - uint64_t* HH_RESTRICT add1, - uint64_t* HH_RESTRICT add0) { - *add0 += ((MASK(v0, 3) + MASK(v1, 4)) >> 24) + - ((MASK(v0, 5) + MASK(v1, 6)) >> 16) + MASK(v0, 2) + - (MASK(v0, 1) << 32) + (MASK(v1, 7) >> 8) + (v0 << 56); - - *add1 += ((MASK(v1, 3) + MASK(v0, 4)) >> 24) + MASK(v1, 2) + - (MASK(v1, 5) >> 16) + (MASK(v1, 1) << 24) + (MASK(v0, 6) >> 8) + - (MASK(v1, 0) << 48) + MASK(v0, 7); - } - -#undef MASK - - // For inputs that are already in native byte order (e.g. 
PermuteAndAdd) - HH_INLINE void Update(const Lanes& packet_lanes) { - Add(packet_lanes, &v1); - Add(mul0, &v1); - - // (Loop is faster than unrolling) - for (int lane = 0; lane < kNumLanes; ++lane) { - const uint32_t v1_32 = static_cast<uint32_t>(v1[lane]); - mul0[lane] ^= v1_32 * (v0[lane] >> 32); - v0[lane] += mul1[lane]; - const uint32_t v0_32 = static_cast<uint32_t>(v0[lane]); - mul1[lane] ^= v0_32 * (v1[lane] >> 32); - } - - ZipperMergeAndAdd(v1[1], v1[0], &v0[1], &v0[0]); - ZipperMergeAndAdd(v1[3], v1[2], &v0[3], &v0[2]); - - ZipperMergeAndAdd(v0[1], v0[0], &v1[1], &v1[0]); - ZipperMergeAndAdd(v0[3], v0[2], &v1[3], &v1[2]); - } - - static HH_INLINE uint64_t Rotate64By32(const uint64_t x) { - return (x >> 32) | (x << 32); - } - - template <typename LanesOrPointer> - static HH_INLINE void Rotate64By32(const LanesOrPointer& v, - Lanes* HH_RESTRICT rotated) { - for (int i = 0; i < kNumLanes; ++i) { - (*rotated)[i] = Rotate64By32(v[i]); - } - } - - static HH_INLINE void Rotate32By(uint32_t* halves, const uint64_t count) { - for (int i = 0; i < 2 * kNumLanes; ++i) { - const uint32_t x = halves[i]; - halves[i] = (x << count) | (x >> (32 - count)); - } - } - - static HH_INLINE void Permute(const Lanes& v, Lanes* HH_RESTRICT permuted) { - (*permuted)[0] = Rotate64By32(v[2]); - (*permuted)[1] = Rotate64By32(v[3]); - (*permuted)[2] = Rotate64By32(v[0]); - (*permuted)[3] = Rotate64By32(v[1]); - } - - HH_INLINE void PermuteAndUpdate() { - Lanes permuted; - Permute(v0, &permuted); - Update(permuted); - } - - // Computes a << kBits for 128-bit a = (a1, a0). - // Bit shifts are only possible on independent 64-bit lanes. We therefore - // insert the upper bits of a0 that were lost into a1. This is slightly - // shorter than Lemire's (a << 1) | (((a >> 8) << 1) << 8) approach. - template <int kBits> - static HH_INLINE void Shift128Left(uint64_t* HH_RESTRICT a1, - uint64_t* HH_RESTRICT a0) { - const uint64_t shifted1 = (*a1) << kBits; - const uint64_t top_bits = (*a0) >> (64 - kBits); - *a0 <<= kBits; - *a1 = shifted1 | top_bits; - } - - // Modular reduction by the irreducible polynomial (x^128 + x^2 + x). - // Input: a 256-bit number a3210. - static HH_INLINE void ModularReduction(const uint64_t a3_unmasked, - const uint64_t a2, const uint64_t a1, - const uint64_t a0, - uint64_t* HH_RESTRICT m1, - uint64_t* HH_RESTRICT m0) { - // The upper two bits must be clear, otherwise a3 << 2 would lose bits, - // in which case we're no longer computing a reduction. - const uint64_t a3 = a3_unmasked & 0x3FFFFFFFFFFFFFFFull; - // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf. - uint64_t a3_shl1 = a3; - uint64_t a2_shl1 = a2; - uint64_t a3_shl2 = a3; - uint64_t a2_shl2 = a2; - Shift128Left<1>(&a3_shl1, &a2_shl1); - Shift128Left<2>(&a3_shl2, &a2_shl2); - *m1 = a1 ^ a3_shl1 ^ a3_shl2; - *m0 = a0 ^ a2_shl1 ^ a2_shl2; - } - - Lanes v0; - Lanes v1; - Lanes mul0; - Lanes mul1; -}; - -} // namespace HH_TARGET_NAME -} // namespace highwayhash - -#endif // HIGHWAYHASH_HH_PORTABLE_H_ +// Copyright 2015-2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_HH_PORTABLE_H_ +#define HIGHWAYHASH_HH_PORTABLE_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" +#include "highwayhash/endianess.h" +#include "highwayhash/hh_types.h" +#include "highwayhash/load3.h" + +namespace highwayhash { +// See vector128.h for why this namespace is necessary; we match it here for +// consistency. As a result, this header requires textual inclusion. +namespace HH_TARGET_NAME { + +class HHStatePortable { + public: + static const int kNumLanes = 4; + using Lanes = uint64_t[kNumLanes]; + + explicit HH_INLINE HHStatePortable(const HHKey keys) { Reset(keys); } + + HH_INLINE void Reset(const HHKey keys) { + static const Lanes init0 = {0xdbe6d5d5fe4cce2full, 0xa4093822299f31d0ull, + 0x13198a2e03707344ull, 0x243f6a8885a308d3ull}; + static const Lanes init1 = {0x3bd39e10cb0ef593ull, 0xc0acf169b5f18a8cull, + 0xbe5466cf34e90c6cull, 0x452821e638d01377ull}; + Lanes rotated_keys; + Rotate64By32(keys, &rotated_keys); + Copy(init0, &mul0); + Copy(init1, &mul1); + Xor(init0, keys, &v0); + Xor(init1, rotated_keys, &v1); + } + + HH_INLINE void Update(const HHPacket& packet) { + Lanes packet_lanes; + CopyPartial(&packet[0], sizeof(HHPacket), + reinterpret_cast<char*>(&packet_lanes)); + for (int lane = 0; lane < kNumLanes; ++lane) { + packet_lanes[lane] = host_from_le64(packet_lanes[lane]); + } + Update(packet_lanes); + } + + HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) { + // 'Length padding' differentiates zero-valued inputs that have the same + // size/32. mod32 is sufficient because each Update behaves as if a + // counter were injected, because the state is large and mixed thoroughly. + const uint64_t mod32_pair = (size_mod32 << 32) + size_mod32; + for (int lane = 0; lane < kNumLanes; ++lane) { + v0[lane] += mod32_pair; + } + Rotate32By(reinterpret_cast<uint32_t*>(&v1), size_mod32); + + const size_t size_mod4 = size_mod32 & 3; + const char* remainder = bytes + (size_mod32 & ~3); + + HHPacket packet HH_ALIGNAS(32) = {0}; + CopyPartial(bytes, remainder - bytes, &packet[0]); + + if (size_mod32 & 16) { // 16..31 bytes left + // Read the last 0..3 bytes and previous 1..4 into the upper bits. + // Insert into the upper four bytes of packet, which are zero. + uint32_t last4 = + Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4); + CopyPartial(reinterpret_cast<const char*>(&last4), 4, &packet[28]); + } else { // size_mod32 < 16 + uint64_t last4 = Load3()(Load3::AllowUnordered(), remainder, size_mod4); + + // Rather than insert at packet + 28, it is faster to initialize + // the otherwise empty packet + 16 with up to 64 bits of padding. 
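[Editor's aside: a concrete trace of this tail handling for size_mod32 = 5,
so size_mod4 = 1 and remainder = bytes + 4. The CopyPartial above placed
bytes[0..3] into packet[0..3]; Load3 reads the final byte into the low bits
of last4; and the CopyPartial below stores those eight bytes at
packet[16..23]. The rest of packet stays zero, so inputs with equal 4-byte
prefixes but different tails produce different packets.]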
+ CopyPartial(reinterpret_cast<const char*>(&last4), sizeof(last4), + &packet[16]); + } + Update(packet); + } + + HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) { + PermuteAndUpdate(); + PermuteAndUpdate(); + PermuteAndUpdate(); + PermuteAndUpdate(); + + *result = v0[0] + v1[0] + mul0[0] + mul1[0]; + } + + HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) { + PermuteAndUpdate(); + PermuteAndUpdate(); + PermuteAndUpdate(); + PermuteAndUpdate(); + + (*result)[0] = v0[0] + mul0[0] + v1[2] + mul1[2]; + (*result)[1] = v0[1] + mul0[1] + v1[3] + mul1[3]; + } + + HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) { + PermuteAndUpdate(); + PermuteAndUpdate(); + PermuteAndUpdate(); + PermuteAndUpdate(); + + ModularReduction(v1[1] + mul1[1], v1[0] + mul1[0], v0[1] + mul0[1], + v0[0] + mul0[0], &(*result)[1], &(*result)[0]); + ModularReduction(v1[3] + mul1[3], v1[2] + mul1[2], v0[3] + mul0[3], + v0[2] + mul0[2], &(*result)[3], &(*result)[2]); + } + + static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer) { + for (size_t i = 0; i < sizeof(HHPacket); ++i) { + buffer[i] = 0; + } + } + + static HH_INLINE void CopyPartial(const char* HH_RESTRICT from, + const size_t size_mod32, + char* HH_RESTRICT buffer) { + for (size_t i = 0; i < size_mod32; ++i) { + buffer[i] = from[i]; + } + } + + static HH_INLINE void AppendPartial(const char* HH_RESTRICT from, + const size_t size_mod32, + char* HH_RESTRICT buffer, + const size_t buffer_valid) { + for (size_t i = 0; i < size_mod32; ++i) { + buffer[buffer_valid + i] = from[i]; + } + } + + HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from, + const size_t size_mod32, + const char* HH_RESTRICT buffer, + const size_t buffer_valid) { + HHPacket tmp HH_ALIGNAS(32); + for (size_t i = 0; i < buffer_valid; ++i) { + tmp[i] = buffer[i]; + } + for (size_t i = 0; i < size_mod32; ++i) { + tmp[buffer_valid + i] = from[i]; + } + Update(tmp); + } + + private: + static HH_INLINE void Copy(const Lanes& source, Lanes* HH_RESTRICT dest) { + for (int lane = 0; lane < kNumLanes; ++lane) { + (*dest)[lane] = source[lane]; + } + } + + static HH_INLINE void Add(const Lanes& source, Lanes* HH_RESTRICT dest) { + for (int lane = 0; lane < kNumLanes; ++lane) { + (*dest)[lane] += source[lane]; + } + } + + template <typename LanesOrPointer> + static HH_INLINE void Xor(const Lanes& op1, const LanesOrPointer& op2, + Lanes* HH_RESTRICT dest) { + for (int lane = 0; lane < kNumLanes; ++lane) { + (*dest)[lane] = op1[lane] ^ op2[lane]; + } + } + +// Clears all bits except one byte at the given offset. +#define MASK(v, bytes) ((v) & (0xFFull << ((bytes)*8))) + + // 16-byte permutation; shifting is about 10% faster than byte loads. + // Adds zipper-merge result to add*. + static HH_INLINE void ZipperMergeAndAdd(const uint64_t v1, const uint64_t v0, + uint64_t* HH_RESTRICT add1, + uint64_t* HH_RESTRICT add0) { + *add0 += ((MASK(v0, 3) + MASK(v1, 4)) >> 24) + + ((MASK(v0, 5) + MASK(v1, 6)) >> 16) + MASK(v0, 2) + + (MASK(v0, 1) << 32) + (MASK(v1, 7) >> 8) + (v0 << 56); + + *add1 += ((MASK(v1, 3) + MASK(v0, 4)) >> 24) + MASK(v1, 2) + + (MASK(v1, 5) >> 16) + (MASK(v1, 1) << 24) + (MASK(v0, 6) >> 8) + + (MASK(v1, 0) << 48) + MASK(v0, 7); + } + +#undef MASK + + // For inputs that are already in native byte order (e.g. 
PermuteAndAdd) + HH_INLINE void Update(const Lanes& packet_lanes) { + Add(packet_lanes, &v1); + Add(mul0, &v1); + + // (Loop is faster than unrolling) + for (int lane = 0; lane < kNumLanes; ++lane) { + const uint32_t v1_32 = static_cast<uint32_t>(v1[lane]); + mul0[lane] ^= v1_32 * (v0[lane] >> 32); + v0[lane] += mul1[lane]; + const uint32_t v0_32 = static_cast<uint32_t>(v0[lane]); + mul1[lane] ^= v0_32 * (v1[lane] >> 32); + } + + ZipperMergeAndAdd(v1[1], v1[0], &v0[1], &v0[0]); + ZipperMergeAndAdd(v1[3], v1[2], &v0[3], &v0[2]); + + ZipperMergeAndAdd(v0[1], v0[0], &v1[1], &v1[0]); + ZipperMergeAndAdd(v0[3], v0[2], &v1[3], &v1[2]); + } + + static HH_INLINE uint64_t Rotate64By32(const uint64_t x) { + return (x >> 32) | (x << 32); + } + + template <typename LanesOrPointer> + static HH_INLINE void Rotate64By32(const LanesOrPointer& v, + Lanes* HH_RESTRICT rotated) { + for (int i = 0; i < kNumLanes; ++i) { + (*rotated)[i] = Rotate64By32(v[i]); + } + } + + static HH_INLINE void Rotate32By(uint32_t* halves, const uint64_t count) { + for (int i = 0; i < 2 * kNumLanes; ++i) { + const uint32_t x = halves[i]; + halves[i] = (x << count) | (x >> (32 - count)); + } + } + + static HH_INLINE void Permute(const Lanes& v, Lanes* HH_RESTRICT permuted) { + (*permuted)[0] = Rotate64By32(v[2]); + (*permuted)[1] = Rotate64By32(v[3]); + (*permuted)[2] = Rotate64By32(v[0]); + (*permuted)[3] = Rotate64By32(v[1]); + } + + HH_INLINE void PermuteAndUpdate() { + Lanes permuted; + Permute(v0, &permuted); + Update(permuted); + } + + // Computes a << kBits for 128-bit a = (a1, a0). + // Bit shifts are only possible on independent 64-bit lanes. We therefore + // insert the upper bits of a0 that were lost into a1. This is slightly + // shorter than Lemire's (a << 1) | (((a >> 8) << 1) << 8) approach. + template <int kBits> + static HH_INLINE void Shift128Left(uint64_t* HH_RESTRICT a1, + uint64_t* HH_RESTRICT a0) { + const uint64_t shifted1 = (*a1) << kBits; + const uint64_t top_bits = (*a0) >> (64 - kBits); + *a0 <<= kBits; + *a1 = shifted1 | top_bits; + } + + // Modular reduction by the irreducible polynomial (x^128 + x^2 + x). + // Input: a 256-bit number a3210. + static HH_INLINE void ModularReduction(const uint64_t a3_unmasked, + const uint64_t a2, const uint64_t a1, + const uint64_t a0, + uint64_t* HH_RESTRICT m1, + uint64_t* HH_RESTRICT m0) { + // The upper two bits must be clear, otherwise a3 << 2 would lose bits, + // in which case we're no longer computing a reduction. + const uint64_t a3 = a3_unmasked & 0x3FFFFFFFFFFFFFFFull; + // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf. + uint64_t a3_shl1 = a3; + uint64_t a2_shl1 = a2; + uint64_t a3_shl2 = a3; + uint64_t a2_shl2 = a2; + Shift128Left<1>(&a3_shl1, &a2_shl1); + Shift128Left<2>(&a3_shl2, &a2_shl2); + *m1 = a1 ^ a3_shl1 ^ a3_shl2; + *m0 = a0 ^ a2_shl1 ^ a2_shl2; + } + + Lanes v0; + Lanes v1; + Lanes mul0; + Lanes mul1; +}; + +} // namespace HH_TARGET_NAME +} // namespace highwayhash + +#endif // HIGHWAYHASH_HH_PORTABLE_H_ diff --git a/contrib/libs/highwayhash/highwayhash/hh_sse41.cc b/contrib/libs/highwayhash/highwayhash/hh_sse41.cc index 9d6a0b968f..0bf13ab4f5 100644 --- a/contrib/libs/highwayhash/highwayhash/hh_sse41.cc +++ b/contrib/libs/highwayhash/highwayhash/hh_sse41.cc @@ -1,19 +1,19 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// WARNING: this is a "restricted" source file; avoid including any headers
-// unless they are also restricted. See arch_specific.h for details.
-
-#define HH_TARGET_NAME SSE41
-#include "highwayhash/highwayhash_target.cc"
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME SSE41
+#include "highwayhash/highwayhash_target.cc"
diff --git a/contrib/libs/highwayhash/highwayhash/hh_sse41.h b/contrib/libs/highwayhash/highwayhash/hh_sse41.h
index a2a86da9b6..c4d56697e2 100644
--- a/contrib/libs/highwayhash/highwayhash/hh_sse41.h
+++ b/contrib/libs/highwayhash/highwayhash/hh_sse41.h
@@ -1,330 +1,330 @@
-// Copyright 2015-2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_HH_SSE41_H_
-#define HIGHWAYHASH_HH_SSE41_H_
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-#include "highwayhash/hh_buffer.h"
-#include "highwayhash/hh_types.h"
-#include "highwayhash/load3.h"
-#include "highwayhash/vector128.h"
-
-// For auto-dependency generation, we need to include all headers but not their
-// contents (otherwise compilation fails because -msse4.1 is not specified).
-#ifndef HH_DISABLE_TARGET_SPECIFIC
-
-namespace highwayhash {
-// See vector128.h for why this namespace is necessary; matching it here makes
-// it easier to use the vector128 symbols, but requires textual inclusion.
-namespace HH_TARGET_NAME {
-
-// J-lanes tree hashing: see http://dx.doi.org/10.4236/jis.2014.53010
-// Uses pairs of SSE4.1 instructions to emulate the AVX-2 algorithm.
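[Editor's aside: a minimal sketch of the halved-register emulation named
above, with hypothetical names. Every 256-bit AVX-2 operation on the hash
state becomes a pair of 128-bit operations on L/H halves, e.g.:

    #include <emmintrin.h>  // SSE2 suffices for the add itself

    // Hypothetical: one 256-bit lane-wise add, emulated in two halves.
    inline void Add256(__m128i& aL, __m128i& aH,
                       const __m128i bL, const __m128i bH) {
      aL = _mm_add_epi64(aL, bL);  // lower 128 bits
      aH = _mm_add_epi64(aH, bH);  // upper 128 bits
    }

This is why the state below is stored as v0L/v0H, v1L/v1H and so on.]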
-class HHStateSSE41 { - public: - explicit HH_INLINE HHStateSSE41(const HHKey key) { Reset(key); } - - HH_INLINE void Reset(const HHKey key) { - // "Nothing up my sleeve numbers"; see HHStateTAVX2. - const V2x64U init0L(0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full); - const V2x64U init0H(0x243f6a8885a308d3ull, 0x13198a2e03707344ull); - const V2x64U init1L(0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull); - const V2x64U init1H(0x452821e638d01377ull, 0xbe5466cf34e90c6cull); - const V2x64U keyL = LoadUnaligned<V2x64U>(key + 0); - const V2x64U keyH = LoadUnaligned<V2x64U>(key + 2); - v0L = keyL ^ init0L; - v0H = keyH ^ init0H; - v1L = Rotate64By32(keyL) ^ init1L; - v1H = Rotate64By32(keyH) ^ init1H; - mul0L = init0L; - mul0H = init0H; - mul1L = init1L; - mul1H = init1H; - } - - HH_INLINE void Update(const HHPacket& packet_bytes) { - const uint64_t* HH_RESTRICT packet = - reinterpret_cast<const uint64_t * HH_RESTRICT>(packet_bytes); - const V2x64U packetL = LoadUnaligned<V2x64U>(packet + 0); - const V2x64U packetH = LoadUnaligned<V2x64U>(packet + 2); - Update(packetH, packetL); - } - - HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) { - // 'Length padding' differentiates zero-valued inputs that have the same - // size/32. mod32 is sufficient because each Update behaves as if a - // counter were injected, because the state is large and mixed thoroughly. - const V4x32U vsize_mod32(static_cast<uint32_t>(size_mod32)); - // Equivalent to storing size_mod32 in packet. - v0L += V2x64U(vsize_mod32); - v0H += V2x64U(vsize_mod32); - // Boosts the avalanche effect of mod32. - Rotate32By(&v1H, &v1L, size_mod32); - - const size_t size_mod4 = size_mod32 & 3; - const char* HH_RESTRICT remainder = bytes + (size_mod32 & ~3); - - if (HH_UNLIKELY(size_mod32 & 16)) { // 16..31 bytes left - const V2x64U packetL = - LoadUnaligned<V2x64U>(reinterpret_cast<const uint64_t*>(bytes)); - - V2x64U packetH = LoadMultipleOfFour(bytes + 16, size_mod32); - - const uint32_t last4 = - Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4); - - // The upper four bytes of packetH are zero, so insert there. - packetH = V2x64U(_mm_insert_epi32(packetH, last4, 3)); - Update(packetH, packetL); - } else { // size_mod32 < 16 - const V2x64U packetL = LoadMultipleOfFour(bytes, size_mod32); - - const uint64_t last4 = - Load3()(Load3::AllowUnordered(), remainder, size_mod4); - - // Rather than insert into packetL[3], it is faster to initialize - // the otherwise empty packetH. - const V2x64U packetH(_mm_cvtsi64_si128(last4)); - Update(packetH, packetL); - } - } - - HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) { - // Mix together all lanes. 
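- // Four permute+update rounds are used so that every input bit has a
- // chance to influence all lanes of the final sums.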
- PermuteAndUpdate(); - PermuteAndUpdate(); - PermuteAndUpdate(); - PermuteAndUpdate(); - - const V2x64U sum0 = v0L + mul0L; - const V2x64U sum1 = v1L + mul1L; - const V2x64U hash = sum0 + sum1; - _mm_storel_epi64(reinterpret_cast<__m128i*>(result), hash); - } - - HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) { - PermuteAndUpdate(); - PermuteAndUpdate(); - PermuteAndUpdate(); - PermuteAndUpdate(); - - const V2x64U sum0 = v0L + mul0L; - const V2x64U sum1 = v1H + mul1H; - const V2x64U hash = sum0 + sum1; - StoreUnaligned(hash, &(*result)[0]); - } - - HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) { - PermuteAndUpdate(); - PermuteAndUpdate(); - PermuteAndUpdate(); - PermuteAndUpdate(); - - const V2x64U sum0L = v0L + mul0L; - const V2x64U sum1L = v1L + mul1L; - const V2x64U sum0H = v0H + mul0H; - const V2x64U sum1H = v1H + mul1H; - const V2x64U hashL = ModularReduction(sum1L, sum0L); - const V2x64U hashH = ModularReduction(sum1H, sum0H); - StoreUnaligned(hashL, &(*result)[0]); - StoreUnaligned(hashH, &(*result)[2]); - } - - static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer_bytes) { - __m128i* buffer = reinterpret_cast<__m128i*>(buffer_bytes); - const __m128i zero = _mm_setzero_si128(); - _mm_store_si128(buffer + 0, zero); - _mm_store_si128(buffer + 1, zero); - } - - static HH_INLINE void CopyPartial(const char* HH_RESTRICT from, - const size_t size_mod32, - char* HH_RESTRICT buffer) { - for (size_t i = 0; i < size_mod32; ++i) { - buffer[i] = from[i]; - } - } - - static HH_INLINE void AppendPartial(const char* HH_RESTRICT from, - const size_t size_mod32, - char* HH_RESTRICT buffer, - const size_t buffer_valid) { - for (size_t i = 0; i < size_mod32; ++i) { - buffer[buffer_valid + i] = from[i]; - } - } - - HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from, - const size_t size_mod32, - const char* HH_RESTRICT buffer, - const size_t buffer_valid) { - HHPacket tmp HH_ALIGNAS(32); - for (size_t i = 0; i < buffer_valid; ++i) { - tmp[i] = buffer[i]; - } - for (size_t i = 0; i < size_mod32; ++i) { - tmp[buffer_valid + i] = from[i]; - } - Update(tmp); - } - - private: - // Swap 32-bit halves of each lane (caller swaps 128-bit halves) - static HH_INLINE V2x64U Rotate64By32(const V2x64U& v) { - return V2x64U(_mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1))); - } - - // Rotates 32-bit lanes by "count" bits. - static HH_INLINE void Rotate32By(V2x64U* HH_RESTRICT vH, - V2x64U* HH_RESTRICT vL, - const uint64_t count) { - // WARNING: the shift count is 64 bits, so we can't reuse vsize_mod32, - // which is broadcast into 32-bit lanes. - const __m128i count_left = _mm_cvtsi64_si128(count); - const __m128i count_right = _mm_cvtsi64_si128(32 - count); - const V2x64U shifted_leftL(_mm_sll_epi32(*vL, count_left)); - const V2x64U shifted_leftH(_mm_sll_epi32(*vH, count_left)); - const V2x64U shifted_rightL(_mm_srl_epi32(*vL, count_right)); - const V2x64U shifted_rightH(_mm_srl_epi32(*vH, count_right)); - *vL = shifted_leftL | shifted_rightL; - *vH = shifted_leftH | shifted_rightH; - } - - static HH_INLINE V2x64U ZipperMerge(const V2x64U& v) { - // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to - // varying degrees. In descending order of goodness, bytes - // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. - // As expected, the upper and lower bytes are much worse. 
- // For each 64-bit lane, our objectives are: - // 1) maximizing and equalizing total goodness across each lane's bytes; - // 2) mixing with bytes from the neighboring lane; - // 3) placing the worst bytes in the upper 32 bits because those will not - // be used in the next 32x32 multiplication. - const uint64_t hi = 0x070806090D0A040Bull; - const uint64_t lo = 0x000F010E05020C03ull; - return V2x64U(_mm_shuffle_epi8(v, V2x64U(hi, lo))); - } - - HH_INLINE void Update(const V2x64U& packetH, const V2x64U& packetL) { - v1L += packetL; - v1H += packetH; - v1L += mul0L; - v1H += mul0H; - mul0L ^= V2x64U(_mm_mul_epu32(v1L, Rotate64By32(v0L))); - mul0H ^= V2x64U(_mm_mul_epu32(v1H, v0H >> 32)); - v0L += mul1L; - v0H += mul1H; - mul1L ^= V2x64U(_mm_mul_epu32(v0L, Rotate64By32(v1L))); - mul1H ^= V2x64U(_mm_mul_epu32(v0H, v1H >> 32)); - v0L += ZipperMerge(v1L); - v0H += ZipperMerge(v1H); - v1L += ZipperMerge(v0L); - v1H += ZipperMerge(v0H); - } - - HH_INLINE void PermuteAndUpdate() { - // It is slightly better to permute v0 than v1; it will be added to v1. - // AVX-2 Permute also swaps 128-bit halves, so swap input operands. - Update(Rotate64By32(v0L), Rotate64By32(v0H)); - } - - // Returns zero-initialized vector with the lower "size" = 0, 4, 8 or 12 - // bytes loaded from "bytes". Serves as a replacement for AVX2 maskload_epi32. - static HH_INLINE V2x64U LoadMultipleOfFour(const char* bytes, - const size_t size) { - const uint32_t* words = reinterpret_cast<const uint32_t*>(bytes); - // Mask of 1-bits where the final 4 bytes should be inserted (replacement - // for variable shift/insert using broadcast+blend). - V2x64U mask4(_mm_cvtsi64_si128(0xFFFFFFFFULL)); // 'insert' into lane 0 - V2x64U ret(0); - if (size & 8) { - ret = V2x64U(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(words))); - // mask4 = 0 ~0 0 0 ('insert' into lane 2) - mask4 = V2x64U(_mm_slli_si128(mask4, 8)); - words += 2; - } - // Final 4 (possibly after the 8 above); 'insert' into lane 0 or 2 of ret. - if (size & 4) { - const __m128i word2 = _mm_cvtsi32_si128(words[0]); - // = 0 word2 0 word2; mask4 will select which lane to keep. - const V2x64U broadcast(_mm_shuffle_epi32(word2, 0x00)); - // (slightly faster than blendv_epi8) - ret |= V2x64U(broadcast & mask4); - } - return ret; - } - - // XORs x << 1 and x << 2 into *out after clearing the upper two bits of x. - // Bit shifts are only possible on independent 64-bit lanes. We therefore - // insert the upper bits of x[0] that were lost into x[1]. - // Thanks to D. Lemire for helpful comments! - static HH_INLINE void XorByShift128Left12(const V2x64U& x, - V2x64U* HH_RESTRICT out) { - const V2x64U zero(_mm_setzero_si128()); - const V2x64U sign_bit128(_mm_insert_epi32(zero, 0x80000000u, 3)); - const V2x64U top_bits2 = x >> (64 - 2); - HH_COMPILER_FENCE; - const V2x64U shifted1_unmasked = x + x; // (avoids needing port0) - - // Only the lower half of top_bits1 will be used, so we - // can compute it before clearing the upper two bits of x. - const V2x64U top_bits1 = x >> (64 - 1); - const V2x64U shifted2 = shifted1_unmasked + shifted1_unmasked; - HH_COMPILER_FENCE; - - const V2x64U new_low_bits2(_mm_slli_si128(top_bits2, 8)); - *out ^= shifted2; - // The result must be as if the upper two bits of the input had been clear, - // otherwise we're no longer computing a reduction. 
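- // (Identity being used: x^128 == x^2 + x modulo the polynomial, so the
- // upper 128 bits, shifted left by 1 and by 2 and XORed into the lower
- // 128 bits, complete the reduction; cf. the portable ModularReduction.)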
- const V2x64U shifted1 = AndNot(sign_bit128, shifted1_unmasked);
- HH_COMPILER_FENCE;
-
- const V2x64U new_low_bits1(_mm_slli_si128(top_bits1, 8));
- *out ^= new_low_bits2;
- *out ^= shifted1;
- *out ^= new_low_bits1;
- }
-
- // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
- // Input: a 256-bit number a3210.
- static HH_INLINE V2x64U ModularReduction(const V2x64U& a32_unmasked,
- const V2x64U& a10) {
- // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
- V2x64U out = a10;
- XorByShift128Left12(a32_unmasked, &out);
- return out;
- }
-
- V2x64U v0L;
- V2x64U v0H;
- V2x64U v1L;
- V2x64U v1H;
- V2x64U mul0L;
- V2x64U mul0H;
- V2x64U mul1L;
- V2x64U mul1H;
-};
-
-} // namespace HH_TARGET_NAME
-} // namespace highwayhash
-
-#endif // HH_DISABLE_TARGET_SPECIFIC
-#endif // HIGHWAYHASH_HH_SSE41_H_
+// Copyright 2015-2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_SSE41_H_
+#define HIGHWAYHASH_HH_SSE41_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_buffer.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/load3.h"
+#include "highwayhash/vector128.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents (otherwise compilation fails because -msse4.1 is not specified).
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+namespace highwayhash {
+// See vector128.h for why this namespace is necessary; matching it here makes
+// it easier to use the vector128 symbols, but requires textual inclusion.
+namespace HH_TARGET_NAME {
+
+// J-lanes tree hashing: see http://dx.doi.org/10.4236/jis.2014.53010
+// Uses pairs of SSE4.1 instructions to emulate the AVX-2 algorithm.
+class HHStateSSE41 {
+ public:
+ explicit HH_INLINE HHStateSSE41(const HHKey key) { Reset(key); }
+
+ HH_INLINE void Reset(const HHKey key) {
+ // "Nothing up my sleeve numbers"; see HHStateTAVX2.
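+ // (Several of these constants are recognizable as hexadecimal digits
+ // of pi, e.g. 0x243f6a8885a308d3.)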
+ const V2x64U init0L(0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full); + const V2x64U init0H(0x243f6a8885a308d3ull, 0x13198a2e03707344ull); + const V2x64U init1L(0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull); + const V2x64U init1H(0x452821e638d01377ull, 0xbe5466cf34e90c6cull); + const V2x64U keyL = LoadUnaligned<V2x64U>(key + 0); + const V2x64U keyH = LoadUnaligned<V2x64U>(key + 2); + v0L = keyL ^ init0L; + v0H = keyH ^ init0H; + v1L = Rotate64By32(keyL) ^ init1L; + v1H = Rotate64By32(keyH) ^ init1H; + mul0L = init0L; + mul0H = init0H; + mul1L = init1L; + mul1H = init1H; + } + + HH_INLINE void Update(const HHPacket& packet_bytes) { + const uint64_t* HH_RESTRICT packet = + reinterpret_cast<const uint64_t * HH_RESTRICT>(packet_bytes); + const V2x64U packetL = LoadUnaligned<V2x64U>(packet + 0); + const V2x64U packetH = LoadUnaligned<V2x64U>(packet + 2); + Update(packetH, packetL); + } + + HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) { + // 'Length padding' differentiates zero-valued inputs that have the same + // size/32. mod32 is sufficient because each Update behaves as if a + // counter were injected, because the state is large and mixed thoroughly. + const V4x32U vsize_mod32(static_cast<uint32_t>(size_mod32)); + // Equivalent to storing size_mod32 in packet. + v0L += V2x64U(vsize_mod32); + v0H += V2x64U(vsize_mod32); + // Boosts the avalanche effect of mod32. + Rotate32By(&v1H, &v1L, size_mod32); + + const size_t size_mod4 = size_mod32 & 3; + const char* HH_RESTRICT remainder = bytes + (size_mod32 & ~3); + + if (HH_UNLIKELY(size_mod32 & 16)) { // 16..31 bytes left + const V2x64U packetL = + LoadUnaligned<V2x64U>(reinterpret_cast<const uint64_t*>(bytes)); + + V2x64U packetH = LoadMultipleOfFour(bytes + 16, size_mod32); + + const uint32_t last4 = + Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4); + + // The upper four bytes of packetH are zero, so insert there. + packetH = V2x64U(_mm_insert_epi32(packetH, last4, 3)); + Update(packetH, packetL); + } else { // size_mod32 < 16 + const V2x64U packetL = LoadMultipleOfFour(bytes, size_mod32); + + const uint64_t last4 = + Load3()(Load3::AllowUnordered(), remainder, size_mod4); + + // Rather than insert into packetL[3], it is faster to initialize + // the otherwise empty packetH. + const V2x64U packetH(_mm_cvtsi64_si128(last4)); + Update(packetH, packetL); + } + } + + HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) { + // Mix together all lanes. 
+ PermuteAndUpdate(); + PermuteAndUpdate(); + PermuteAndUpdate(); + PermuteAndUpdate(); + + const V2x64U sum0 = v0L + mul0L; + const V2x64U sum1 = v1L + mul1L; + const V2x64U hash = sum0 + sum1; + _mm_storel_epi64(reinterpret_cast<__m128i*>(result), hash); + } + + HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) { + PermuteAndUpdate(); + PermuteAndUpdate(); + PermuteAndUpdate(); + PermuteAndUpdate(); + + const V2x64U sum0 = v0L + mul0L; + const V2x64U sum1 = v1H + mul1H; + const V2x64U hash = sum0 + sum1; + StoreUnaligned(hash, &(*result)[0]); + } + + HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) { + PermuteAndUpdate(); + PermuteAndUpdate(); + PermuteAndUpdate(); + PermuteAndUpdate(); + + const V2x64U sum0L = v0L + mul0L; + const V2x64U sum1L = v1L + mul1L; + const V2x64U sum0H = v0H + mul0H; + const V2x64U sum1H = v1H + mul1H; + const V2x64U hashL = ModularReduction(sum1L, sum0L); + const V2x64U hashH = ModularReduction(sum1H, sum0H); + StoreUnaligned(hashL, &(*result)[0]); + StoreUnaligned(hashH, &(*result)[2]); + } + + static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer_bytes) { + __m128i* buffer = reinterpret_cast<__m128i*>(buffer_bytes); + const __m128i zero = _mm_setzero_si128(); + _mm_store_si128(buffer + 0, zero); + _mm_store_si128(buffer + 1, zero); + } + + static HH_INLINE void CopyPartial(const char* HH_RESTRICT from, + const size_t size_mod32, + char* HH_RESTRICT buffer) { + for (size_t i = 0; i < size_mod32; ++i) { + buffer[i] = from[i]; + } + } + + static HH_INLINE void AppendPartial(const char* HH_RESTRICT from, + const size_t size_mod32, + char* HH_RESTRICT buffer, + const size_t buffer_valid) { + for (size_t i = 0; i < size_mod32; ++i) { + buffer[buffer_valid + i] = from[i]; + } + } + + HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from, + const size_t size_mod32, + const char* HH_RESTRICT buffer, + const size_t buffer_valid) { + HHPacket tmp HH_ALIGNAS(32); + for (size_t i = 0; i < buffer_valid; ++i) { + tmp[i] = buffer[i]; + } + for (size_t i = 0; i < size_mod32; ++i) { + tmp[buffer_valid + i] = from[i]; + } + Update(tmp); + } + + private: + // Swap 32-bit halves of each lane (caller swaps 128-bit halves) + static HH_INLINE V2x64U Rotate64By32(const V2x64U& v) { + return V2x64U(_mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1))); + } + + // Rotates 32-bit lanes by "count" bits. + static HH_INLINE void Rotate32By(V2x64U* HH_RESTRICT vH, + V2x64U* HH_RESTRICT vL, + const uint64_t count) { + // WARNING: the shift count is 64 bits, so we can't reuse vsize_mod32, + // which is broadcast into 32-bit lanes. + const __m128i count_left = _mm_cvtsi64_si128(count); + const __m128i count_right = _mm_cvtsi64_si128(32 - count); + const V2x64U shifted_leftL(_mm_sll_epi32(*vL, count_left)); + const V2x64U shifted_leftH(_mm_sll_epi32(*vH, count_left)); + const V2x64U shifted_rightL(_mm_srl_epi32(*vL, count_right)); + const V2x64U shifted_rightH(_mm_srl_epi32(*vH, count_right)); + *vL = shifted_leftL | shifted_rightL; + *vH = shifted_leftH | shifted_rightH; + } + + static HH_INLINE V2x64U ZipperMerge(const V2x64U& v) { + // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + // varying degrees. In descending order of goodness, bytes + // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + // As expected, the upper and lower bytes are much worse. 
+ // For each 64-bit lane, our objectives are: + // 1) maximizing and equalizing total goodness across each lane's bytes; + // 2) mixing with bytes from the neighboring lane; + // 3) placing the worst bytes in the upper 32 bits because those will not + // be used in the next 32x32 multiplication. + const uint64_t hi = 0x070806090D0A040Bull; + const uint64_t lo = 0x000F010E05020C03ull; + return V2x64U(_mm_shuffle_epi8(v, V2x64U(hi, lo))); + } + + HH_INLINE void Update(const V2x64U& packetH, const V2x64U& packetL) { + v1L += packetL; + v1H += packetH; + v1L += mul0L; + v1H += mul0H; + mul0L ^= V2x64U(_mm_mul_epu32(v1L, Rotate64By32(v0L))); + mul0H ^= V2x64U(_mm_mul_epu32(v1H, v0H >> 32)); + v0L += mul1L; + v0H += mul1H; + mul1L ^= V2x64U(_mm_mul_epu32(v0L, Rotate64By32(v1L))); + mul1H ^= V2x64U(_mm_mul_epu32(v0H, v1H >> 32)); + v0L += ZipperMerge(v1L); + v0H += ZipperMerge(v1H); + v1L += ZipperMerge(v0L); + v1H += ZipperMerge(v0H); + } + + HH_INLINE void PermuteAndUpdate() { + // It is slightly better to permute v0 than v1; it will be added to v1. + // AVX-2 Permute also swaps 128-bit halves, so swap input operands. + Update(Rotate64By32(v0L), Rotate64By32(v0H)); + } + + // Returns zero-initialized vector with the lower "size" = 0, 4, 8 or 12 + // bytes loaded from "bytes". Serves as a replacement for AVX2 maskload_epi32. + static HH_INLINE V2x64U LoadMultipleOfFour(const char* bytes, + const size_t size) { + const uint32_t* words = reinterpret_cast<const uint32_t*>(bytes); + // Mask of 1-bits where the final 4 bytes should be inserted (replacement + // for variable shift/insert using broadcast+blend). + V2x64U mask4(_mm_cvtsi64_si128(0xFFFFFFFFULL)); // 'insert' into lane 0 + V2x64U ret(0); + if (size & 8) { + ret = V2x64U(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(words))); + // mask4 = 0 ~0 0 0 ('insert' into lane 2) + mask4 = V2x64U(_mm_slli_si128(mask4, 8)); + words += 2; + } + // Final 4 (possibly after the 8 above); 'insert' into lane 0 or 2 of ret. + if (size & 4) { + const __m128i word2 = _mm_cvtsi32_si128(words[0]); + // = 0 word2 0 word2; mask4 will select which lane to keep. + const V2x64U broadcast(_mm_shuffle_epi32(word2, 0x00)); + // (slightly faster than blendv_epi8) + ret |= V2x64U(broadcast & mask4); + } + return ret; + } + + // XORs x << 1 and x << 2 into *out after clearing the upper two bits of x. + // Bit shifts are only possible on independent 64-bit lanes. We therefore + // insert the upper bits of x[0] that were lost into x[1]. + // Thanks to D. Lemire for helpful comments! + static HH_INLINE void XorByShift128Left12(const V2x64U& x, + V2x64U* HH_RESTRICT out) { + const V2x64U zero(_mm_setzero_si128()); + const V2x64U sign_bit128(_mm_insert_epi32(zero, 0x80000000u, 3)); + const V2x64U top_bits2 = x >> (64 - 2); + HH_COMPILER_FENCE; + const V2x64U shifted1_unmasked = x + x; // (avoids needing port0) + + // Only the lower half of top_bits1 will be used, so we + // can compute it before clearing the upper two bits of x. + const V2x64U top_bits1 = x >> (64 - 1); + const V2x64U shifted2 = shifted1_unmasked + shifted1_unmasked; + HH_COMPILER_FENCE; + + const V2x64U new_low_bits2(_mm_slli_si128(top_bits2, 8)); + *out ^= shifted2; + // The result must be as if the upper two bits of the input had been clear, + // otherwise we're no longer computing a reduction. 
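+ // AndNot clears bit 127 of x << 1, the bit that the masked-off bit 126
+ // of x would otherwise contribute; x << 2 already drops both upper bits.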
+ const V2x64U shifted1 = AndNot(sign_bit128, shifted1_unmasked); + HH_COMPILER_FENCE; + + const V2x64U new_low_bits1(_mm_slli_si128(top_bits1, 8)); + *out ^= new_low_bits2; + *out ^= shifted1; + *out ^= new_low_bits1; + } + + // Modular reduction by the irreducible polynomial (x^128 + x^2 + x). + // Input: a 256-bit number a3210. + static HH_INLINE V2x64U ModularReduction(const V2x64U& a32_unmasked, + const V2x64U& a10) { + // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf. + V2x64U out = a10; + XorByShift128Left12(a32_unmasked, &out); + return out; + } + + V2x64U v0L; + V2x64U v0H; + V2x64U v1L; + V2x64U v1H; + V2x64U mul0L; + V2x64U mul0H; + V2x64U mul1L; + V2x64U mul1H; +}; + +} // namespace HH_TARGET_NAME +} // namespace highwayhash + +#endif // HH_DISABLE_TARGET_SPECIFIC +#endif // HIGHWAYHASH_HH_SSE41_H_ diff --git a/contrib/libs/highwayhash/highwayhash/hh_types.h b/contrib/libs/highwayhash/highwayhash/hh_types.h index f350d70f65..e5b0430f17 100644 --- a/contrib/libs/highwayhash/highwayhash/hh_types.h +++ b/contrib/libs/highwayhash/highwayhash/hh_types.h @@ -1,50 +1,50 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_HH_TYPES_H_ -#define HIGHWAYHASH_HH_TYPES_H_ - -// WARNING: included from c_bindings => must be C-compatible. -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include <stddef.h> // size_t -#include <stdint.h> - -#ifdef __cplusplus -namespace highwayhash { -#endif - -// 256-bit secret key that should remain unknown to attackers. -// We recommend initializing it to a random value. -typedef uint64_t HHKey[4]; - -// How much input is hashed by one call to HHStateT::Update. -typedef char HHPacket[32]; - -// Hash 'return' types. -typedef uint64_t HHResult64; // returned directly -typedef uint64_t HHResult128[2]; -typedef uint64_t HHResult256[4]; - -// Called if a test fails, indicating which target and size. -typedef void (*HHNotify)(const char*, size_t); - -#ifdef __cplusplus -} // namespace highwayhash -#endif - -#endif // HIGHWAYHASH_HH_TYPES_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
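+
+// Declaration sketch (names illustrative):
+//   HHKey key = {1, 2, 3, 4};  // 256-bit secret key, four uint64_t words
+//   HHResult64 h64;            // single 64-bit hash value
+//   HHResult256 h256;          // four 64-bit words, filled via out-parameter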
+ +#ifndef HIGHWAYHASH_HH_TYPES_H_ +#define HIGHWAYHASH_HH_TYPES_H_ + +// WARNING: included from c_bindings => must be C-compatible. +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include <stddef.h> // size_t +#include <stdint.h> + +#ifdef __cplusplus +namespace highwayhash { +#endif + +// 256-bit secret key that should remain unknown to attackers. +// We recommend initializing it to a random value. +typedef uint64_t HHKey[4]; + +// How much input is hashed by one call to HHStateT::Update. +typedef char HHPacket[32]; + +// Hash 'return' types. +typedef uint64_t HHResult64; // returned directly +typedef uint64_t HHResult128[2]; +typedef uint64_t HHResult256[4]; + +// Called if a test fails, indicating which target and size. +typedef void (*HHNotify)(const char*, size_t); + +#ifdef __cplusplus +} // namespace highwayhash +#endif + +#endif // HIGHWAYHASH_HH_TYPES_H_ diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash.h b/contrib/libs/highwayhash/highwayhash/highwayhash.h index cee1c31ba4..0cebc841fe 100644 --- a/contrib/libs/highwayhash/highwayhash/highwayhash.h +++ b/contrib/libs/highwayhash/highwayhash/highwayhash.h @@ -1,202 +1,202 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_HIGHWAYHASH_H_ -#define HIGHWAYHASH_HIGHWAYHASH_H_ - -// This header's templates are useful for inlining into other CPU-specific code: -// template<TargetBits Target> CodeUsingHash() { HighwayHashT<Target>(...); }, -// and can also be instantiated with HH_TARGET when callers don't care about the -// exact implementation. Otherwise, they are implementation details of the -// highwayhash_target wrapper. Use that instead if you need to detect the best -// available implementation at runtime. - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" -#include "highwayhash/hh_types.h" -#include "highwayhash/iaca.h" - -// Include exactly one (see arch_specific.h) header, which defines a state -// object in a target-specific namespace, e.g. AVX2::HHStateAVX2. -// Attempts to use "computed includes" (#define MACRO "path/or_just_filename", -// #include MACRO) fail with 'file not found', so we need an #if chain. 
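-// That is, the following sketch fails to compile portably:
-//   #define HH_TARGET_HEADER "highwayhash/hh_avx2.h"
-//   #include HH_TARGET_HEADER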
-#if HH_TARGET == HH_TARGET_AVX2 -#include "highwayhash/hh_avx2.h" -#elif HH_TARGET == HH_TARGET_SSE41 -#include "highwayhash/hh_sse41.h" -#elif HH_TARGET == HH_TARGET_Portable -#include "highwayhash/hh_portable.h" -#else -#error "Unknown target, add its hh_*.h include here." -#endif - -#ifndef HH_DISABLE_TARGET_SPECIFIC -namespace highwayhash { - -// Alias templates (HHStateT) cannot be specialized, so we need a helper struct. -// Note that hh_*.h don't just specialize HHStateT directly because vector128.h -// must reside in a distinct namespace (to allow including it from multiple -// translation units), and it is easier if its users, i.e. the concrete HHState, -// also reside in that same namespace, which precludes specialization. -template <TargetBits Target> -struct HHStateForTarget {}; - -template <> -struct HHStateForTarget<HH_TARGET> { - // (The namespace is sufficient and the additional HH_TARGET_NAME suffix is - // technically redundant, but it makes searching easier.) - using type = HH_TARGET_NAME::HH_ADD_TARGET_SUFFIX(HHState); -}; - -// Typically used as HHStateT<HH_TARGET>. It would be easier to just have a -// concrete type HH_STATE, but this alias template is required by the -// templates in highwayhash_target.cc. -template <TargetBits Target> -using HHStateT = typename HHStateForTarget<Target>::type; - -// Computes HighwayHash of "bytes" using the implementation chosen by "State". -// -// "state" is a HHStateT<> initialized with a key. -// "bytes" is the data to hash (possibly unaligned). -// "size" is the number of bytes to hash; we do not read any additional bytes. -// "hash" is a HHResult* (either 64, 128 or 256 bits). -// -// HighwayHash is a strong pseudorandom function with security claims -// [https://arxiv.org/abs/1612.06257]. It is intended as a safer general-purpose -// hash, about 4x faster than SipHash and 10x faster than BLAKE2. -// -// This template allows callers (e.g. tests) to invoke a specific -// implementation. It must be compiled with the flags required by the desired -// implementation. If the entire program cannot be built with these flags, use -// the wrapper in highwayhash_target.h instead. -// -// Callers wanting to hash multiple pieces of data should duplicate this -// function, calling HHStateT::Update for each input and only Finalizing once. -template <class State, typename Result> -HH_INLINE void HighwayHashT(State* HH_RESTRICT state, - const char* HH_RESTRICT bytes, const size_t size, - Result* HH_RESTRICT hash) { - // BeginIACA(); - const size_t remainder = size & (sizeof(HHPacket) - 1); - const size_t truncated = size & ~(sizeof(HHPacket) - 1); - for (size_t offset = 0; offset < truncated; offset += sizeof(HHPacket)) { - state->Update(*reinterpret_cast<const HHPacket*>(bytes + offset)); - } - - if (remainder != 0) { - state->UpdateRemainder(bytes + truncated, remainder); - } - - state->Finalize(hash); - // EndIACA(); -} - -// Wrapper class for incrementally hashing a series of data ranges. The final -// result is the same as HighwayHashT of the concatenation of all the ranges. -// This is useful for computing the hash of cords, iovecs, and similar -// data structures. -template <TargetBits Target> -class HighwayHashCatT { - public: - HH_INLINE HighwayHashCatT(const HHKey& key) : state_(key) { - // Avoids msan uninitialized-memory warnings. - HHStateT<Target>::ZeroInitialize(buffer_); - } - - // Resets the state of the hasher so it can be used to hash a new string. 
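- // e.g. (sketch; fragment pointers and sizes are the caller's):
- //   HighwayHashCatT<HH_TARGET> cat(key);
- //   cat.Append(part1, size1);
- //   cat.Append(part2, size2);
- //   HHResult64 hash;
- //   cat.Finalize(&hash);
- //   cat.Reset(key);  // ready to hash another string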
- HH_INLINE void Reset(const HHKey& key) { - state_.Reset(key); - buffer_usage_ = 0; - } - - // Adds "bytes" to the internal buffer, feeding it to HHStateT::Update as - // required. Call this as often as desired. Only reads bytes within the - // interval [bytes, bytes + num_bytes). "num_bytes" == 0 has no effect. - // There are no alignment requirements. - HH_INLINE void Append(const char* HH_RESTRICT bytes, size_t num_bytes) { - // BeginIACA(); - const size_t capacity = sizeof(HHPacket) - buffer_usage_; - // New bytes fit within buffer, but still not enough to Update. - if (HH_UNLIKELY(num_bytes < capacity)) { - HHStateT<Target>::AppendPartial(bytes, num_bytes, buffer_, buffer_usage_); - buffer_usage_ += num_bytes; - return; - } - - // HACK: ensures the state is kept in SIMD registers; otherwise, Update - // constantly load/stores its operands, which is much slower. - // Restrict-qualified pointers to external state or the state_ member are - // not sufficient for keeping this in registers. - HHStateT<Target> state_copy = state_; - - // Have prior bytes to flush. - const size_t buffer_usage = buffer_usage_; - if (HH_LIKELY(buffer_usage != 0)) { - // Calls update with prior buffer contents plus new data. Does not modify - // the buffer because some implementations can load into SIMD registers - // and Append to them directly. - state_copy.AppendAndUpdate(bytes, capacity, buffer_, buffer_usage); - bytes += capacity; - num_bytes -= capacity; - } - - // Buffer currently empty => Update directly from the source. - while (num_bytes >= sizeof(HHPacket)) { - state_copy.Update(*reinterpret_cast<const HHPacket*>(bytes)); - bytes += sizeof(HHPacket); - num_bytes -= sizeof(HHPacket); - } - - // Unconditionally assign even if zero because we didn't reset to zero - // after the AppendAndUpdate above. - buffer_usage_ = num_bytes; - - state_ = state_copy; - - // Store any remainders in buffer, no-op if multiple of a packet. - if (HH_LIKELY(num_bytes != 0)) { - HHStateT<Target>::CopyPartial(bytes, num_bytes, buffer_); - } - // EndIACA(); - } - - // Stores the resulting 64, 128 or 256-bit hash of all data passed to Append. - // Must be called exactly once, or after a prior Reset. - template <typename Result> // HHResult* - HH_INLINE void Finalize(Result* HH_RESTRICT hash) { - // BeginIACA(); - HHStateT<Target> state_copy = state_; - const size_t buffer_usage = buffer_usage_; - if (HH_LIKELY(buffer_usage != 0)) { - state_copy.UpdateRemainder(buffer_, buffer_usage); - } - state_copy.Finalize(hash); - // EndIACA(); - } - - private: - HHPacket buffer_ HH_ALIGNAS(64); - HHStateT<Target> state_ HH_ALIGNAS(32); - // How many bytes in buffer_ (starting with offset 0) are valid. - size_t buffer_usage_ = 0; -}; - -} // namespace highwayhash -#endif // HH_DISABLE_TARGET_SPECIFIC -#endif // HIGHWAYHASH_HIGHWAYHASH_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef HIGHWAYHASH_HIGHWAYHASH_H_ +#define HIGHWAYHASH_HIGHWAYHASH_H_ + +// This header's templates are useful for inlining into other CPU-specific code: +// template<TargetBits Target> CodeUsingHash() { HighwayHashT<Target>(...); }, +// and can also be instantiated with HH_TARGET when callers don't care about the +// exact implementation. Otherwise, they are implementation details of the +// highwayhash_target wrapper. Use that instead if you need to detect the best +// available implementation at runtime. + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" +#include "highwayhash/hh_types.h" +#include "highwayhash/iaca.h" + +// Include exactly one (see arch_specific.h) header, which defines a state +// object in a target-specific namespace, e.g. AVX2::HHStateAVX2. +// Attempts to use "computed includes" (#define MACRO "path/or_just_filename", +// #include MACRO) fail with 'file not found', so we need an #if chain. +#if HH_TARGET == HH_TARGET_AVX2 +#include "highwayhash/hh_avx2.h" +#elif HH_TARGET == HH_TARGET_SSE41 +#include "highwayhash/hh_sse41.h" +#elif HH_TARGET == HH_TARGET_Portable +#include "highwayhash/hh_portable.h" +#else +#error "Unknown target, add its hh_*.h include here." +#endif + +#ifndef HH_DISABLE_TARGET_SPECIFIC +namespace highwayhash { + +// Alias templates (HHStateT) cannot be specialized, so we need a helper struct. +// Note that hh_*.h don't just specialize HHStateT directly because vector128.h +// must reside in a distinct namespace (to allow including it from multiple +// translation units), and it is easier if its users, i.e. the concrete HHState, +// also reside in that same namespace, which precludes specialization. +template <TargetBits Target> +struct HHStateForTarget {}; + +template <> +struct HHStateForTarget<HH_TARGET> { + // (The namespace is sufficient and the additional HH_TARGET_NAME suffix is + // technically redundant, but it makes searching easier.) + using type = HH_TARGET_NAME::HH_ADD_TARGET_SUFFIX(HHState); +}; + +// Typically used as HHStateT<HH_TARGET>. It would be easier to just have a +// concrete type HH_STATE, but this alias template is required by the +// templates in highwayhash_target.cc. +template <TargetBits Target> +using HHStateT = typename HHStateForTarget<Target>::type; + +// Computes HighwayHash of "bytes" using the implementation chosen by "State". +// +// "state" is a HHStateT<> initialized with a key. +// "bytes" is the data to hash (possibly unaligned). +// "size" is the number of bytes to hash; we do not read any additional bytes. +// "hash" is a HHResult* (either 64, 128 or 256 bits). +// +// HighwayHash is a strong pseudorandom function with security claims +// [https://arxiv.org/abs/1612.06257]. It is intended as a safer general-purpose +// hash, about 4x faster than SipHash and 10x faster than BLAKE2. +// +// This template allows callers (e.g. tests) to invoke a specific +// implementation. It must be compiled with the flags required by the desired +// implementation. If the entire program cannot be built with these flags, use +// the wrapper in highwayhash_target.h instead. 
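+//
+// Minimal single-shot sketch ("data" and "size" are the caller's input):
+//   const HHKey key = {1, 2, 3, 4};
+//   HHStateT<HH_TARGET> state(key);
+//   HHResult64 hash;
+//   HighwayHashT(&state, data, size, &hash);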
+// +// Callers wanting to hash multiple pieces of data should duplicate this +// function, calling HHStateT::Update for each input and only Finalizing once. +template <class State, typename Result> +HH_INLINE void HighwayHashT(State* HH_RESTRICT state, + const char* HH_RESTRICT bytes, const size_t size, + Result* HH_RESTRICT hash) { + // BeginIACA(); + const size_t remainder = size & (sizeof(HHPacket) - 1); + const size_t truncated = size & ~(sizeof(HHPacket) - 1); + for (size_t offset = 0; offset < truncated; offset += sizeof(HHPacket)) { + state->Update(*reinterpret_cast<const HHPacket*>(bytes + offset)); + } + + if (remainder != 0) { + state->UpdateRemainder(bytes + truncated, remainder); + } + + state->Finalize(hash); + // EndIACA(); +} + +// Wrapper class for incrementally hashing a series of data ranges. The final +// result is the same as HighwayHashT of the concatenation of all the ranges. +// This is useful for computing the hash of cords, iovecs, and similar +// data structures. +template <TargetBits Target> +class HighwayHashCatT { + public: + HH_INLINE HighwayHashCatT(const HHKey& key) : state_(key) { + // Avoids msan uninitialized-memory warnings. + HHStateT<Target>::ZeroInitialize(buffer_); + } + + // Resets the state of the hasher so it can be used to hash a new string. + HH_INLINE void Reset(const HHKey& key) { + state_.Reset(key); + buffer_usage_ = 0; + } + + // Adds "bytes" to the internal buffer, feeding it to HHStateT::Update as + // required. Call this as often as desired. Only reads bytes within the + // interval [bytes, bytes + num_bytes). "num_bytes" == 0 has no effect. + // There are no alignment requirements. + HH_INLINE void Append(const char* HH_RESTRICT bytes, size_t num_bytes) { + // BeginIACA(); + const size_t capacity = sizeof(HHPacket) - buffer_usage_; + // New bytes fit within buffer, but still not enough to Update. + if (HH_UNLIKELY(num_bytes < capacity)) { + HHStateT<Target>::AppendPartial(bytes, num_bytes, buffer_, buffer_usage_); + buffer_usage_ += num_bytes; + return; + } + + // HACK: ensures the state is kept in SIMD registers; otherwise, Update + // constantly load/stores its operands, which is much slower. + // Restrict-qualified pointers to external state or the state_ member are + // not sufficient for keeping this in registers. + HHStateT<Target> state_copy = state_; + + // Have prior bytes to flush. + const size_t buffer_usage = buffer_usage_; + if (HH_LIKELY(buffer_usage != 0)) { + // Calls update with prior buffer contents plus new data. Does not modify + // the buffer because some implementations can load into SIMD registers + // and Append to them directly. + state_copy.AppendAndUpdate(bytes, capacity, buffer_, buffer_usage); + bytes += capacity; + num_bytes -= capacity; + } + + // Buffer currently empty => Update directly from the source. + while (num_bytes >= sizeof(HHPacket)) { + state_copy.Update(*reinterpret_cast<const HHPacket*>(bytes)); + bytes += sizeof(HHPacket); + num_bytes -= sizeof(HHPacket); + } + + // Unconditionally assign even if zero because we didn't reset to zero + // after the AppendAndUpdate above. + buffer_usage_ = num_bytes; + + state_ = state_copy; + + // Store any remainders in buffer, no-op if multiple of a packet. + if (HH_LIKELY(num_bytes != 0)) { + HHStateT<Target>::CopyPartial(bytes, num_bytes, buffer_); + } + // EndIACA(); + } + + // Stores the resulting 64, 128 or 256-bit hash of all data passed to Append. + // Must be called exactly once, or after a prior Reset. 
+ template <typename Result> // HHResult* + HH_INLINE void Finalize(Result* HH_RESTRICT hash) { + // BeginIACA(); + HHStateT<Target> state_copy = state_; + const size_t buffer_usage = buffer_usage_; + if (HH_LIKELY(buffer_usage != 0)) { + state_copy.UpdateRemainder(buffer_, buffer_usage); + } + state_copy.Finalize(hash); + // EndIACA(); + } + + private: + HHPacket buffer_ HH_ALIGNAS(64); + HHStateT<Target> state_ HH_ALIGNAS(32); + // How many bytes in buffer_ (starting with offset 0) are valid. + size_t buffer_usage_ = 0; +}; + +} // namespace highwayhash +#endif // HH_DISABLE_TARGET_SPECIFIC +#endif // HIGHWAYHASH_HIGHWAYHASH_H_ diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_target.cc b/contrib/libs/highwayhash/highwayhash/highwayhash_target.cc index 74022f64bf..f7dc4a0d54 100644 --- a/contrib/libs/highwayhash/highwayhash/highwayhash_target.cc +++ b/contrib/libs/highwayhash/highwayhash/highwayhash_target.cc @@ -1,104 +1,104 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// WARNING: this is a "restricted" source file; avoid including any headers -// unless they are also restricted. See arch_specific.h for details. - -#include "highwayhash/highwayhash_target.h" - -#include "highwayhash/highwayhash.h" - -#ifndef HH_DISABLE_TARGET_SPECIFIC -namespace highwayhash { - -extern "C" { -uint64_t HH_ADD_TARGET_SUFFIX(HighwayHash64_)(const HHKey key, - const char* bytes, - const uint64_t size) { - HHStateT<HH_TARGET> state(key); - HHResult64 result; - HighwayHashT(&state, bytes, size, &result); - return result; -} -} // extern "C" - -template <TargetBits Target> -void HighwayHash<Target>::operator()(const HHKey& key, - const char* HH_RESTRICT bytes, - const size_t size, - HHResult64* HH_RESTRICT hash) const { - HHStateT<Target> state(key); - HighwayHashT(&state, bytes, size, hash); -} - -template <TargetBits Target> -void HighwayHash<Target>::operator()(const HHKey& key, - const char* HH_RESTRICT bytes, - const size_t size, - HHResult128* HH_RESTRICT hash) const { - HHStateT<Target> state(key); - HighwayHashT(&state, bytes, size, hash); -} - -template <TargetBits Target> -void HighwayHash<Target>::operator()(const HHKey& key, - const char* HH_RESTRICT bytes, - const size_t size, - HHResult256* HH_RESTRICT hash) const { - HHStateT<Target> state(key); - HighwayHashT(&state, bytes, size, hash); -} - -template <TargetBits Target> -void HighwayHashCat<Target>::operator()(const HHKey& key, - const StringView* HH_RESTRICT fragments, - const size_t num_fragments, - HHResult64* HH_RESTRICT hash) const { - HighwayHashCatT<Target> cat(key); - for (size_t i = 0; i < num_fragments; ++i) { - cat.Append(fragments[i].data, fragments[i].num_bytes); - } - cat.Finalize(hash); -} - -template <TargetBits Target> -void HighwayHashCat<Target>::operator()(const HHKey& key, - const StringView* HH_RESTRICT fragments, - const size_t num_fragments, - HHResult128* HH_RESTRICT hash) const { - HighwayHashCatT<Target> cat(key); - for (size_t i = 0; i 
< num_fragments; ++i) { - cat.Append(fragments[i].data, fragments[i].num_bytes); - } - cat.Finalize(hash); -} - -template <TargetBits Target> -void HighwayHashCat<Target>::operator()(const HHKey& key, - const StringView* HH_RESTRICT fragments, - const size_t num_fragments, - HHResult256* HH_RESTRICT hash) const { - HighwayHashCatT<Target> cat(key); - for (size_t i = 0; i < num_fragments; ++i) { - cat.Append(fragments[i].data, fragments[i].num_bytes); - } - cat.Finalize(hash); -} - -// Instantiate for the current target. -template struct HighwayHash<HH_TARGET>; -template struct HighwayHashCat<HH_TARGET>; - -} // namespace highwayhash -#endif // HH_DISABLE_TARGET_SPECIFIC +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#include "highwayhash/highwayhash_target.h" + +#include "highwayhash/highwayhash.h" + +#ifndef HH_DISABLE_TARGET_SPECIFIC +namespace highwayhash { + +extern "C" { +uint64_t HH_ADD_TARGET_SUFFIX(HighwayHash64_)(const HHKey key, + const char* bytes, + const uint64_t size) { + HHStateT<HH_TARGET> state(key); + HHResult64 result; + HighwayHashT(&state, bytes, size, &result); + return result; +} +} // extern "C" + +template <TargetBits Target> +void HighwayHash<Target>::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const size_t size, + HHResult64* HH_RESTRICT hash) const { + HHStateT<Target> state(key); + HighwayHashT(&state, bytes, size, hash); +} + +template <TargetBits Target> +void HighwayHash<Target>::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const size_t size, + HHResult128* HH_RESTRICT hash) const { + HHStateT<Target> state(key); + HighwayHashT(&state, bytes, size, hash); +} + +template <TargetBits Target> +void HighwayHash<Target>::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const size_t size, + HHResult256* HH_RESTRICT hash) const { + HHStateT<Target> state(key); + HighwayHashT(&state, bytes, size, hash); +} + +template <TargetBits Target> +void HighwayHashCat<Target>::operator()(const HHKey& key, + const StringView* HH_RESTRICT fragments, + const size_t num_fragments, + HHResult64* HH_RESTRICT hash) const { + HighwayHashCatT<Target> cat(key); + for (size_t i = 0; i < num_fragments; ++i) { + cat.Append(fragments[i].data, fragments[i].num_bytes); + } + cat.Finalize(hash); +} + +template <TargetBits Target> +void HighwayHashCat<Target>::operator()(const HHKey& key, + const StringView* HH_RESTRICT fragments, + const size_t num_fragments, + HHResult128* HH_RESTRICT hash) const { + HighwayHashCatT<Target> cat(key); + for (size_t i = 0; i < num_fragments; ++i) { + cat.Append(fragments[i].data, fragments[i].num_bytes); + } + cat.Finalize(hash); +} + +template <TargetBits Target> +void HighwayHashCat<Target>::operator()(const HHKey& key, + const StringView* HH_RESTRICT fragments, + const size_t num_fragments, + 
HHResult256* HH_RESTRICT hash) const { + HighwayHashCatT<Target> cat(key); + for (size_t i = 0; i < num_fragments; ++i) { + cat.Append(fragments[i].data, fragments[i].num_bytes); + } + cat.Finalize(hash); +} + +// Instantiate for the current target. +template struct HighwayHash<HH_TARGET>; +template struct HighwayHashCat<HH_TARGET>; + +} // namespace highwayhash +#endif // HH_DISABLE_TARGET_SPECIFIC diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_target.h b/contrib/libs/highwayhash/highwayhash/highwayhash_target.h index 08b803f191..3d6f33f236 100644 --- a/contrib/libs/highwayhash/highwayhash/highwayhash_target.h +++ b/contrib/libs/highwayhash/highwayhash/highwayhash_target.h @@ -1,91 +1,91 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ -#define HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ - -// Adapter for the InstructionSets::Run dispatcher, which invokes the best -// implementations available on the current CPU. - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" -#include "highwayhash/hh_types.h" - -namespace highwayhash { - -// Usage: InstructionSets::Run<HighwayHash>(key, bytes, size, hash). -// This incurs some small dispatch overhead. If the entire program is compiled -// for the target CPU, you can instead call HighwayHashT directly to avoid any -// overhead. This template is instantiated in the source file, which is -// compiled once for every target with the required flags (e.g. -mavx2). -template <TargetBits Target> -struct HighwayHash { - // Stores a 64/128/256 bit hash of "bytes" using the HighwayHashT - // implementation for the "Target" CPU. The hash result is identical - // regardless of which implementation is used. - // - // "key" is a (randomly generated or hard-coded) HHKey. - // "bytes" is the data to hash (possibly unaligned). - // "size" is the number of bytes to hash; we do not read any additional bytes. - // "hash" is a HHResult* (either 64, 128 or 256 bits). - // - // HighwayHash is a strong pseudorandom function with security claims - // [https://arxiv.org/abs/1612.06257]. It is intended as a safer - // general-purpose hash, 5x faster than SipHash and 10x faster than BLAKE2. - void operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const size_t size, HHResult64* HH_RESTRICT hash) const; - void operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const size_t size, HHResult128* HH_RESTRICT hash) const; - void operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const size_t size, HHResult256* HH_RESTRICT hash) const; -}; - -// Replacement for C++17 std::string_view that avoids dependencies. 
-// A struct requires fewer allocations when calling HighwayHashCat with -// non-const "num_fragments". -struct StringView { - const char* data; // not necessarily aligned/padded - size_t num_bytes; // possibly zero -}; - -// Note: this interface avoids dispatch overhead per fragment. -template <TargetBits Target> -struct HighwayHashCat { - // Stores a 64/128/256 bit hash of all "num_fragments" "fragments" using the - // HighwayHashCatT implementation for "Target". The hash result is identical - // to HighwayHash of the flattened data, regardless of Target. - // - // "key" is a (randomly generated or hard-coded) HHKey. - // "fragments" contain unaligned pointers and the number of valid bytes. - // "num_fragments" indicates the number of entries in "fragments". - // "hash" is a HHResult* (either 64, 128 or 256 bits). - void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments, - const size_t num_fragments, - HHResult64* HH_RESTRICT hash) const; - void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments, - const size_t num_fragments, - HHResult128* HH_RESTRICT hash) const; - void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments, - const size_t num_fragments, - HHResult256* HH_RESTRICT hash) const; -}; - -} // namespace highwayhash - -#endif // HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ +#define HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ + +// Adapter for the InstructionSets::Run dispatcher, which invokes the best +// implementations available on the current CPU. + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" +#include "highwayhash/hh_types.h" + +namespace highwayhash { + +// Usage: InstructionSets::Run<HighwayHash>(key, bytes, size, hash). +// This incurs some small dispatch overhead. If the entire program is compiled +// for the target CPU, you can instead call HighwayHashT directly to avoid any +// overhead. This template is instantiated in the source file, which is +// compiled once for every target with the required flags (e.g. -mavx2). +template <TargetBits Target> +struct HighwayHash { + // Stores a 64/128/256 bit hash of "bytes" using the HighwayHashT + // implementation for the "Target" CPU. The hash result is identical + // regardless of which implementation is used. + // + // "key" is a (randomly generated or hard-coded) HHKey. + // "bytes" is the data to hash (possibly unaligned). + // "size" is the number of bytes to hash; we do not read any additional bytes. + // "hash" is a HHResult* (either 64, 128 or 256 bits). 
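+ //
+ // Runtime-dispatch sketch (result variable is illustrative):
+ //   HHResult64 r;
+ //   InstructionSets::Run<HighwayHash>(key, bytes, size, &r);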
+ // + // HighwayHash is a strong pseudorandom function with security claims + // [https://arxiv.org/abs/1612.06257]. It is intended as a safer + // general-purpose hash, 5x faster than SipHash and 10x faster than BLAKE2. + void operator()(const HHKey& key, const char* HH_RESTRICT bytes, + const size_t size, HHResult64* HH_RESTRICT hash) const; + void operator()(const HHKey& key, const char* HH_RESTRICT bytes, + const size_t size, HHResult128* HH_RESTRICT hash) const; + void operator()(const HHKey& key, const char* HH_RESTRICT bytes, + const size_t size, HHResult256* HH_RESTRICT hash) const; +}; + +// Replacement for C++17 std::string_view that avoids dependencies. +// A struct requires fewer allocations when calling HighwayHashCat with +// non-const "num_fragments". +struct StringView { + const char* data; // not necessarily aligned/padded + size_t num_bytes; // possibly zero +}; + +// Note: this interface avoids dispatch overhead per fragment. +template <TargetBits Target> +struct HighwayHashCat { + // Stores a 64/128/256 bit hash of all "num_fragments" "fragments" using the + // HighwayHashCatT implementation for "Target". The hash result is identical + // to HighwayHash of the flattened data, regardless of Target. + // + // "key" is a (randomly generated or hard-coded) HHKey. + // "fragments" contain unaligned pointers and the number of valid bytes. + // "num_fragments" indicates the number of entries in "fragments". + // "hash" is a HHResult* (either 64, 128 or 256 bits). + void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments, + const size_t num_fragments, + HHResult64* HH_RESTRICT hash) const; + void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments, + const size_t num_fragments, + HHResult128* HH_RESTRICT hash) const; + void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments, + const size_t num_fragments, + HHResult256* HH_RESTRICT hash) const; +}; + +} // namespace highwayhash + +#endif // HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_test.cc b/contrib/libs/highwayhash/highwayhash/highwayhash_test.cc index b0f8b88712..d7f914af78 100644 --- a/contrib/libs/highwayhash/highwayhash/highwayhash_test.cc +++ b/contrib/libs/highwayhash/highwayhash/highwayhash_test.cc @@ -1,388 +1,388 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Ensures each implementation of HighwayHash returns consistent and unchanging -// hash values. - -#include "highwayhash/highwayhash_test_target.h" - -#include <stddef.h> -#include <atomic> -#include <cstdio> -#include <cstdlib> -#include <vector> - -#ifdef HH_GOOGLETEST -#include "testing/base/public/gunit.h" -#endif - -#include "highwayhash/data_parallel.h" -#include "highwayhash/instruction_sets.h" - -// Define to nonzero in order to print the (new) golden outputs. 
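-// (i.e. set PRINT_RESULTS to 1 and rerun to emit the known-good tables below.)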
-#define PRINT_RESULTS 0 - -namespace highwayhash { -namespace { - -// Known-good outputs are verified for all lengths in [0, 64]. -const size_t kMaxSize = 64; - -#if PRINT_RESULTS -void Print(const HHResult64 result) { printf("0x%016lXull,\n", result); } - -// For HHResult128/256. -template <int kNumLanes> -void Print(const HHResult64 (&result)[kNumLanes]) { - printf("{ "); - for (int i = 0; i < kNumLanes; ++i) { - if (i != 0) { - printf(", "); - } - printf("0x%016lXull", result[i]); - } - printf("},\n"); -} -#endif // PRINT_RESULTS - -// Called when any test fails; exits immediately because one mismatch usually -// implies many others. -void OnFailure(const char* target_name, const size_t size) { - printf("Mismatch at size %zu\n", size); -#ifdef HH_GOOGLETEST - EXPECT_TRUE(false); -#endif - exit(1); -} - -// Verifies every combination of implementation and input size. Returns which -// targets were run/verified. -template <typename Result> -TargetBits VerifyImplementations(const Result (&known_good)[kMaxSize + 1]) { - const HHKey key = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL, - 0x1716151413121110ULL, 0x1F1E1D1C1B1A1918ULL}; - - TargetBits targets = ~0U; - - // For each test input: empty string, 00, 00 01, ... - char in[kMaxSize + 1] = {0}; - // Fast enough that we don't need a thread pool. - for (uint64_t size = 0; size <= kMaxSize; ++size) { - in[size] = static_cast<char>(size); -#if PRINT_RESULTS - Result actual; - targets &= InstructionSets::Run<HighwayHash>(key, in, size, &actual); - Print(actual); -#else - const Result* expected = &known_good[size]; - targets &= InstructionSets::RunAll<HighwayHashTest>(key, in, size, expected, - &OnFailure); -#endif - } - return targets; -} - -// Cat - -void OnCatFailure(const char* target_name, const size_t size) { - printf("Cat mismatch at size %zu\n", size); -#ifdef HH_GOOGLETEST - EXPECT_TRUE(false); -#endif - exit(1); -} - -// Returns which targets were run/verified. -template <typename Result> -TargetBits VerifyCat(ThreadPool* pool) { - // Reversed order vs prior test. 
- const HHKey key = {0x1F1E1D1C1B1A1918ULL, 0x1716151413121110ULL, - 0x0F0E0D0C0B0A0908ULL, 0x0706050403020100ULL}; - - const size_t kMaxSize = 3 * 35; - std::vector<char> flat; - flat.reserve(kMaxSize); - srand(129); - for (size_t size = 0; size < kMaxSize; ++size) { - flat.push_back(static_cast<char>(rand() & 0xFF)); - } - - std::atomic<TargetBits> targets{~0U}; - - pool->Run(0, kMaxSize, [&key, &flat, &targets](const uint32_t i) { - Result dummy; - targets.fetch_and(InstructionSets::RunAll<HighwayHashCatTest>( - key, flat.data(), i, &dummy, &OnCatFailure)); - }); - return targets.load(); -} - -const HHResult64 kExpected64[kMaxSize + 1] = { - 0x907A56DE22C26E53ull, 0x7EAB43AAC7CDDD78ull, 0xB8D0569AB0B53D62ull, - 0x5C6BEFAB8A463D80ull, 0xF205A46893007EDAull, 0x2B8A1668E4A94541ull, - 0xBD4CCC325BEFCA6Full, 0x4D02AE1738F59482ull, 0xE1205108E55F3171ull, - 0x32D2644EC77A1584ull, 0xF6E10ACDB103A90Bull, 0xC3BBF4615B415C15ull, - 0x243CC2040063FA9Cull, 0xA89A58CE65E641FFull, 0x24B031A348455A23ull, - 0x40793F86A449F33Bull, 0xCFAB3489F97EB832ull, 0x19FE67D2C8C5C0E2ull, - 0x04DD90A69C565CC2ull, 0x75D9518E2371C504ull, 0x38AD9B1141D3DD16ull, - 0x0264432CCD8A70E0ull, 0xA9DB5A6288683390ull, 0xD7B05492003F028Cull, - 0x205F615AEA59E51Eull, 0xEEE0C89621052884ull, 0x1BFC1A93A7284F4Full, - 0x512175B5B70DA91Dull, 0xF71F8976A0A2C639ull, 0xAE093FEF1F84E3E7ull, - 0x22CA92B01161860Full, 0x9FC7007CCF035A68ull, 0xA0C964D9ECD580FCull, - 0x2C90F73CA03181FCull, 0x185CF84E5691EB9Eull, 0x4FC1F5EF2752AA9Bull, - 0xF5B7391A5E0A33EBull, 0xB9B84B83B4E96C9Cull, 0x5E42FE712A5CD9B4ull, - 0xA150F2F90C3F97DCull, 0x7FA522D75E2D637Dull, 0x181AD0CC0DFFD32Bull, - 0x3889ED981E854028ull, 0xFB4297E8C586EE2Dull, 0x6D064A45BB28059Cull, - 0x90563609B3EC860Cull, 0x7AA4FCE94097C666ull, 0x1326BAC06B911E08ull, - 0xB926168D2B154F34ull, 0x9919848945B1948Dull, 0xA2A98FC534825EBEull, - 0xE9809095213EF0B6ull, 0x582E5483707BC0E9ull, 0x086E9414A88A6AF5ull, - 0xEE86B98D20F6743Dull, 0xF89B7FF609B1C0A7ull, 0x4C7D9CC19E22C3E8ull, - 0x9A97005024562A6Full, 0x5DD41CF423E6EBEFull, 0xDF13609C0468E227ull, - 0x6E0DA4F64188155Aull, 0xB755BA4B50D7D4A1ull, 0x887A3484647479BDull, - 0xAB8EEBE9BF2139A0ull, 0x75542C5D4CD2A6FFull}; - -const HHResult128 kExpected128[kMaxSize + 1] = { - {0x0679D1E884C28A7Cull, 0x2BCA2547F904748Dull}, - {0x7F3A39BCC2D897B9ull, 0x4A7E113CA064D91Full}, - {0x6AB34B92C5AB85BFull, 0xED7AC546689D76C2ull}, - {0xAC6AF8405A4A7DBEull, 0xD78FB7953256C3E1ull}, - {0x5A6E8CF789B86448ull, 0x834EF47C1BEDC218ull}, - {0x8EBFE0B573F425A3ull, 0xBCFCC410CB84325Aull}, - {0xA1E19717CAB8F1D6ull, 0x2AA50671881F877Dull}, - {0x0B595302950DA1ECull, 0x46932DE27204B388ull}, - {0x02FB033F200F89D4ull, 0xFEC3D7BB3B421F92ull}, - {0x0A5479D46CC1EADEull, 0x0C16A2D5A0F1C3DEull}, - {0xF759E41DDD621106ull, 0xB43D70116E004750ull}, - {0x980010BC36A4E98Full, 0x27479317AE00BBD1ull}, - {0x3BABF3B23761A379ull, 0xACCDC28E0256F326ull}, - {0x5780CD04269E142Eull, 0xBB70EE3F23BDEDA9ull}, - {0x4A401F1937E99EC3ull, 0x4B3D1385D6B4E214ull}, - {0x045C6EDE080E2CB0ull, 0x7327B45D2132DC89ull}, - {0x97E1624BEB1C1756ull, 0xB7137E1B69D45024ull}, - {0x31DBA8E3DB0BF012ull, 0x3E66E6A78A729B16ull}, - {0x34D6DF1B5D8AF2A7ull, 0x4F1A47FCBC39EB55ull}, - {0xE2C6BE2D47E5DCBCull, 0xD2FF85284E307C1Full}, - {0xDA681E06098EC892ull, 0x71AD98355019FED1ull}, - {0xC4FBD72B1F2FC30Bull, 0x327549B6C9FDEDD5ull}, - {0x14F429D1C20F0EB5ull, 0x228B40C92F3FA369ull}, - {0xF5C9535333206D01ull, 0xB6FC46FCCA65F9CCull}, - {0x3049FAD9DB729D2Dull, 0xB84C931C45F781EAull}, - {0x7C6FFE6F3706DC04ull, 0x4F94583806AE3C62ull}, - 
{0x9EF95EB28BE1CCE0ull, 0xAD9D5B96A0D15BFEull}, - {0x63D0ED54AF2985E6ull, 0xDFAFB1B6485C1B01ull}, - {0xA46C8A2FE498D46Cull, 0xF4DBAEC0FF03BAD6ull}, - {0xED978A0FBB3E5158ull, 0x060D144D57FBE6FDull}, - {0x53F1D80C8922E4E5ull, 0x1324880D932140C9ull}, - {0xDD363B03563870CEull, 0x0DFDB79F4F34184Bull}, - {0x4E702701AE65DB38ull, 0x1B67E0A2E2DBFB04ull}, - {0x240DA388551D0822ull, 0x2FF1BB584AC4BD61ull}, - {0x3FAFB8B7C26499ABull, 0x072516308E889132ull}, - {0x0AB452339406AB22ull, 0x751DBB7FF9472D42ull}, - {0x83BA782DB6EB1186ull, 0x4391544D9318DC29ull}, - {0x25077ECDAAB201E8ull, 0x695E0E95446D63A2ull}, - {0x1AF0BF12F91F17D4ull, 0x5BB8FF299368D22Cull}, - {0x338C09CBAF701E38ull, 0xA7D24D5E7C06DC78ull}, - {0x5AB58D6555D28B56ull, 0xE781413A9AE1310Full}, - {0xB0281CD10BCA7B89ull, 0xF49873B45C0F7274ull}, - {0x67EEBD6D71E57B06ull, 0x9421CB1DB54EEDDFull}, - {0x00DAB867E37EDA65ull, 0x6477E454191E213Full}, - {0x9AF9C4817C24C82Eull, 0xAE3A73522F311EEBull}, - {0xD8A334E30D23C6E6ull, 0xAF57EF86CCCF12FFull}, - {0x0353A48FC9E139DDull, 0x27D5626170A7DD0Full}, - {0x0DA12E888EB61876ull, 0x67B17DF10CB365CDull}, - {0x967CD764883A5E85ull, 0x570D7C9A774A6AB4ull}, - {0xA8DF13980C81E533ull, 0x9C33FE4797F87F1Aull}, - {0xCABB59F53AE75FF2ull, 0x6D25512E77172E7Aull}, - {0xB24E7F0C7DA62BE7ull, 0x2442F94890F57D89ull}, - {0x7DCBA0A5B9689BBDull, 0x700FC8D13DA4CC60ull}, - {0x1E8E014B97A9F828ull, 0xF858EFCA33E8A502ull}, - {0x4DAF4E31F34D10C7ull, 0x47E382D0A5A8C613ull}, - {0x577CAB4EF626BB28ull, 0xF6ED27E594C5795Full}, - {0x989188C958586C96ull, 0x8B3A2CB0D5B48FD9ull}, - {0x13CC58F5A076C088ull, 0x932A0FD21D4B422Cull}, - {0xD067380DAD885647ull, 0xC1020E396B31BB4Aull}, - {0x47D05A73072758D0ull, 0x5CF6075A0AEB5D78ull}, - {0x54441D7AE94E2D4Eull, 0x3B4F67953ABD3EA4ull}, - {0xEDD4250C3733EEBCull, 0x26E365AA1167C723ull}, - {0x92D02D2A641DA598ull, 0x3DAF5EB24A0C2A94ull}, - {0xAE6CF7FE2D76CA56ull, 0xC7918532A42D2F5Dull}, - {0xAD24762A08D96F1Bull, 0x729083EC59FA8DF7ull}}; - -const HHResult256 kExpected256[kMaxSize + 1] = { - {0xC6DC0C823434863Full, 0x6A42CCB644CBFAD9ull, 0x18DEF6A60EA5D873ull, - 0x3596F663D00D1225ull}, - {0x00518B3D2BD22424ull, 0xE5791619BF612E97ull, 0xF4DAF07017FAF99Dull, - 0xE36AE62C5509B5D6ull}, - {0x81021CC5067D8526ull, 0xBEEFC1BC87A6911Aull, 0xE2AEC605F80657FEull, - 0x3C6576B5DF982327ull}, - {0x118D72C0B5DB2C70ull, 0x0BE2E64BF538CA74ull, 0x667B33FE41DDAA74ull, - 0xB6199539303E13E1ull}, - {0x4AC9B8B2E4FD873Bull, 0xDE0FE265A45FFC97ull, 0x1FC1476F896ADA3Bull, - 0x7680B4AE30B371E7ull}, - {0x518ABC6B5E88214Full, 0xFD62A05B2B06026Bull, 0x9C978E8B38DBE795ull, - 0x41412401886FF054ull}, - {0x2DEDEF0832BEA7D9ull, 0x44EFE0AEAB7944FCull, 0x09AA7C9374A1E980ull, - 0x714DB8B507C507FBull}, - {0x6FA2135DE3D3D3AAull, 0xC0EEA9A890E36156ull, 0xFAC1DB8C817DB095ull, - 0x7B42789096836327ull}, - {0x27257C518B1FFC5Cull, 0x26CC8E669DA1AB0Full, 0xCD7B17C661A0A680ull, - 0x31D0A7EC0AA3B9BFull}, - {0xB91869900A1AF26Cull, 0x95B0D74B7FF20B43ull, 0x2A6CABF6F931B575ull, - 0x69734DC9E66A1965ull}, - {0xDD7DA31F5C4DD30Full, 0x08940D249A0A7B69ull, 0xAE7D3AD1C5EA81F2ull, - 0x96701DB5C6602B21ull}, - {0x2E4A230847E64687ull, 0xF96176C38E48B038ull, 0x9ED0B88A3026E1BCull, - 0x9AAB5DCA46FCFE19ull}, - {0x3E5CF04BFBAC2642ull, 0x591A3581001709DFull, 0xA0288F5FA63C10A2ull, - 0x85B94D3641A2C108ull}, - {0x454A95FAD8901350ull, 0x5546E8E75D2AC833ull, 0xCF5FF2ACB4B5F2C1ull, - 0x14F314318028D62Eull}, - {0x0DED251FB81F34A9ull, 0xC42111DB31618AA6ull, 0xC1C3352B70B00C5Dull, - 0xDC8947DBC398F0C2ull}, - {0xC591A100AB4E9E72ull, 0x4CCFD2A7B0D8D911ull, 0x6FEDFDDE1BA3F770ull, - 
0x03E5C5A2F6E708A1ull}, - {0x537C42CC5E7B448Aull, 0xA7343E04249B2231ull, 0x2CB51D697EFE9B6Dull, - 0x589D83141A699A97ull}, - {0x3F7E6EA60343B870ull, 0x4E27E907E296D4D7ull, 0x87525BF1AABBF794ull, - 0x6B03C4DC206EC628ull}, - {0x741BA4D7A856E03Cull, 0x3798422CB64C9AFAull, 0xB1D89C9720D33FDDull, - 0x08DE607FC4E3B5C3ull}, - {0x77D77342C85BA466ull, 0xA01C603C58F6D97Eull, 0x342AF0A7309EA4EAull, - 0x9C958EB3F6A64B94ull}, - {0x9EDCADDD1FFC763Full, 0xBD9BAA6E9BE936EFull, 0xAAB0F78F1A4A94F7ull, - 0xE71D9CA601DA4C02ull}, - {0xE3AA0D0A422BF888ull, 0x07734C8173411035ull, 0x8A085019DE545AF6ull, - 0xBC3C520B1221A779ull}, - {0x16170C02C5E5439Dull, 0x45C6004513BFC174ull, 0x35CF3AD65D225EC8ull, - 0xE10BAA702D37C90Eull}, - {0x6BD63B47EA43ABC6ull, 0xCC08BE8A651E24C0ull, 0xB564F0FC6FF8998Aull, - 0x3EE409A34232E589ull}, - {0xD6CEE5574355BB81ull, 0x8E31FF40B271A16Dull, 0xC3ECEDBEEACCCAE9ull, - 0x19386CD3A23B92E9ull}, - {0x32475E05D248DBB1ull, 0xF2396A122830E72Cull, 0xB88395678C0DB899ull, - 0x8BD410A22A247066ull}, - {0x0BFA3B3C4775EB43ull, 0x496596C36FB2A200ull, 0xA00F533EF150D7DDull, - 0xB5D70BBCABB572C4ull}, - {0x932B0ED33ED691B1ull, 0xB58394EDCEA3C53Dull, 0xB935E0786B132755ull, - 0x3E0998322B3F74BAull}, - {0xE21F2CE1BDD156A7ull, 0x764518A56E1363B5ull, 0x461251D3EC39B93Full, - 0x33C1FE46C9664CC4ull}, - {0x8ABD3F6184C9CD7Dull, 0x8195816637017FC0ull, 0x284B3E93524765DEull, - 0x56147BDBA9362D0Eull}, - {0x1F050672342807B6ull, 0x9B0AD1091A83910Dull, 0xF23AD4A58C3B1E21ull, - 0xCC986EC0BEA16781ull}, - {0x053164DEF96B10CEull, 0x1D5ADA15E36D8F6Cull, 0x06FB43534C0472EFull, - 0x021C0ED1FDEA0948ull}, - {0xF62BA4C5A665E602ull, 0x490D89FD89430C56ull, 0x18F423BE8A9B7E3Cull, - 0x769E5DDA4DCAC619ull}, - {0xDABD25FAF07A6684ull, 0xACA85CD21536B927ull, 0xAC05E050B4E3D3D1ull, - 0xBE427B2475CCD981ull}, - {0x89A2B35A34F89F8Cull, 0x1A0E51B2875D34E6ull, 0xBA573CF45E123919ull, - 0x1C50815B08F1138Aull}, - {0x3390CCBE60F2AFF7ull, 0xD9E2D245643E79C2ull, 0x1104A78F85D3CDF5ull, - 0x7E55F38F9C53A58Full}, - {0xC189AE1A9D456C0Eull, 0x06AA4C3D4204A40Full, 0x4B383405A9D451A9ull, - 0x7EA34CBCAEF0C31Eull}, - {0xB45FA7CC19AE4DDFull, 0x306C418E9BA67420ull, 0xDF16D80D4D48C096ull, - 0xD3169E50BC8D75CCull}, - {0x5894367013710C89ull, 0xD39EE6D584E76AF3ull, 0x5C55A414BCDDE505ull, - 0x8FA97D561CB174BFull}, - {0x87355749D59F39DDull, 0x26B8B311E72C50F4ull, 0x1911A8CBCE53E37Bull, - 0x5C256452C39B95F6ull}, - {0x8B9E87C9ABC82821ull, 0x12A5FC06B69CDC2Dull, 0xF95104FF805E5E1Dull, - 0xE5D4D2257AD5592Eull}, - {0x5A89242B02E1E048ull, 0x771602AAD1880A7Eull, 0x0F34507608387843ull, - 0x7AFB45F3EA4F0F24ull}, - {0x3BE3800150FDDE00ull, 0x7871908FF91AD81Aull, 0xA00E07F351BB15C1ull, - 0x429658E7FD10D11Aull}, - {0x2B2B1A6CD1BA454Cull, 0xF19E8CA5C022308Aull, 0xAEFA0EB6F7C3CF74ull, - 0x21F4330A5258E7C7ull}, - {0xD1C806622910A9BEull, 0xFE224EF598F541B1ull, 0xB95A435AEC4DD849ull, - 0xD942A277AB57E68Eull}, - {0x16BF7116E8D2B328ull, 0xB37DC98EA931FC13ull, 0x18E8859A592C8C11ull, - 0x11590F16C4C61716ull}, - {0xD046122D4C7B24AEull, 0xBD0899DFD7345611ull, 0x91AAECB50DE6DFF9ull, - 0x6EDC4896BAA90FFAull}, - {0x2FE97B8135EA956Dull, 0xFBA50900FB4EF23Cull, 0x0BC907363F7EA368ull, - 0xA5C982D3094BCEE2ull}, - {0x247BFB5BA3A0F245ull, 0x6ACBDD4AFFDB03EBull, 0xA4237427D373B619ull, - 0xFA9C041D302B728Cull}, - {0xF93109909D6B80EFull, 0xD1321A6BEE302794ull, 0xD63E1E7985C458D3ull, - 0x644CD44F6C6FDE95ull}, - {0xD0522C663FBE65B0ull, 0x78F366F302EA33F5ull, 0xB9ED66D1CB87C891ull, - 0x0CEB2298BA9D1C1Aull}, - {0x60D60E9B569264E8ull, 0xE34447A5741417EAull, 0x04522108BDF3AFC3ull, - 0x90F4FE2D585B25FAull}, - 
{0xAF411662AAB81B12ull, 0x3AD58EBBA1BA2F39ull, 0x73E0E8EB5879E37Dull, - 0xCE0E8F8F613D3FC5ull}, - {0xCA756CB9E1FDF1C6ull, 0x89731D81712D34BDull, 0xBF520B2D830959C2ull, - 0xD35ED12BB24CE9EFull}, - {0x5FB2B65ABF038045ull, 0x3F2D32F8532E14D6ull, 0x06443CC95CDD58C8ull, - 0x30FC6FBE8CCE8EB8ull}, - {0x94A9774F02848D73ull, 0x83F9AFC4C0B48768ull, 0xDB7BF5FBD9B25A26ull, - 0x7F7D50266FFA639Bull}, - {0x352A775C646259DDull, 0xB2B532B472539832ull, 0x9981AE050A2FB38Cull, - 0xE13641E804F6DC00ull}, - {0x080E005A04E73352ull, 0x0314F6EA196A210Cull, 0x29EA80869CE307A4ull, - 0x4FABEB9ADE04BE00ull}, - {0x5674A4A533335ADFull, 0x3C7C0650FF6C585Bull, 0x384E4F8246446812ull, - 0xAE2DADA5E0EB6D81ull}, - {0xB6CE794A89B0A1F7ull, 0x0DC2B87EC9473CDDull, 0x349A006CA2899C88ull, - 0x4B411CB7DF6BF33Cull}, - {0xD79BB5606CE6BDAFull, 0x4040EA447818A5C1ull, 0x53D58C5710475284ull, - 0x3DA8730E092608BAull}, - {0x5900A2DAA12E085Cull, 0x80D490C510C493DDull, 0x4BDF17B0247C8D1Bull, - 0xA8649490D6CFCE67ull}, - {0xFBDAB07B10180D47ull, 0xED6C196BDC43E292ull, 0xE7D494077FA2791Dull, - 0xC7108D4FD01BBF85ull}, - {0x4365D6236E6AE467ull, 0xB3D540909D4308A5ull, 0xE38207ABD4588D68ull, - 0xBBD42849A8C92313ull}, - {0x064DB5FE415126F5ull, 0x248AF8FB29A9C595ull, 0x508633A742B3FFF7ull, - 0x24CFDCA800C34770ull}}; - -void RunTests() { - // TODO(janwas): detect number of cores. - ThreadPool pool(4); - - TargetBits tested = ~0U; - tested &= VerifyImplementations(kExpected64); - tested &= VerifyImplementations(kExpected128); - tested &= VerifyImplementations(kExpected256); - // Any failure causes immediate exit, so apparently all succeeded. - HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) { - printf("%10s: OK\n", TargetName(target)); - }); - - tested = ~0U; - tested &= VerifyCat<HHResult64>(&pool); - tested &= VerifyCat<HHResult128>(&pool); - tested &= VerifyCat<HHResult256>(&pool); - HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) { - printf("%10sCat: OK\n", TargetName(target)); - }); -} - -#ifdef HH_GOOGLETEST -TEST(HighwayhashTest, OutputMatchesExpectations) { RunTests(); } -#endif - -} // namespace -} // namespace highwayhash - -#ifndef HH_GOOGLETEST -int main(int argc, char* argv[]) { - highwayhash::RunTests(); - return 0; -} -#endif +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Ensures each implementation of HighwayHash returns consistent and unchanging +// hash values. + +#include "highwayhash/highwayhash_test_target.h" + +#include <stddef.h> +#include <atomic> +#include <cstdio> +#include <cstdlib> +#include <vector> + +#ifdef HH_GOOGLETEST +#include "testing/base/public/gunit.h" +#endif + +#include "highwayhash/data_parallel.h" +#include "highwayhash/instruction_sets.h" + +// Define to nonzero in order to print the (new) golden outputs. +#define PRINT_RESULTS 0 + +namespace highwayhash { +namespace { + +// Known-good outputs are verified for all lengths in [0, 64]. 
+const size_t kMaxSize = 64; + +#if PRINT_RESULTS +void Print(const HHResult64 result) { printf("0x%016lXull,\n", result); } + +// For HHResult128/256. +template <int kNumLanes> +void Print(const HHResult64 (&result)[kNumLanes]) { + printf("{ "); + for (int i = 0; i < kNumLanes; ++i) { + if (i != 0) { + printf(", "); + } + printf("0x%016lXull", result[i]); + } + printf("},\n"); +} +#endif // PRINT_RESULTS + +// Called when any test fails; exits immediately because one mismatch usually +// implies many others. +void OnFailure(const char* target_name, const size_t size) { + printf("Mismatch at size %zu\n", size); +#ifdef HH_GOOGLETEST + EXPECT_TRUE(false); +#endif + exit(1); +} + +// Verifies every combination of implementation and input size. Returns which +// targets were run/verified. +template <typename Result> +TargetBits VerifyImplementations(const Result (&known_good)[kMaxSize + 1]) { + const HHKey key = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL, + 0x1716151413121110ULL, 0x1F1E1D1C1B1A1918ULL}; + + TargetBits targets = ~0U; + + // For each test input: empty string, 00, 00 01, ... + char in[kMaxSize + 1] = {0}; + // Fast enough that we don't need a thread pool. + for (uint64_t size = 0; size <= kMaxSize; ++size) { + in[size] = static_cast<char>(size); +#if PRINT_RESULTS + Result actual; + targets &= InstructionSets::Run<HighwayHash>(key, in, size, &actual); + Print(actual); +#else + const Result* expected = &known_good[size]; + targets &= InstructionSets::RunAll<HighwayHashTest>(key, in, size, expected, + &OnFailure); +#endif + } + return targets; +} + +// Cat + +void OnCatFailure(const char* target_name, const size_t size) { + printf("Cat mismatch at size %zu\n", size); +#ifdef HH_GOOGLETEST + EXPECT_TRUE(false); +#endif + exit(1); +} + +// Returns which targets were run/verified. +template <typename Result> +TargetBits VerifyCat(ThreadPool* pool) { + // Reversed order vs prior test. 
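
// [Editorial note, not part of the imported sources.] The golden arrays below
// follow the convention built by VerifyImplementations above: for length N the
// input is the byte sequence 00 01 .. N-1, hashed under the key shown there.
// Reproducing one entry through the public dispatcher:
//
//   const HHKey key = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL,
//                      0x1716151413121110ULL, 0x1F1E1D1C1B1A1918ULL};
//   const char in[3] = {0x00, 0x01, 0x02};
//   HHResult64 h;
//   InstructionSets::Run<HighwayHash>(key, in, 3, &h);
//   // h now equals kExpected64[3] == 0x5C6BEFAB8A463D80 on all targets.
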
+ const HHKey key = {0x1F1E1D1C1B1A1918ULL, 0x1716151413121110ULL, + 0x0F0E0D0C0B0A0908ULL, 0x0706050403020100ULL}; + + const size_t kMaxSize = 3 * 35; + std::vector<char> flat; + flat.reserve(kMaxSize); + srand(129); + for (size_t size = 0; size < kMaxSize; ++size) { + flat.push_back(static_cast<char>(rand() & 0xFF)); + } + + std::atomic<TargetBits> targets{~0U}; + + pool->Run(0, kMaxSize, [&key, &flat, &targets](const uint32_t i) { + Result dummy; + targets.fetch_and(InstructionSets::RunAll<HighwayHashCatTest>( + key, flat.data(), i, &dummy, &OnCatFailure)); + }); + return targets.load(); +} + +const HHResult64 kExpected64[kMaxSize + 1] = { + 0x907A56DE22C26E53ull, 0x7EAB43AAC7CDDD78ull, 0xB8D0569AB0B53D62ull, + 0x5C6BEFAB8A463D80ull, 0xF205A46893007EDAull, 0x2B8A1668E4A94541ull, + 0xBD4CCC325BEFCA6Full, 0x4D02AE1738F59482ull, 0xE1205108E55F3171ull, + 0x32D2644EC77A1584ull, 0xF6E10ACDB103A90Bull, 0xC3BBF4615B415C15ull, + 0x243CC2040063FA9Cull, 0xA89A58CE65E641FFull, 0x24B031A348455A23ull, + 0x40793F86A449F33Bull, 0xCFAB3489F97EB832ull, 0x19FE67D2C8C5C0E2ull, + 0x04DD90A69C565CC2ull, 0x75D9518E2371C504ull, 0x38AD9B1141D3DD16ull, + 0x0264432CCD8A70E0ull, 0xA9DB5A6288683390ull, 0xD7B05492003F028Cull, + 0x205F615AEA59E51Eull, 0xEEE0C89621052884ull, 0x1BFC1A93A7284F4Full, + 0x512175B5B70DA91Dull, 0xF71F8976A0A2C639ull, 0xAE093FEF1F84E3E7ull, + 0x22CA92B01161860Full, 0x9FC7007CCF035A68ull, 0xA0C964D9ECD580FCull, + 0x2C90F73CA03181FCull, 0x185CF84E5691EB9Eull, 0x4FC1F5EF2752AA9Bull, + 0xF5B7391A5E0A33EBull, 0xB9B84B83B4E96C9Cull, 0x5E42FE712A5CD9B4ull, + 0xA150F2F90C3F97DCull, 0x7FA522D75E2D637Dull, 0x181AD0CC0DFFD32Bull, + 0x3889ED981E854028ull, 0xFB4297E8C586EE2Dull, 0x6D064A45BB28059Cull, + 0x90563609B3EC860Cull, 0x7AA4FCE94097C666ull, 0x1326BAC06B911E08ull, + 0xB926168D2B154F34ull, 0x9919848945B1948Dull, 0xA2A98FC534825EBEull, + 0xE9809095213EF0B6ull, 0x582E5483707BC0E9ull, 0x086E9414A88A6AF5ull, + 0xEE86B98D20F6743Dull, 0xF89B7FF609B1C0A7ull, 0x4C7D9CC19E22C3E8ull, + 0x9A97005024562A6Full, 0x5DD41CF423E6EBEFull, 0xDF13609C0468E227ull, + 0x6E0DA4F64188155Aull, 0xB755BA4B50D7D4A1ull, 0x887A3484647479BDull, + 0xAB8EEBE9BF2139A0ull, 0x75542C5D4CD2A6FFull}; + +const HHResult128 kExpected128[kMaxSize + 1] = { + {0x0679D1E884C28A7Cull, 0x2BCA2547F904748Dull}, + {0x7F3A39BCC2D897B9ull, 0x4A7E113CA064D91Full}, + {0x6AB34B92C5AB85BFull, 0xED7AC546689D76C2ull}, + {0xAC6AF8405A4A7DBEull, 0xD78FB7953256C3E1ull}, + {0x5A6E8CF789B86448ull, 0x834EF47C1BEDC218ull}, + {0x8EBFE0B573F425A3ull, 0xBCFCC410CB84325Aull}, + {0xA1E19717CAB8F1D6ull, 0x2AA50671881F877Dull}, + {0x0B595302950DA1ECull, 0x46932DE27204B388ull}, + {0x02FB033F200F89D4ull, 0xFEC3D7BB3B421F92ull}, + {0x0A5479D46CC1EADEull, 0x0C16A2D5A0F1C3DEull}, + {0xF759E41DDD621106ull, 0xB43D70116E004750ull}, + {0x980010BC36A4E98Full, 0x27479317AE00BBD1ull}, + {0x3BABF3B23761A379ull, 0xACCDC28E0256F326ull}, + {0x5780CD04269E142Eull, 0xBB70EE3F23BDEDA9ull}, + {0x4A401F1937E99EC3ull, 0x4B3D1385D6B4E214ull}, + {0x045C6EDE080E2CB0ull, 0x7327B45D2132DC89ull}, + {0x97E1624BEB1C1756ull, 0xB7137E1B69D45024ull}, + {0x31DBA8E3DB0BF012ull, 0x3E66E6A78A729B16ull}, + {0x34D6DF1B5D8AF2A7ull, 0x4F1A47FCBC39EB55ull}, + {0xE2C6BE2D47E5DCBCull, 0xD2FF85284E307C1Full}, + {0xDA681E06098EC892ull, 0x71AD98355019FED1ull}, + {0xC4FBD72B1F2FC30Bull, 0x327549B6C9FDEDD5ull}, + {0x14F429D1C20F0EB5ull, 0x228B40C92F3FA369ull}, + {0xF5C9535333206D01ull, 0xB6FC46FCCA65F9CCull}, + {0x3049FAD9DB729D2Dull, 0xB84C931C45F781EAull}, + {0x7C6FFE6F3706DC04ull, 0x4F94583806AE3C62ull}, + 
{0x9EF95EB28BE1CCE0ull, 0xAD9D5B96A0D15BFEull}, + {0x63D0ED54AF2985E6ull, 0xDFAFB1B6485C1B01ull}, + {0xA46C8A2FE498D46Cull, 0xF4DBAEC0FF03BAD6ull}, + {0xED978A0FBB3E5158ull, 0x060D144D57FBE6FDull}, + {0x53F1D80C8922E4E5ull, 0x1324880D932140C9ull}, + {0xDD363B03563870CEull, 0x0DFDB79F4F34184Bull}, + {0x4E702701AE65DB38ull, 0x1B67E0A2E2DBFB04ull}, + {0x240DA388551D0822ull, 0x2FF1BB584AC4BD61ull}, + {0x3FAFB8B7C26499ABull, 0x072516308E889132ull}, + {0x0AB452339406AB22ull, 0x751DBB7FF9472D42ull}, + {0x83BA782DB6EB1186ull, 0x4391544D9318DC29ull}, + {0x25077ECDAAB201E8ull, 0x695E0E95446D63A2ull}, + {0x1AF0BF12F91F17D4ull, 0x5BB8FF299368D22Cull}, + {0x338C09CBAF701E38ull, 0xA7D24D5E7C06DC78ull}, + {0x5AB58D6555D28B56ull, 0xE781413A9AE1310Full}, + {0xB0281CD10BCA7B89ull, 0xF49873B45C0F7274ull}, + {0x67EEBD6D71E57B06ull, 0x9421CB1DB54EEDDFull}, + {0x00DAB867E37EDA65ull, 0x6477E454191E213Full}, + {0x9AF9C4817C24C82Eull, 0xAE3A73522F311EEBull}, + {0xD8A334E30D23C6E6ull, 0xAF57EF86CCCF12FFull}, + {0x0353A48FC9E139DDull, 0x27D5626170A7DD0Full}, + {0x0DA12E888EB61876ull, 0x67B17DF10CB365CDull}, + {0x967CD764883A5E85ull, 0x570D7C9A774A6AB4ull}, + {0xA8DF13980C81E533ull, 0x9C33FE4797F87F1Aull}, + {0xCABB59F53AE75FF2ull, 0x6D25512E77172E7Aull}, + {0xB24E7F0C7DA62BE7ull, 0x2442F94890F57D89ull}, + {0x7DCBA0A5B9689BBDull, 0x700FC8D13DA4CC60ull}, + {0x1E8E014B97A9F828ull, 0xF858EFCA33E8A502ull}, + {0x4DAF4E31F34D10C7ull, 0x47E382D0A5A8C613ull}, + {0x577CAB4EF626BB28ull, 0xF6ED27E594C5795Full}, + {0x989188C958586C96ull, 0x8B3A2CB0D5B48FD9ull}, + {0x13CC58F5A076C088ull, 0x932A0FD21D4B422Cull}, + {0xD067380DAD885647ull, 0xC1020E396B31BB4Aull}, + {0x47D05A73072758D0ull, 0x5CF6075A0AEB5D78ull}, + {0x54441D7AE94E2D4Eull, 0x3B4F67953ABD3EA4ull}, + {0xEDD4250C3733EEBCull, 0x26E365AA1167C723ull}, + {0x92D02D2A641DA598ull, 0x3DAF5EB24A0C2A94ull}, + {0xAE6CF7FE2D76CA56ull, 0xC7918532A42D2F5Dull}, + {0xAD24762A08D96F1Bull, 0x729083EC59FA8DF7ull}}; + +const HHResult256 kExpected256[kMaxSize + 1] = { + {0xC6DC0C823434863Full, 0x6A42CCB644CBFAD9ull, 0x18DEF6A60EA5D873ull, + 0x3596F663D00D1225ull}, + {0x00518B3D2BD22424ull, 0xE5791619BF612E97ull, 0xF4DAF07017FAF99Dull, + 0xE36AE62C5509B5D6ull}, + {0x81021CC5067D8526ull, 0xBEEFC1BC87A6911Aull, 0xE2AEC605F80657FEull, + 0x3C6576B5DF982327ull}, + {0x118D72C0B5DB2C70ull, 0x0BE2E64BF538CA74ull, 0x667B33FE41DDAA74ull, + 0xB6199539303E13E1ull}, + {0x4AC9B8B2E4FD873Bull, 0xDE0FE265A45FFC97ull, 0x1FC1476F896ADA3Bull, + 0x7680B4AE30B371E7ull}, + {0x518ABC6B5E88214Full, 0xFD62A05B2B06026Bull, 0x9C978E8B38DBE795ull, + 0x41412401886FF054ull}, + {0x2DEDEF0832BEA7D9ull, 0x44EFE0AEAB7944FCull, 0x09AA7C9374A1E980ull, + 0x714DB8B507C507FBull}, + {0x6FA2135DE3D3D3AAull, 0xC0EEA9A890E36156ull, 0xFAC1DB8C817DB095ull, + 0x7B42789096836327ull}, + {0x27257C518B1FFC5Cull, 0x26CC8E669DA1AB0Full, 0xCD7B17C661A0A680ull, + 0x31D0A7EC0AA3B9BFull}, + {0xB91869900A1AF26Cull, 0x95B0D74B7FF20B43ull, 0x2A6CABF6F931B575ull, + 0x69734DC9E66A1965ull}, + {0xDD7DA31F5C4DD30Full, 0x08940D249A0A7B69ull, 0xAE7D3AD1C5EA81F2ull, + 0x96701DB5C6602B21ull}, + {0x2E4A230847E64687ull, 0xF96176C38E48B038ull, 0x9ED0B88A3026E1BCull, + 0x9AAB5DCA46FCFE19ull}, + {0x3E5CF04BFBAC2642ull, 0x591A3581001709DFull, 0xA0288F5FA63C10A2ull, + 0x85B94D3641A2C108ull}, + {0x454A95FAD8901350ull, 0x5546E8E75D2AC833ull, 0xCF5FF2ACB4B5F2C1ull, + 0x14F314318028D62Eull}, + {0x0DED251FB81F34A9ull, 0xC42111DB31618AA6ull, 0xC1C3352B70B00C5Dull, + 0xDC8947DBC398F0C2ull}, + {0xC591A100AB4E9E72ull, 0x4CCFD2A7B0D8D911ull, 0x6FEDFDDE1BA3F770ull, + 
0x03E5C5A2F6E708A1ull}, + {0x537C42CC5E7B448Aull, 0xA7343E04249B2231ull, 0x2CB51D697EFE9B6Dull, + 0x589D83141A699A97ull}, + {0x3F7E6EA60343B870ull, 0x4E27E907E296D4D7ull, 0x87525BF1AABBF794ull, + 0x6B03C4DC206EC628ull}, + {0x741BA4D7A856E03Cull, 0x3798422CB64C9AFAull, 0xB1D89C9720D33FDDull, + 0x08DE607FC4E3B5C3ull}, + {0x77D77342C85BA466ull, 0xA01C603C58F6D97Eull, 0x342AF0A7309EA4EAull, + 0x9C958EB3F6A64B94ull}, + {0x9EDCADDD1FFC763Full, 0xBD9BAA6E9BE936EFull, 0xAAB0F78F1A4A94F7ull, + 0xE71D9CA601DA4C02ull}, + {0xE3AA0D0A422BF888ull, 0x07734C8173411035ull, 0x8A085019DE545AF6ull, + 0xBC3C520B1221A779ull}, + {0x16170C02C5E5439Dull, 0x45C6004513BFC174ull, 0x35CF3AD65D225EC8ull, + 0xE10BAA702D37C90Eull}, + {0x6BD63B47EA43ABC6ull, 0xCC08BE8A651E24C0ull, 0xB564F0FC6FF8998Aull, + 0x3EE409A34232E589ull}, + {0xD6CEE5574355BB81ull, 0x8E31FF40B271A16Dull, 0xC3ECEDBEEACCCAE9ull, + 0x19386CD3A23B92E9ull}, + {0x32475E05D248DBB1ull, 0xF2396A122830E72Cull, 0xB88395678C0DB899ull, + 0x8BD410A22A247066ull}, + {0x0BFA3B3C4775EB43ull, 0x496596C36FB2A200ull, 0xA00F533EF150D7DDull, + 0xB5D70BBCABB572C4ull}, + {0x932B0ED33ED691B1ull, 0xB58394EDCEA3C53Dull, 0xB935E0786B132755ull, + 0x3E0998322B3F74BAull}, + {0xE21F2CE1BDD156A7ull, 0x764518A56E1363B5ull, 0x461251D3EC39B93Full, + 0x33C1FE46C9664CC4ull}, + {0x8ABD3F6184C9CD7Dull, 0x8195816637017FC0ull, 0x284B3E93524765DEull, + 0x56147BDBA9362D0Eull}, + {0x1F050672342807B6ull, 0x9B0AD1091A83910Dull, 0xF23AD4A58C3B1E21ull, + 0xCC986EC0BEA16781ull}, + {0x053164DEF96B10CEull, 0x1D5ADA15E36D8F6Cull, 0x06FB43534C0472EFull, + 0x021C0ED1FDEA0948ull}, + {0xF62BA4C5A665E602ull, 0x490D89FD89430C56ull, 0x18F423BE8A9B7E3Cull, + 0x769E5DDA4DCAC619ull}, + {0xDABD25FAF07A6684ull, 0xACA85CD21536B927ull, 0xAC05E050B4E3D3D1ull, + 0xBE427B2475CCD981ull}, + {0x89A2B35A34F89F8Cull, 0x1A0E51B2875D34E6ull, 0xBA573CF45E123919ull, + 0x1C50815B08F1138Aull}, + {0x3390CCBE60F2AFF7ull, 0xD9E2D245643E79C2ull, 0x1104A78F85D3CDF5ull, + 0x7E55F38F9C53A58Full}, + {0xC189AE1A9D456C0Eull, 0x06AA4C3D4204A40Full, 0x4B383405A9D451A9ull, + 0x7EA34CBCAEF0C31Eull}, + {0xB45FA7CC19AE4DDFull, 0x306C418E9BA67420ull, 0xDF16D80D4D48C096ull, + 0xD3169E50BC8D75CCull}, + {0x5894367013710C89ull, 0xD39EE6D584E76AF3ull, 0x5C55A414BCDDE505ull, + 0x8FA97D561CB174BFull}, + {0x87355749D59F39DDull, 0x26B8B311E72C50F4ull, 0x1911A8CBCE53E37Bull, + 0x5C256452C39B95F6ull}, + {0x8B9E87C9ABC82821ull, 0x12A5FC06B69CDC2Dull, 0xF95104FF805E5E1Dull, + 0xE5D4D2257AD5592Eull}, + {0x5A89242B02E1E048ull, 0x771602AAD1880A7Eull, 0x0F34507608387843ull, + 0x7AFB45F3EA4F0F24ull}, + {0x3BE3800150FDDE00ull, 0x7871908FF91AD81Aull, 0xA00E07F351BB15C1ull, + 0x429658E7FD10D11Aull}, + {0x2B2B1A6CD1BA454Cull, 0xF19E8CA5C022308Aull, 0xAEFA0EB6F7C3CF74ull, + 0x21F4330A5258E7C7ull}, + {0xD1C806622910A9BEull, 0xFE224EF598F541B1ull, 0xB95A435AEC4DD849ull, + 0xD942A277AB57E68Eull}, + {0x16BF7116E8D2B328ull, 0xB37DC98EA931FC13ull, 0x18E8859A592C8C11ull, + 0x11590F16C4C61716ull}, + {0xD046122D4C7B24AEull, 0xBD0899DFD7345611ull, 0x91AAECB50DE6DFF9ull, + 0x6EDC4896BAA90FFAull}, + {0x2FE97B8135EA956Dull, 0xFBA50900FB4EF23Cull, 0x0BC907363F7EA368ull, + 0xA5C982D3094BCEE2ull}, + {0x247BFB5BA3A0F245ull, 0x6ACBDD4AFFDB03EBull, 0xA4237427D373B619ull, + 0xFA9C041D302B728Cull}, + {0xF93109909D6B80EFull, 0xD1321A6BEE302794ull, 0xD63E1E7985C458D3ull, + 0x644CD44F6C6FDE95ull}, + {0xD0522C663FBE65B0ull, 0x78F366F302EA33F5ull, 0xB9ED66D1CB87C891ull, + 0x0CEB2298BA9D1C1Aull}, + {0x60D60E9B569264E8ull, 0xE34447A5741417EAull, 0x04522108BDF3AFC3ull, + 0x90F4FE2D585B25FAull}, + 
{0xAF411662AAB81B12ull, 0x3AD58EBBA1BA2F39ull, 0x73E0E8EB5879E37Dull, + 0xCE0E8F8F613D3FC5ull}, + {0xCA756CB9E1FDF1C6ull, 0x89731D81712D34BDull, 0xBF520B2D830959C2ull, + 0xD35ED12BB24CE9EFull}, + {0x5FB2B65ABF038045ull, 0x3F2D32F8532E14D6ull, 0x06443CC95CDD58C8ull, + 0x30FC6FBE8CCE8EB8ull}, + {0x94A9774F02848D73ull, 0x83F9AFC4C0B48768ull, 0xDB7BF5FBD9B25A26ull, + 0x7F7D50266FFA639Bull}, + {0x352A775C646259DDull, 0xB2B532B472539832ull, 0x9981AE050A2FB38Cull, + 0xE13641E804F6DC00ull}, + {0x080E005A04E73352ull, 0x0314F6EA196A210Cull, 0x29EA80869CE307A4ull, + 0x4FABEB9ADE04BE00ull}, + {0x5674A4A533335ADFull, 0x3C7C0650FF6C585Bull, 0x384E4F8246446812ull, + 0xAE2DADA5E0EB6D81ull}, + {0xB6CE794A89B0A1F7ull, 0x0DC2B87EC9473CDDull, 0x349A006CA2899C88ull, + 0x4B411CB7DF6BF33Cull}, + {0xD79BB5606CE6BDAFull, 0x4040EA447818A5C1ull, 0x53D58C5710475284ull, + 0x3DA8730E092608BAull}, + {0x5900A2DAA12E085Cull, 0x80D490C510C493DDull, 0x4BDF17B0247C8D1Bull, + 0xA8649490D6CFCE67ull}, + {0xFBDAB07B10180D47ull, 0xED6C196BDC43E292ull, 0xE7D494077FA2791Dull, + 0xC7108D4FD01BBF85ull}, + {0x4365D6236E6AE467ull, 0xB3D540909D4308A5ull, 0xE38207ABD4588D68ull, + 0xBBD42849A8C92313ull}, + {0x064DB5FE415126F5ull, 0x248AF8FB29A9C595ull, 0x508633A742B3FFF7ull, + 0x24CFDCA800C34770ull}}; + +void RunTests() { + // TODO(janwas): detect number of cores. + ThreadPool pool(4); + + TargetBits tested = ~0U; + tested &= VerifyImplementations(kExpected64); + tested &= VerifyImplementations(kExpected128); + tested &= VerifyImplementations(kExpected256); + // Any failure causes immediate exit, so apparently all succeeded. + HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) { + printf("%10s: OK\n", TargetName(target)); + }); + + tested = ~0U; + tested &= VerifyCat<HHResult64>(&pool); + tested &= VerifyCat<HHResult128>(&pool); + tested &= VerifyCat<HHResult256>(&pool); + HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) { + printf("%10sCat: OK\n", TargetName(target)); + }); +} + +#ifdef HH_GOOGLETEST +TEST(HighwayhashTest, OutputMatchesExpectations) { RunTests(); } +#endif + +} // namespace +} // namespace highwayhash + +#ifndef HH_GOOGLETEST +int main(int argc, char* argv[]) { + highwayhash::RunTests(); + return 0; +} +#endif diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_test_avx2.cc b/contrib/libs/highwayhash/highwayhash/highwayhash_test_avx2.cc index f1efe0b5f0..6e12132e8c 100644 --- a/contrib/libs/highwayhash/highwayhash/highwayhash_test_avx2.cc +++ b/contrib/libs/highwayhash/highwayhash/highwayhash_test_avx2.cc @@ -1,19 +1,19 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// WARNING: this is a "restricted" source file; avoid including any headers -// unless they are also restricted. See arch_specific.h for details. - -#define HH_TARGET_NAME AVX2 -#include "highwayhash/highwayhash_test_target.cc" +// Copyright 2017 Google Inc. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#define HH_TARGET_NAME AVX2 +#include "highwayhash/highwayhash_test_target.cc" diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_test_portable.cc b/contrib/libs/highwayhash/highwayhash/highwayhash_test_portable.cc index 04930a7e12..e5bee564a7 100644 --- a/contrib/libs/highwayhash/highwayhash/highwayhash_test_portable.cc +++ b/contrib/libs/highwayhash/highwayhash/highwayhash_test_portable.cc @@ -1,19 +1,19 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// WARNING: this is a "restricted" source file; avoid including any headers -// unless they are also restricted. See arch_specific.h for details. - -#define HH_TARGET_NAME Portable -#include "highwayhash/highwayhash_test_target.cc" +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#define HH_TARGET_NAME Portable +#include "highwayhash/highwayhash_test_target.cc" diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_test_sse41.cc b/contrib/libs/highwayhash/highwayhash/highwayhash_test_sse41.cc index 2d6e83d66f..1ae43bcca9 100644 --- a/contrib/libs/highwayhash/highwayhash/highwayhash_test_sse41.cc +++ b/contrib/libs/highwayhash/highwayhash/highwayhash_test_sse41.cc @@ -1,19 +1,19 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// WARNING: this is a "restricted" source file; avoid including any headers -// unless they are also restricted. See arch_specific.h for details. - -#define HH_TARGET_NAME SSE41 -#include "highwayhash/highwayhash_test_target.cc" +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#define HH_TARGET_NAME SSE41 +#include "highwayhash/highwayhash_test_target.cc" diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.cc b/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.cc index 701c14b927..b00704b83c 100644 --- a/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.cc +++ b/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.cc @@ -1,211 +1,211 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// WARNING: this is a "restricted" source file; avoid including any headers -// unless they are also restricted. See arch_specific.h for details. - -#include "highwayhash/highwayhash_test_target.h" - -#include "highwayhash/highwayhash.h" - -#ifndef HH_DISABLE_TARGET_SPECIFIC -namespace highwayhash { -namespace { - -void NotifyIfUnequal(const size_t size, const HHResult64& expected, - const HHResult64& actual, const HHNotify notify) { - if (expected != actual) { - (*notify)(TargetName(HH_TARGET), size); - } -} - -// Overload for HHResult128 or HHResult256 (arrays). -template <size_t kNumLanes> -void NotifyIfUnequal(const size_t size, const uint64_t (&expected)[kNumLanes], - const uint64_t (&actual)[kNumLanes], - const HHNotify notify) { - for (size_t i = 0; i < kNumLanes; ++i) { - if (expected[i] != actual[i]) { - (*notify)(TargetName(HH_TARGET), size); - return; - } - } -} - -// Shared logic for all HighwayHashTest::operator() overloads. 
-template <typename Result> -void TestHighwayHash(HHStateT<HH_TARGET>* HH_RESTRICT state, - const char* HH_RESTRICT bytes, const size_t size, - const Result* expected, const HHNotify notify) { - Result actual; - HighwayHashT(state, bytes, size, &actual); - NotifyIfUnequal(size, *expected, actual, notify); -} - -// Shared logic for all HighwayHashCatTest::operator() overloads. -template <typename Result> -void TestHighwayHashCat(const HHKey& key, const char* HH_RESTRICT bytes, - const size_t size, const Result* expected, - const HHNotify notify) { - // Slightly faster to compute the expected prefix hashes only once. - // Use new instead of vector to avoid headers with inline functions. - Result* results = new Result[size + 1]; - for (size_t i = 0; i <= size; ++i) { - HHStateT<HH_TARGET> state_flat(key); - HighwayHashT(&state_flat, bytes, i, &results[i]); - } - - // Splitting into three fragments/Append should cover all codepaths. - const size_t max_fragment_size = size / 3; - for (size_t size1 = 0; size1 < max_fragment_size; ++size1) { - for (size_t size2 = 0; size2 < max_fragment_size; ++size2) { - for (size_t size3 = 0; size3 < max_fragment_size; ++size3) { - HighwayHashCatT<HH_TARGET> cat(key); - const char* pos = bytes; - cat.Append(pos, size1); - pos += size1; - cat.Append(pos, size2); - pos += size2; - cat.Append(pos, size3); - pos += size3; - - Result result_cat; - cat.Finalize(&result_cat); - - const size_t total_size = pos - bytes; - NotifyIfUnequal(total_size, results[total_size], result_cat, notify); - } - } - } - - delete[] results; -} - -} // namespace - -template <TargetBits Target> -void HighwayHashTest<Target>::operator()(const HHKey& key, - const char* HH_RESTRICT bytes, - const size_t size, - const HHResult64* expected, - const HHNotify notify) const { - HHStateT<Target> state(key); - TestHighwayHash(&state, bytes, size, expected, notify); -} - -template <TargetBits Target> -void HighwayHashTest<Target>::operator()(const HHKey& key, - const char* HH_RESTRICT bytes, - const size_t size, - const HHResult128* expected, - const HHNotify notify) const { - HHStateT<Target> state(key); - TestHighwayHash(&state, bytes, size, expected, notify); -} - -template <TargetBits Target> -void HighwayHashTest<Target>::operator()(const HHKey& key, - const char* HH_RESTRICT bytes, - const size_t size, - const HHResult256* expected, - const HHNotify notify) const { - HHStateT<Target> state(key); - TestHighwayHash(&state, bytes, size, expected, notify); -} - -template <TargetBits Target> -void HighwayHashCatTest<Target>::operator()(const HHKey& key, - const char* HH_RESTRICT bytes, - const uint64_t size, - const HHResult64* expected, - const HHNotify notify) const { - TestHighwayHashCat(key, bytes, size, expected, notify); -} - -template <TargetBits Target> -void HighwayHashCatTest<Target>::operator()(const HHKey& key, - const char* HH_RESTRICT bytes, - const uint64_t size, - const HHResult128* expected, - const HHNotify notify) const { - TestHighwayHashCat(key, bytes, size, expected, notify); -} - -template <TargetBits Target> -void HighwayHashCatTest<Target>::operator()(const HHKey& key, - const char* HH_RESTRICT bytes, - const uint64_t size, - const HHResult256* expected, - const HHNotify notify) const { - TestHighwayHashCat(key, bytes, size, expected, notify); -} - -// Instantiate for the current target. 
-template struct HighwayHashTest<HH_TARGET>; -template struct HighwayHashCatTest<HH_TARGET>; - -//----------------------------------------------------------------------------- -// benchmark - -namespace { - -template <TargetBits Target> -uint64_t RunHighway(const size_t size) { - static const HHKey key HH_ALIGNAS(32) = {0, 1, 2, 3}; - char in[kMaxBenchmarkInputSize]; - in[0] = static_cast<char>(size & 0xFF); - HHResult64 result; - HHStateT<Target> state(key); - HighwayHashT(&state, in, size, &result); - return result; -} - -template <TargetBits Target> -uint64_t RunHighwayCat(const size_t size) { - static const HHKey key HH_ALIGNAS(32) = {0, 1, 2, 3}; - HH_ALIGNAS(64) HighwayHashCatT<Target> cat(key); - char in[kMaxBenchmarkInputSize]; - in[0] = static_cast<char>(size & 0xFF); - const size_t half_size = size / 2; - cat.Append(in, half_size); - cat.Append(in + half_size, size - half_size); - HHResult64 result; - cat.Finalize(&result); - return result; -} - -} // namespace - -template <TargetBits Target> -void HighwayHashBenchmark<Target>::operator()(DurationsForInputs* input_map, - NotifyBenchmark notify, - void* context) const { - MeasureDurations(&RunHighway<Target>, input_map); - notify("HighwayHash", TargetName(Target), input_map, context); -} - -template <TargetBits Target> -void HighwayHashCatBenchmark<Target>::operator()(DurationsForInputs* input_map, - NotifyBenchmark notify, - void* context) const { - MeasureDurations(&RunHighwayCat<Target>, input_map); - notify("HighwayHashCat", TargetName(Target), input_map, context); -} - -// Instantiate for the current target. -template struct HighwayHashBenchmark<HH_TARGET>; -template struct HighwayHashCatBenchmark<HH_TARGET>; - -} // namespace highwayhash -#endif // HH_DISABLE_TARGET_SPECIFIC +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#include "highwayhash/highwayhash_test_target.h" + +#include "highwayhash/highwayhash.h" + +#ifndef HH_DISABLE_TARGET_SPECIFIC +namespace highwayhash { +namespace { + +void NotifyIfUnequal(const size_t size, const HHResult64& expected, + const HHResult64& actual, const HHNotify notify) { + if (expected != actual) { + (*notify)(TargetName(HH_TARGET), size); + } +} + +// Overload for HHResult128 or HHResult256 (arrays). +template <size_t kNumLanes> +void NotifyIfUnequal(const size_t size, const uint64_t (&expected)[kNumLanes], + const uint64_t (&actual)[kNumLanes], + const HHNotify notify) { + for (size_t i = 0; i < kNumLanes; ++i) { + if (expected[i] != actual[i]) { + (*notify)(TargetName(HH_TARGET), size); + return; + } + } +} + +// Shared logic for all HighwayHashTest::operator() overloads. 
+template <typename Result> +void TestHighwayHash(HHStateT<HH_TARGET>* HH_RESTRICT state, + const char* HH_RESTRICT bytes, const size_t size, + const Result* expected, const HHNotify notify) { + Result actual; + HighwayHashT(state, bytes, size, &actual); + NotifyIfUnequal(size, *expected, actual, notify); +} + +// Shared logic for all HighwayHashCatTest::operator() overloads. +template <typename Result> +void TestHighwayHashCat(const HHKey& key, const char* HH_RESTRICT bytes, + const size_t size, const Result* expected, + const HHNotify notify) { + // Slightly faster to compute the expected prefix hashes only once. + // Use new instead of vector to avoid headers with inline functions. + Result* results = new Result[size + 1]; + for (size_t i = 0; i <= size; ++i) { + HHStateT<HH_TARGET> state_flat(key); + HighwayHashT(&state_flat, bytes, i, &results[i]); + } + + // Splitting into three fragments/Append should cover all codepaths. + const size_t max_fragment_size = size / 3; + for (size_t size1 = 0; size1 < max_fragment_size; ++size1) { + for (size_t size2 = 0; size2 < max_fragment_size; ++size2) { + for (size_t size3 = 0; size3 < max_fragment_size; ++size3) { + HighwayHashCatT<HH_TARGET> cat(key); + const char* pos = bytes; + cat.Append(pos, size1); + pos += size1; + cat.Append(pos, size2); + pos += size2; + cat.Append(pos, size3); + pos += size3; + + Result result_cat; + cat.Finalize(&result_cat); + + const size_t total_size = pos - bytes; + NotifyIfUnequal(total_size, results[total_size], result_cat, notify); + } + } + } + + delete[] results; +} + +} // namespace + +template <TargetBits Target> +void HighwayHashTest<Target>::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const size_t size, + const HHResult64* expected, + const HHNotify notify) const { + HHStateT<Target> state(key); + TestHighwayHash(&state, bytes, size, expected, notify); +} + +template <TargetBits Target> +void HighwayHashTest<Target>::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const size_t size, + const HHResult128* expected, + const HHNotify notify) const { + HHStateT<Target> state(key); + TestHighwayHash(&state, bytes, size, expected, notify); +} + +template <TargetBits Target> +void HighwayHashTest<Target>::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const size_t size, + const HHResult256* expected, + const HHNotify notify) const { + HHStateT<Target> state(key); + TestHighwayHash(&state, bytes, size, expected, notify); +} + +template <TargetBits Target> +void HighwayHashCatTest<Target>::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const uint64_t size, + const HHResult64* expected, + const HHNotify notify) const { + TestHighwayHashCat(key, bytes, size, expected, notify); +} + +template <TargetBits Target> +void HighwayHashCatTest<Target>::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const uint64_t size, + const HHResult128* expected, + const HHNotify notify) const { + TestHighwayHashCat(key, bytes, size, expected, notify); +} + +template <TargetBits Target> +void HighwayHashCatTest<Target>::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const uint64_t size, + const HHResult256* expected, + const HHNotify notify) const { + TestHighwayHashCat(key, bytes, size, expected, notify); +} + +// Instantiate for the current target. 
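
// [Editorial sketch, not part of the imported sources.] The streaming
// Append/Finalize behavior exercised above is also reachable without
// restricted headers, via the HighwayHashCat dispatcher and StringView
// fragments declared in highwayhash_target.h; hashing the two fragments below
// should match hashing "key1=val1" in a single call:
//
//   void CatExample(const highwayhash::HHKey& key) {
//     using namespace highwayhash;
//     const StringView fragments[2] = {{"key1=", 5}, {"val1", 4}};
//     HHResult64 result;
//     InstructionSets::Run<HighwayHashCat>(key, fragments, 2, &result);
//   }
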
+template struct HighwayHashTest<HH_TARGET>; +template struct HighwayHashCatTest<HH_TARGET>; + +//----------------------------------------------------------------------------- +// benchmark + +namespace { + +template <TargetBits Target> +uint64_t RunHighway(const size_t size) { + static const HHKey key HH_ALIGNAS(32) = {0, 1, 2, 3}; + char in[kMaxBenchmarkInputSize]; + in[0] = static_cast<char>(size & 0xFF); + HHResult64 result; + HHStateT<Target> state(key); + HighwayHashT(&state, in, size, &result); + return result; +} + +template <TargetBits Target> +uint64_t RunHighwayCat(const size_t size) { + static const HHKey key HH_ALIGNAS(32) = {0, 1, 2, 3}; + HH_ALIGNAS(64) HighwayHashCatT<Target> cat(key); + char in[kMaxBenchmarkInputSize]; + in[0] = static_cast<char>(size & 0xFF); + const size_t half_size = size / 2; + cat.Append(in, half_size); + cat.Append(in + half_size, size - half_size); + HHResult64 result; + cat.Finalize(&result); + return result; +} + +} // namespace + +template <TargetBits Target> +void HighwayHashBenchmark<Target>::operator()(DurationsForInputs* input_map, + NotifyBenchmark notify, + void* context) const { + MeasureDurations(&RunHighway<Target>, input_map); + notify("HighwayHash", TargetName(Target), input_map, context); +} + +template <TargetBits Target> +void HighwayHashCatBenchmark<Target>::operator()(DurationsForInputs* input_map, + NotifyBenchmark notify, + void* context) const { + MeasureDurations(&RunHighwayCat<Target>, input_map); + notify("HighwayHashCat", TargetName(Target), input_map, context); +} + +// Instantiate for the current target. +template struct HighwayHashBenchmark<HH_TARGET>; +template struct HighwayHashCatBenchmark<HH_TARGET>; + +} // namespace highwayhash +#endif // HH_DISABLE_TARGET_SPECIFIC diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.h b/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.h index b89695d346..88cca8c168 100644 --- a/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.h +++ b/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.h @@ -1,89 +1,89 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ -#define HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ - -// Tests called by InstructionSets::RunAll, so we can verify all -// implementations supported by the current CPU. - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include <stddef.h> - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" -#include "highwayhash/hh_types.h" -#include "highwayhash/nanobenchmark.h" - -namespace highwayhash { - -// Verifies the hash result matches "expected" and calls "notify" if not. 
-template <TargetBits Target> -struct HighwayHashTest { - void operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const size_t size, const HHResult64* expected, - const HHNotify notify) const; - void operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const size_t size, const HHResult128* expected, - const HHNotify notify) const; - void operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const size_t size, const HHResult256* expected, - const HHNotify notify) const; -}; - -// For every possible partition of "bytes" into zero to three fragments, -// verifies HighwayHashCat returns the same result as HighwayHashT of the -// concatenated fragments, and calls "notify" if not. The value of "expected" -// is ignored; it is only used for overloading. -template <TargetBits Target> -struct HighwayHashCatTest { - void operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const uint64_t size, const HHResult64* expected, - const HHNotify notify) const; - void operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const uint64_t size, const HHResult128* expected, - const HHNotify notify) const; - void operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const uint64_t size, const HHResult256* expected, - const HHNotify notify) const; -}; - -// Called by benchmark with prefix, target_name, input_map, context. -// This function must set input_map->num_items to 0. -using NotifyBenchmark = void (*)(const char*, const char*, DurationsForInputs*, - void*); - -constexpr size_t kMaxBenchmarkInputSize = 1024; - -// Calls "notify" with benchmark results for the input sizes specified by -// "input_map" (<= kMaxBenchmarkInputSize) plus a "context" parameter. -template <TargetBits Target> -struct HighwayHashBenchmark { - void operator()(DurationsForInputs* input_map, NotifyBenchmark notify, - void* context) const; -}; - -template <TargetBits Target> -struct HighwayHashCatBenchmark { - void operator()(DurationsForInputs* input_map, NotifyBenchmark notify, - void* context) const; -}; - -} // namespace highwayhash - -#endif // HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ +#define HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ + +// Tests called by InstructionSets::RunAll, so we can verify all +// implementations supported by the current CPU. + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include <stddef.h> + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" +#include "highwayhash/hh_types.h" +#include "highwayhash/nanobenchmark.h" + +namespace highwayhash { + +// Verifies the hash result matches "expected" and calls "notify" if not. 
+template <TargetBits Target> +struct HighwayHashTest { + void operator()(const HHKey& key, const char* HH_RESTRICT bytes, + const size_t size, const HHResult64* expected, + const HHNotify notify) const; + void operator()(const HHKey& key, const char* HH_RESTRICT bytes, + const size_t size, const HHResult128* expected, + const HHNotify notify) const; + void operator()(const HHKey& key, const char* HH_RESTRICT bytes, + const size_t size, const HHResult256* expected, + const HHNotify notify) const; +}; + +// For every possible partition of "bytes" into zero to three fragments, +// verifies HighwayHashCat returns the same result as HighwayHashT of the +// concatenated fragments, and calls "notify" if not. The value of "expected" +// is ignored; it is only used for overloading. +template <TargetBits Target> +struct HighwayHashCatTest { + void operator()(const HHKey& key, const char* HH_RESTRICT bytes, + const uint64_t size, const HHResult64* expected, + const HHNotify notify) const; + void operator()(const HHKey& key, const char* HH_RESTRICT bytes, + const uint64_t size, const HHResult128* expected, + const HHNotify notify) const; + void operator()(const HHKey& key, const char* HH_RESTRICT bytes, + const uint64_t size, const HHResult256* expected, + const HHNotify notify) const; +}; + +// Called by benchmark with prefix, target_name, input_map, context. +// This function must set input_map->num_items to 0. +using NotifyBenchmark = void (*)(const char*, const char*, DurationsForInputs*, + void*); + +constexpr size_t kMaxBenchmarkInputSize = 1024; + +// Calls "notify" with benchmark results for the input sizes specified by +// "input_map" (<= kMaxBenchmarkInputSize) plus a "context" parameter. +template <TargetBits Target> +struct HighwayHashBenchmark { + void operator()(DurationsForInputs* input_map, NotifyBenchmark notify, + void* context) const; +}; + +template <TargetBits Target> +struct HighwayHashCatBenchmark { + void operator()(DurationsForInputs* input_map, NotifyBenchmark notify, + void* context) const; +}; + +} // namespace highwayhash + +#endif // HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ diff --git a/contrib/libs/highwayhash/highwayhash/iaca.h b/contrib/libs/highwayhash/highwayhash/iaca.h index 80e1013ae0..3a075544d4 100644 --- a/contrib/libs/highwayhash/highwayhash/iaca.h +++ b/contrib/libs/highwayhash/highwayhash/iaca.h @@ -1,63 +1,63 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_IACA_H_ -#define HIGHWAYHASH_IACA_H_ - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include "highwayhash/compiler_specific.h" - -// IACA (Intel's Code Analyzer, go/intel-iaca) analyzes instruction latencies, -// but only for code between special markers. 
These functions embed such markers -// in an executable, but only for reading via IACA - they deliberately trigger -// a crash if executed to ensure they are removed in normal builds. - -// Default off; callers must `#define HH_ENABLE_IACA 1` before including this. -#ifndef HH_ENABLE_IACA -#define HH_ENABLE_IACA 0 -#endif - -namespace highwayhash { - -#if HH_ENABLE_IACA && (HH_GCC_VERSION || HH_CLANG_VERSION) - -// Call before the region of interest. Fences hopefully prevent reordering. -static HH_INLINE void BeginIACA() { - HH_COMPILER_FENCE; - asm volatile( - ".byte 0x0F, 0x0B\n\t" // UD2 - "movl $111, %ebx\n\t" - ".byte 0x64, 0x67, 0x90\n\t"); - HH_COMPILER_FENCE; -} - -// Call after the region of interest. Fences hopefully prevent reordering. -static HH_INLINE void EndIACA() { - HH_COMPILER_FENCE; - asm volatile( - "movl $222, %ebx\n\t" - ".byte 0x64, 0x67, 0x90\n\t" - ".byte 0x0F, 0x0B\n\t"); // UD2 - HH_COMPILER_FENCE; -} - -#endif - -} // namespace highwayhash - -#endif // HIGHWAYHASH_IACA_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_IACA_H_ +#define HIGHWAYHASH_IACA_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include "highwayhash/compiler_specific.h" + +// IACA (Intel's Code Analyzer, go/intel-iaca) analyzes instruction latencies, +// but only for code between special markers. These functions embed such markers +// in an executable, but only for reading via IACA - they deliberately trigger +// a crash if executed to ensure they are removed in normal builds. + +// Default off; callers must `#define HH_ENABLE_IACA 1` before including this. +#ifndef HH_ENABLE_IACA +#define HH_ENABLE_IACA 0 +#endif + +namespace highwayhash { + +#if HH_ENABLE_IACA && (HH_GCC_VERSION || HH_CLANG_VERSION) + +// Call before the region of interest. Fences hopefully prevent reordering. +static HH_INLINE void BeginIACA() { + HH_COMPILER_FENCE; + asm volatile( + ".byte 0x0F, 0x0B\n\t" // UD2 + "movl $111, %ebx\n\t" + ".byte 0x64, 0x67, 0x90\n\t"); + HH_COMPILER_FENCE; +} + +// Call after the region of interest. Fences hopefully prevent reordering. +static HH_INLINE void EndIACA() { + HH_COMPILER_FENCE; + asm volatile( + "movl $222, %ebx\n\t" + ".byte 0x64, 0x67, 0x90\n\t" + ".byte 0x0F, 0x0B\n\t"); // UD2 + HH_COMPILER_FENCE; +} + +#endif + +} // namespace highwayhash + +#endif // HIGHWAYHASH_IACA_H_ diff --git a/contrib/libs/highwayhash/highwayhash/instruction_sets.cc b/contrib/libs/highwayhash/highwayhash/instruction_sets.cc index a02e1f81d9..5760cd6303 100644 --- a/contrib/libs/highwayhash/highwayhash/instruction_sets.cc +++ b/contrib/libs/highwayhash/highwayhash/instruction_sets.cc @@ -1,141 +1,141 @@ -// Copyright 2017 Google Inc. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "highwayhash/instruction_sets.h" -#include "highwayhash/arch_specific.h" - -// Currently there are only specialized targets for X64; other architectures -// only use HH_TARGET_Portable, in which case Supported() just returns that. -#if HH_ARCH_X64 - -#include <atomic> - -namespace highwayhash { - -namespace { - -bool IsBitSet(const uint32_t reg, const int index) { - return (reg & (1U << index)) != 0; -} - -// Returns the lower 32 bits of extended control register 0. -// Requires CPU support for "OSXSAVE" (see below). -uint32_t ReadXCR0() { -#if HH_MSC_VERSION - return static_cast<uint32_t>(_xgetbv(0)); -#else - uint32_t xcr0, xcr0_high; - const uint32_t index = 0; - asm volatile(".byte 0x0F, 0x01, 0xD0" - : "=a"(xcr0), "=d"(xcr0_high) - : "c"(index)); - return xcr0; -#endif -} - -// 0 iff not yet initialized by Supported(). -// Not function-local => no compiler-generated locking. -std::atomic<TargetBits> supported_{0}; - -// Bits indicating which instruction set extensions are supported. -enum { - kBitSSE = 1 << 0, - kBitSSE2 = 1 << 1, - kBitSSE3 = 1 << 2, - kBitSSSE3 = 1 << 3, - kBitSSE41 = 1 << 4, - kBitSSE42 = 1 << 5, - kBitAVX = 1 << 6, - kBitAVX2 = 1 << 7, - kBitFMA = 1 << 8, - kBitLZCNT = 1 << 9, - kBitBMI = 1 << 10, - kBitBMI2 = 1 << 11, - - kGroupAVX2 = kBitAVX | kBitAVX2 | kBitFMA | kBitLZCNT | kBitBMI | kBitBMI2, - kGroupSSE41 = kBitSSE | kBitSSE2 | kBitSSE3 | kBitSSSE3 | kBitSSE41 -}; - -} // namespace - -TargetBits InstructionSets::Supported() { - TargetBits supported = supported_.load(std::memory_order_acquire); - // Already initialized, return that. - if (HH_LIKELY(supported)) { - return supported; - } - - uint32_t flags = 0; - uint32_t abcd[4]; - - Cpuid(0, 0, abcd); - const uint32_t max_level = abcd[0]; - - // Standard feature flags - Cpuid(1, 0, abcd); - flags |= IsBitSet(abcd[3], 25) ? kBitSSE : 0; - flags |= IsBitSet(abcd[3], 26) ? kBitSSE2 : 0; - flags |= IsBitSet(abcd[2], 0) ? kBitSSE3 : 0; - flags |= IsBitSet(abcd[2], 9) ? kBitSSSE3 : 0; - flags |= IsBitSet(abcd[2], 19) ? kBitSSE41 : 0; - flags |= IsBitSet(abcd[2], 20) ? kBitSSE42 : 0; - flags |= IsBitSet(abcd[2], 12) ? kBitFMA : 0; - flags |= IsBitSet(abcd[2], 28) ? kBitAVX : 0; - const bool has_osxsave = IsBitSet(abcd[2], 27); - - // Extended feature flags - Cpuid(0x80000001U, 0, abcd); - flags |= IsBitSet(abcd[2], 5) ? kBitLZCNT : 0; - - // Extended features - if (max_level >= 7) { - Cpuid(7, 0, abcd); - flags |= IsBitSet(abcd[1], 3) ? kBitBMI : 0; - flags |= IsBitSet(abcd[1], 5) ? kBitAVX2 : 0; - flags |= IsBitSet(abcd[1], 8) ? kBitBMI2 : 0; - } - - // Verify OS support for XSAVE, without which XMM/YMM registers are not - // preserved across context switches and are not safe to use. 
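 - // Per the Intel SDM, XCR0 bit 1 (mask 2) covers XMM state and bit 2
 - // (mask 4) covers YMM state. E.g. xcr0 == 7 (x87|XMM|YMM) on an
 - // AVX-capable OS; with xcr0 == 3 the OS does not save YMM registers, so
 - // the AVX/AVX2 bits must be cleared.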
- if (has_osxsave) { - const uint32_t xcr0 = ReadXCR0(); - // XMM - if ((xcr0 & 2) == 0) { - flags &= ~(kBitSSE | kBitSSE2 | kBitSSE3 | kBitSSSE3 | kBitSSE41 | - kBitSSE42 | kBitAVX | kBitAVX2 | kBitFMA); - } - // YMM - if ((xcr0 & 4) == 0) { - flags &= ~(kBitAVX | kBitAVX2); - } - } - - // Also indicates "supported" has been initialized. - supported = HH_TARGET_Portable; - - // Set target bit(s) if all their group's flags are all set. - if ((flags & kGroupAVX2) == kGroupAVX2) { - supported |= HH_TARGET_AVX2; - } - if ((flags & kGroupSSE41) == kGroupSSE41) { - supported |= HH_TARGET_SSE41; - } - - supported_.store(supported, std::memory_order_release); - return supported; -} - -} // namespace highwayhash - -#endif // HH_ARCH_X64 +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "highwayhash/instruction_sets.h" +#include "highwayhash/arch_specific.h" + +// Currently there are only specialized targets for X64; other architectures +// only use HH_TARGET_Portable, in which case Supported() just returns that. +#if HH_ARCH_X64 + +#include <atomic> + +namespace highwayhash { + +namespace { + +bool IsBitSet(const uint32_t reg, const int index) { + return (reg & (1U << index)) != 0; +} + +// Returns the lower 32 bits of extended control register 0. +// Requires CPU support for "OSXSAVE" (see below). +uint32_t ReadXCR0() { +#if HH_MSC_VERSION + return static_cast<uint32_t>(_xgetbv(0)); +#else + uint32_t xcr0, xcr0_high; + const uint32_t index = 0; + asm volatile(".byte 0x0F, 0x01, 0xD0" + : "=a"(xcr0), "=d"(xcr0_high) + : "c"(index)); + return xcr0; +#endif +} + +// 0 iff not yet initialized by Supported(). +// Not function-local => no compiler-generated locking. +std::atomic<TargetBits> supported_{0}; + +// Bits indicating which instruction set extensions are supported. +enum { + kBitSSE = 1 << 0, + kBitSSE2 = 1 << 1, + kBitSSE3 = 1 << 2, + kBitSSSE3 = 1 << 3, + kBitSSE41 = 1 << 4, + kBitSSE42 = 1 << 5, + kBitAVX = 1 << 6, + kBitAVX2 = 1 << 7, + kBitFMA = 1 << 8, + kBitLZCNT = 1 << 9, + kBitBMI = 1 << 10, + kBitBMI2 = 1 << 11, + + kGroupAVX2 = kBitAVX | kBitAVX2 | kBitFMA | kBitLZCNT | kBitBMI | kBitBMI2, + kGroupSSE41 = kBitSSE | kBitSSE2 | kBitSSE3 | kBitSSSE3 | kBitSSE41 +}; + +} // namespace + +TargetBits InstructionSets::Supported() { + TargetBits supported = supported_.load(std::memory_order_acquire); + // Already initialized, return that. + if (HH_LIKELY(supported)) { + return supported; + } + + uint32_t flags = 0; + uint32_t abcd[4]; + + Cpuid(0, 0, abcd); + const uint32_t max_level = abcd[0]; + + // Standard feature flags + Cpuid(1, 0, abcd); + flags |= IsBitSet(abcd[3], 25) ? kBitSSE : 0; + flags |= IsBitSet(abcd[3], 26) ? kBitSSE2 : 0; + flags |= IsBitSet(abcd[2], 0) ? kBitSSE3 : 0; + flags |= IsBitSet(abcd[2], 9) ? kBitSSSE3 : 0; + flags |= IsBitSet(abcd[2], 19) ? kBitSSE41 : 0; + flags |= IsBitSet(abcd[2], 20) ? kBitSSE42 : 0; + flags |= IsBitSet(abcd[2], 12) ? kBitFMA : 0; + flags |= IsBitSet(abcd[2], 28) ? 
kBitAVX : 0; + const bool has_osxsave = IsBitSet(abcd[2], 27); + + // Extended feature flags + Cpuid(0x80000001U, 0, abcd); + flags |= IsBitSet(abcd[2], 5) ? kBitLZCNT : 0; + + // Extended features + if (max_level >= 7) { + Cpuid(7, 0, abcd); + flags |= IsBitSet(abcd[1], 3) ? kBitBMI : 0; + flags |= IsBitSet(abcd[1], 5) ? kBitAVX2 : 0; + flags |= IsBitSet(abcd[1], 8) ? kBitBMI2 : 0; + } + + // Verify OS support for XSAVE, without which XMM/YMM registers are not + // preserved across context switches and are not safe to use. + if (has_osxsave) { + const uint32_t xcr0 = ReadXCR0(); + // XMM + if ((xcr0 & 2) == 0) { + flags &= ~(kBitSSE | kBitSSE2 | kBitSSE3 | kBitSSSE3 | kBitSSE41 | + kBitSSE42 | kBitAVX | kBitAVX2 | kBitFMA); + } + // YMM + if ((xcr0 & 4) == 0) { + flags &= ~(kBitAVX | kBitAVX2); + } + } + + // Also indicates "supported" has been initialized. + supported = HH_TARGET_Portable; + + // Set target bit(s) if all their group's flags are all set. + if ((flags & kGroupAVX2) == kGroupAVX2) { + supported |= HH_TARGET_AVX2; + } + if ((flags & kGroupSSE41) == kGroupSSE41) { + supported |= HH_TARGET_SSE41; + } + + supported_.store(supported, std::memory_order_release); + return supported; +} + +} // namespace highwayhash + +#endif // HH_ARCH_X64 diff --git a/contrib/libs/highwayhash/highwayhash/instruction_sets.h b/contrib/libs/highwayhash/highwayhash/instruction_sets.h index 88bc1bc374..5d2251b654 100644 --- a/contrib/libs/highwayhash/highwayhash/instruction_sets.h +++ b/contrib/libs/highwayhash/highwayhash/instruction_sets.h @@ -1,88 +1,88 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_INSTRUCTION_SETS_H_ -#define HIGHWAYHASH_INSTRUCTION_SETS_H_ - -// Calls the best specialization of a template supported by the current CPU. -// -// Usage: for each dispatch site, declare a Functor template with a 'Target' -// argument, add a source file defining its operator() and instantiating -// Functor<HH_TARGET>, add a cc_library_for_targets rule for that source file, -// and call InstructionSets::Run<Functor>(/*args*/). - -#include <utility> // std::forward - -#include "highwayhash/arch_specific.h" // HH_TARGET_* -#include "highwayhash/compiler_specific.h" - -namespace highwayhash { - -// Detects TargetBits and calls specializations of a user-defined functor. -class InstructionSets { - public: -// Returns bit array of HH_TARGET_* supported by the current CPU. -// The HH_TARGET_Portable bit is guaranteed to be set. -#if HH_ARCH_X64 - static TargetBits Supported(); -#else - static HH_INLINE TargetBits Supported() { return HH_TARGET_Portable; } -#endif - - // Chooses the best available "Target" for the current CPU, runs the - // corresponding Func<Target>::operator()(args) and returns that Target - // (a single bit). The overhead of dispatching is low, about 4 cycles, but - // this should only be called infrequently (e.g. hoisting it out of loops). - template <template <TargetBits> class Func, typename... 
Args> - static HH_INLINE TargetBits Run(Args&&... args) { -#if HH_ARCH_X64 - const TargetBits supported = Supported(); - if (supported & HH_TARGET_AVX2) { - Func<HH_TARGET_AVX2>()(std::forward<Args>(args)...); - return HH_TARGET_AVX2; - } - if (supported & HH_TARGET_SSE41) { - Func<HH_TARGET_SSE41>()(std::forward<Args>(args)...); - return HH_TARGET_SSE41; - } -#endif // HH_ARCH_X64 - - Func<HH_TARGET_Portable>()(std::forward<Args>(args)...); - return HH_TARGET_Portable; - } - - // Calls Func<Target>::operator()(args) for all Target supported by the - // current CPU, and returns their HH_TARGET_* bits. - template <template <TargetBits> class Func, typename... Args> - static HH_INLINE TargetBits RunAll(Args&&... args) { -#if HH_ARCH_X64 - const TargetBits supported = Supported(); - if (supported & HH_TARGET_AVX2) { - Func<HH_TARGET_AVX2>()(std::forward<Args>(args)...); - } - if (supported & HH_TARGET_SSE41) { - Func<HH_TARGET_SSE41>()(std::forward<Args>(args)...); - } -#else - const TargetBits supported = HH_TARGET_Portable; -#endif // HH_ARCH_X64 - - Func<HH_TARGET_Portable>()(std::forward<Args>(args)...); - return supported; // i.e. all that were run - } -}; - -} // namespace highwayhash - -#endif // HIGHWAYHASH_INSTRUCTION_SETS_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_INSTRUCTION_SETS_H_ +#define HIGHWAYHASH_INSTRUCTION_SETS_H_ + +// Calls the best specialization of a template supported by the current CPU. +// +// Usage: for each dispatch site, declare a Functor template with a 'Target' +// argument, add a source file defining its operator() and instantiating +// Functor<HH_TARGET>, add a cc_library_for_targets rule for that source file, +// and call InstructionSets::Run<Functor>(/*args*/). + +#include <utility> // std::forward + +#include "highwayhash/arch_specific.h" // HH_TARGET_* +#include "highwayhash/compiler_specific.h" + +namespace highwayhash { + +// Detects TargetBits and calls specializations of a user-defined functor. +class InstructionSets { + public: +// Returns bit array of HH_TARGET_* supported by the current CPU. +// The HH_TARGET_Portable bit is guaranteed to be set. +#if HH_ARCH_X64 + static TargetBits Supported(); +#else + static HH_INLINE TargetBits Supported() { return HH_TARGET_Portable; } +#endif + + // Chooses the best available "Target" for the current CPU, runs the + // corresponding Func<Target>::operator()(args) and returns that Target + // (a single bit). The overhead of dispatching is low, about 4 cycles, but + // this should only be called infrequently (e.g. hoisting it out of loops). + template <template <TargetBits> class Func, typename... Args> + static HH_INLINE TargetBits Run(Args&&... 
args) { +#if HH_ARCH_X64 + const TargetBits supported = Supported(); + if (supported & HH_TARGET_AVX2) { + Func<HH_TARGET_AVX2>()(std::forward<Args>(args)...); + return HH_TARGET_AVX2; + } + if (supported & HH_TARGET_SSE41) { + Func<HH_TARGET_SSE41>()(std::forward<Args>(args)...); + return HH_TARGET_SSE41; + } +#endif // HH_ARCH_X64 + + Func<HH_TARGET_Portable>()(std::forward<Args>(args)...); + return HH_TARGET_Portable; + } + + // Calls Func<Target>::operator()(args) for all Target supported by the + // current CPU, and returns their HH_TARGET_* bits. + template <template <TargetBits> class Func, typename... Args> + static HH_INLINE TargetBits RunAll(Args&&... args) { +#if HH_ARCH_X64 + const TargetBits supported = Supported(); + if (supported & HH_TARGET_AVX2) { + Func<HH_TARGET_AVX2>()(std::forward<Args>(args)...); + } + if (supported & HH_TARGET_SSE41) { + Func<HH_TARGET_SSE41>()(std::forward<Args>(args)...); + } +#else + const TargetBits supported = HH_TARGET_Portable; +#endif // HH_ARCH_X64 + + Func<HH_TARGET_Portable>()(std::forward<Args>(args)...); + return supported; // i.e. all that were run + } +}; + +} // namespace highwayhash + +#endif // HIGHWAYHASH_INSTRUCTION_SETS_H_ diff --git a/contrib/libs/highwayhash/highwayhash/load3.h b/contrib/libs/highwayhash/highwayhash/load3.h index e226b19520..0bf0da9c4d 100644 --- a/contrib/libs/highwayhash/highwayhash/load3.h +++ b/contrib/libs/highwayhash/highwayhash/load3.h @@ -1,144 +1,144 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_HH_LOAD3_H_ -#define HIGHWAYHASH_HH_LOAD3_H_ - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include <stddef.h> -#include <stdint.h> - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" -#include "highwayhash/endianess.h" - -namespace highwayhash { -// To prevent ODR violations when including this from multiple translation -// units (TU) that are compiled with different flags, the contents must reside -// in a namespace whose name is unique to the TU. NOTE: this behavior is -// incompatible with precompiled modules and requires textual inclusion instead. -namespace HH_TARGET_NAME { - -// Loads 0 to 3 bytes from a given location using one of several policies. -// These are potentially faster than 8-bit loads, but require certain additional -// promises by the caller: that 'out of bounds' memory accesses are allowed, -// and/or that the bytes may be permuted or duplicated. -class Load3 { - public: - // In increasing order of complexity: - struct AllowReadBeforeAndReturn {}; - struct AllowReadBefore {}; - struct AllowUnordered {}; - struct AllowNone {}; - - // Up to 4 preceding bytes may be read and returned along with the 0..3 - // valid bytes. 
The valid bytes are in little-endian order, except that the - // preceding bytes occupy the least-significant bytes. - HH_INLINE uint32_t operator()(AllowReadBeforeAndReturn, const char* from, - const size_t size_mod4) { - // It's safe to read before "from", so we can load 32 bits, which is faster - // than individual byte loads. We assume little-endian byte order, so - // big-endian platforms will need to swap. Type punning can generate - // incorrect code if compiled with strict aliasing; the only safe - // alternatives are memcpy and reading through char*. We must avoid memcpy - // because string.h must not be included per the warning above. On GCC and - // Clang, we can use a builtin instead. - uint32_t last4; - Copy(from + size_mod4 - 4, 4, reinterpret_cast<char*>(&last4)); - return host_from_le32(last4); - } - - // As above, but preceding bytes are removed and upper byte(s) are zero. - HH_INLINE uint64_t operator()(AllowReadBefore, const char* from, - const size_t size_mod4) { - // Shift 0..3 valid bytes into LSB as if loaded in little-endian order. - // 64-bit type enables 32-bit shift when size_mod4 == 0. - uint64_t last3 = operator()(AllowReadBeforeAndReturn(), from, size_mod4); - last3 >>= 32 - (size_mod4 * 8); - return last3; - } - - // The bytes need not be loaded in little-endian order. This particular order - // (and the duplication of some bytes depending on "size_mod4") was chosen for - // computational convenience and can no longer be changed because it is part - // of the HighwayHash length padding definition. - HH_INLINE uint64_t operator()(AllowUnordered, const char* from, - const size_t size_mod4) { - uint64_t last3 = 0; - // Not allowed to read any bytes; early-out is faster than reading from a - // constant array of zeros. - if (size_mod4 == 0) { - return last3; - } - - // These indices are chosen as an easy-to-compute sequence containing the - // same elements as [0, size), but repeated and/or reordered. This enables - // unconditional loads, which outperform conditional 8 or 16+8 bit loads. - const uint64_t idx0 = 0; - const uint64_t idx1 = size_mod4 >> 1; - const uint64_t idx2 = size_mod4 - 1; - // Store into least significant bytes (avoids one shift). - last3 = static_cast<uint64_t>(from[idx0]); - last3 += static_cast<uint64_t>(from[idx1]) << 8; - last3 += static_cast<uint64_t>(from[idx2]) << 16; - return last3; - } - - // Must read exactly [0, size) bytes in little-endian order. - HH_INLINE uint64_t operator()(AllowNone, const char* from, - const size_t size_mod4) { - // We need to load in little-endian order without accessing anything outside - // [from, from + size_mod4). Unrolling is faster than looping backwards. 
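 - // Worked example (assumed input): size_mod4 == 3 with bytes
 - // {0x01, 0x02, 0x03} accumulates 0x01 + (0x02 << 8) + (0x03 << 16)
 - // == 0x030201, exactly what a little-endian 3-byte load would return.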
- uint64_t last3 = 0; - if (size_mod4 >= 1) { - last3 += U64FromChar(from[0]); - } - if (size_mod4 >= 2) { - last3 += U64FromChar(from[1]) << 8; - } - if (size_mod4 == 3) { - last3 += U64FromChar(from[2]) << 16; - } - return last3; - } - - private: - static HH_INLINE uint32_t U32FromChar(const char c) { - return static_cast<uint32_t>(static_cast<unsigned char>(c)); - } - - static HH_INLINE uint64_t U64FromChar(const char c) { - return static_cast<uint64_t>(static_cast<unsigned char>(c)); - } - - static HH_INLINE void Copy(const char* HH_RESTRICT from, const size_t size, - char* HH_RESTRICT to) { -#if HH_MSC_VERSION - for (size_t i = 0; i < size; ++i) { - to[i] = from[i]; - } -#else - __builtin_memcpy(to, from, size); -#endif - } -}; - -} // namespace HH_TARGET_NAME -} // namespace highwayhash - -#endif // HIGHWAYHASH_LOAD3_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_HH_LOAD3_H_ +#define HIGHWAYHASH_HH_LOAD3_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include <stddef.h> +#include <stdint.h> + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" +#include "highwayhash/endianess.h" + +namespace highwayhash { +// To prevent ODR violations when including this from multiple translation +// units (TU) that are compiled with different flags, the contents must reside +// in a namespace whose name is unique to the TU. NOTE: this behavior is +// incompatible with precompiled modules and requires textual inclusion instead. +namespace HH_TARGET_NAME { + +// Loads 0 to 3 bytes from a given location using one of several policies. +// These are potentially faster than 8-bit loads, but require certain additional +// promises by the caller: that 'out of bounds' memory accesses are allowed, +// and/or that the bytes may be permuted or duplicated. +class Load3 { + public: + // In increasing order of complexity: + struct AllowReadBeforeAndReturn {}; + struct AllowReadBefore {}; + struct AllowUnordered {}; + struct AllowNone {}; + + // Up to 4 preceding bytes may be read and returned along with the 0..3 + // valid bytes. The valid bytes are in little-endian order, except that the + // preceding bytes occupy the least-significant bytes. + HH_INLINE uint32_t operator()(AllowReadBeforeAndReturn, const char* from, + const size_t size_mod4) { + // It's safe to read before "from", so we can load 32 bits, which is faster + // than individual byte loads. We assume little-endian byte order, so + // big-endian platforms will need to swap. Type punning can generate + // incorrect code if compiled with strict aliasing; the only safe + // alternatives are memcpy and reading through char*. 
We must avoid memcpy + // because string.h must not be included per the warning above. On GCC and + // Clang, we can use a builtin instead. + uint32_t last4; + Copy(from + size_mod4 - 4, 4, reinterpret_cast<char*>(&last4)); + return host_from_le32(last4); + } + + // As above, but preceding bytes are removed and upper byte(s) are zero. + HH_INLINE uint64_t operator()(AllowReadBefore, const char* from, + const size_t size_mod4) { + // Shift 0..3 valid bytes into LSB as if loaded in little-endian order. + // 64-bit type enables 32-bit shift when size_mod4 == 0. + uint64_t last3 = operator()(AllowReadBeforeAndReturn(), from, size_mod4); + last3 >>= 32 - (size_mod4 * 8); + return last3; + } + + // The bytes need not be loaded in little-endian order. This particular order + // (and the duplication of some bytes depending on "size_mod4") was chosen for + // computational convenience and can no longer be changed because it is part + // of the HighwayHash length padding definition. + HH_INLINE uint64_t operator()(AllowUnordered, const char* from, + const size_t size_mod4) { + uint64_t last3 = 0; + // Not allowed to read any bytes; early-out is faster than reading from a + // constant array of zeros. + if (size_mod4 == 0) { + return last3; + } + + // These indices are chosen as an easy-to-compute sequence containing the + // same elements as [0, size), but repeated and/or reordered. This enables + // unconditional loads, which outperform conditional 8 or 16+8 bit loads. + const uint64_t idx0 = 0; + const uint64_t idx1 = size_mod4 >> 1; + const uint64_t idx2 = size_mod4 - 1; + // Store into least significant bytes (avoids one shift). + last3 = static_cast<uint64_t>(from[idx0]); + last3 += static_cast<uint64_t>(from[idx1]) << 8; + last3 += static_cast<uint64_t>(from[idx2]) << 16; + return last3; + } + + // Must read exactly [0, size) bytes in little-endian order. + HH_INLINE uint64_t operator()(AllowNone, const char* from, + const size_t size_mod4) { + // We need to load in little-endian order without accessing anything outside + // [from, from + size_mod4). Unrolling is faster than looping backwards. + uint64_t last3 = 0; + if (size_mod4 >= 1) { + last3 += U64FromChar(from[0]); + } + if (size_mod4 >= 2) { + last3 += U64FromChar(from[1]) << 8; + } + if (size_mod4 == 3) { + last3 += U64FromChar(from[2]) << 16; + } + return last3; + } + + private: + static HH_INLINE uint32_t U32FromChar(const char c) { + return static_cast<uint32_t>(static_cast<unsigned char>(c)); + } + + static HH_INLINE uint64_t U64FromChar(const char c) { + return static_cast<uint64_t>(static_cast<unsigned char>(c)); + } + + static HH_INLINE void Copy(const char* HH_RESTRICT from, const size_t size, + char* HH_RESTRICT to) { +#if HH_MSC_VERSION + for (size_t i = 0; i < size; ++i) { + to[i] = from[i]; + } +#else + __builtin_memcpy(to, from, size); +#endif + } +}; + +} // namespace HH_TARGET_NAME +} // namespace highwayhash + +#endif // HIGHWAYHASH_LOAD3_H_ diff --git a/contrib/libs/highwayhash/highwayhash/nanobenchmark.cc b/contrib/libs/highwayhash/highwayhash/nanobenchmark.cc index f0ba6ad35b..5929016f6f 100644 --- a/contrib/libs/highwayhash/highwayhash/nanobenchmark.cc +++ b/contrib/libs/highwayhash/highwayhash/nanobenchmark.cc @@ -1,437 +1,437 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "highwayhash/nanobenchmark.h" - -#include <algorithm> -#include <cmath> -#include <cstdio> -#include <map> -#include <random> -#include <vector> - -#include <stddef.h> - -#include "highwayhash/os_specific.h" -#include "highwayhash/robust_statistics.h" -#include "highwayhash/tsc_timer.h" - -namespace highwayhash { -namespace { - -// Enables sanity checks that verify correct operation at the cost of -// longer benchmark runs. -#ifndef NANOBENCHMARK_ENABLE_CHECKS -#define NANOBENCHMARK_ENABLE_CHECKS 0 -#endif - -#define NANOBENCHMARK_CHECK_ALWAYS(condition) \ - while (!(condition)) { \ - printf("Nanobenchmark check failed at line %d\n", __LINE__); \ - abort(); \ - } - -#if NANOBENCHMARK_ENABLE_CHECKS -#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition) -#else -#define NANOBENCHMARK_CHECK(condition) -#endif - -#if HH_MSC_VERSION - -// MSVC does not support inline assembly anymore (and never supported GCC's -// RTL constraints used below). -#pragma optimize("", off) -// Self-assignment with #pragma optimize("off") might be expected to prevent -// elision, but it does not with MSVC 2015. -void UseCharPointer(volatile const char*) {} -#pragma optimize("", on) - -template <class T> -inline void PreventElision(T&& output) { - UseCharPointer(reinterpret_cast<volatile const char*>(&output)); -} - -#else - -// Prevents the compiler from eliding the computations that led to "output". -// Works by indicating to the compiler that "output" is being read and modified. -// The +r constraint avoids unnecessary writes to memory, but only works for -// FuncOutput. -template <class T> -inline void PreventElision(T&& output) { - asm volatile("" : "+r"(output) : : "memory"); -} - -#endif - -HH_NOINLINE FuncOutput Func1(const FuncInput input) { return input + 1; } -HH_NOINLINE FuncOutput Func2(const FuncInput input) { return input + 2; } - -// Cycles elapsed = difference between two cycle counts. Must be unsigned to -// ensure wraparound on overflow. -using Duration = uint32_t; - -// Even with high-priority pinned threads and frequency throttling disabled, -// elapsed times are noisy due to interrupts or SMM operations. It might help -// to detect such events via transactions and omit affected measurements. -// Unfortunately, TSX is currently unavailable due to a bug. We achieve -// repeatable results with a robust measure of the central tendency ("mode"). - -// Returns time elapsed between timer Start/Stop. -Duration EstimateResolutionOnCurrentCPU(const Func func) { - // Even 128K samples are not enough to achieve repeatable results when - // throttling is enabled; the caller must perform additional aggregation. 
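 - // Mode() below is the robust central-tendency estimator (the "half
 - // sample mode" mentioned in nanobenchmark.h). Hypothetical sorted
 - // samples {28, 28, 29, 30, 42, 60}: it narrows to the densest half
 - // {28, 28, 29}, then {28, 28}, and returns 28, whereas the mean (about
 - // 36) would be dragged upward by the two interrupt-inflated outliers.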
- const size_t kNumSamples = 512; - Duration samples[kNumSamples]; - for (size_t i = 0; i < kNumSamples; ++i) { - const volatile Duration t0 = Start<Duration>(); - PreventElision(func(i)); - const volatile Duration t1 = Stop<Duration>(); - NANOBENCHMARK_CHECK(t0 <= t1); - samples[i] = t1 - t0; - } - CountingSort(samples, samples + kNumSamples); - const Duration resolution = Mode(samples, kNumSamples); - NANOBENCHMARK_CHECK(resolution != 0); - return resolution; -} - -// Returns mode of EstimateResolutionOnCurrentCPU across all CPUs. This -// increases repeatability because some CPUs may be throttled or slowed down by -// interrupts. -Duration EstimateResolution(const Func func_to_measure) { - Func func = (func_to_measure == &Func2) ? &Func1 : &Func2; - - const size_t kNumSamples = 512; - std::vector<Duration> resolutions; - resolutions.reserve(kNumSamples); - - const auto cpus = AvailableCPUs(); - const size_t repetitions_per_cpu = kNumSamples / cpus.size(); - - auto affinity = GetThreadAffinity(); - for (const int cpu : cpus) { - PinThreadToCPU(cpu); - for (size_t i = 0; i < repetitions_per_cpu; ++i) { - resolutions.push_back(EstimateResolutionOnCurrentCPU(func)); - } - } - SetThreadAffinity(affinity); - free(affinity); - - Duration* const begin = resolutions.data(); - CountingSort(begin, begin + resolutions.size()); - const Duration resolution = Mode(begin, resolutions.size()); - printf("Resolution %lu\n", long(resolution)); - return resolution; -} - -// Returns cycles elapsed when running an empty region, i.e. the timer -// resolution/overhead, which will be deducted from other measurements and -// also used by InitReplicas. -Duration Resolution(const Func func) { - // Initialization is expensive and should only happen once. - static const Duration resolution = EstimateResolution(func); - return resolution; -} - -// Returns cycles elapsed when passing each of "inputs" (after in-place -// shuffling) to "func", which must return something it has computed -// so the compiler does not optimize it away. -Duration CyclesElapsed(const Duration resolution, const Func func, - std::vector<FuncInput>* inputs) { - // This benchmark attempts to measure the performance of "func" when - // called with realistic inputs, which we assume are randomly drawn - // from the given "inputs" distribution, so we shuffle those values. - std::random_shuffle(inputs->begin(), inputs->end()); - - const Duration t0 = Start<Duration>(); - for (const FuncInput input : *inputs) { - PreventElision(func(input)); - } - const Duration t1 = Stop<Duration>(); - const Duration elapsed = t1 - t0; - NANOBENCHMARK_CHECK(elapsed > resolution); - return elapsed - resolution; -} - -// Stores input values for a series of calls to the function to measure. -// We assume inputs are drawn from a known discrete probability distribution, -// modeled as a vector<FuncInput> v. The probability of a value X -// in v is count(v.begin(), v.end(), X) / v.size(). -class Inputs { - Inputs(const Inputs&) = delete; - Inputs& operator=(const Inputs&) = delete; - - public: - Inputs(const Duration resolution, const std::vector<FuncInput>& distribution, - const Func func) - : unique_(InitUnique(distribution)), - replicas_(InitReplicas(distribution, resolution, func)), - num_replicas_(replicas_.size() / distribution.size()) { - printf("NumReplicas %zu\n", num_replicas_); - } - - // Returns vector of the unique values from the input distribution. 
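 - // E.g. for the hypothetical distribution {3, 3, 4, 4, 7, 7, 8, 8}:
 - // Unique() == {3, 4, 7, 8}, and each value is drawn with probability
 - // 2 / 8 == 1/4 under the count-based model described above.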
- const std::vector<FuncInput>& Unique() const { return unique_; } - - // Returns how many instances of "distribution" are in "replicas_", i.e. - // the number of occurrences of an input value that occurred only once - // in the distribution. This is the divisor for computing the duration - // of a single call. - size_t NumReplicas() const { return num_replicas_; } - - // Returns the (replicated) input distribution. Modified by caller - // (shuffled in-place) => not thread-safe. - std::vector<FuncInput>& Replicas() { return replicas_; } - - // Returns a copy of Replicas() with NumReplicas() occurrences of "input" - // removed. Used for the leave-one-out measurement. - std::vector<FuncInput> Without(const FuncInput input_to_remove) const { - // "input_to_remove" should be in the original distribution. - NANOBENCHMARK_CHECK(std::find(unique_.begin(), unique_.end(), - input_to_remove) != unique_.end()); - - std::vector<FuncInput> copy = replicas_; - auto pos = std::partition(copy.begin(), copy.end(), - [input_to_remove](const FuncInput input) { - return input_to_remove != input; - }); - // Must occur at least num_replicas_ times. - NANOBENCHMARK_CHECK(copy.end() - pos >= num_replicas_); - // (Avoids unused-variable warning.) - PreventElision(&*pos); - copy.resize(copy.size() - num_replicas_); - return copy; - } - - private: - // Returns a copy with any duplicate values removed. Initializing unique_ - // through this function allows it to be const. - static std::vector<FuncInput> InitUnique( - const std::vector<FuncInput>& distribution) { - std::vector<FuncInput> unique = distribution; - std::sort(unique.begin(), unique.end()); - unique.erase(std::unique(unique.begin(), unique.end()), unique.end()); - // Our leave-one-out measurement technique only makes sense when - // there are multiple input values. - NANOBENCHMARK_CHECK(unique.size() >= 2); - return unique; - } - - // Returns how many replicas of "distribution" are required before - // CyclesElapsed is large enough compared to the timer resolution. - static std::vector<FuncInput> InitReplicas( - const std::vector<FuncInput>& distribution, const Duration resolution, - const Func func) { - // We compute the difference in duration for inputs = Replicas() vs. - // Without(). Dividing this by num_replicas must yield a value where the - // quantization error (from the timer resolution) is sufficiently small. - const uint64_t min_elapsed = distribution.size() * resolution * 400; - - std::vector<FuncInput> replicas; - for (;;) { - AppendReplica(distribution, &replicas); - -#if NANOBENCHMARK_ENABLE_CHECKS - const uint64_t t0 = Start64(); -#endif - const Duration elapsed = CyclesElapsed(resolution, func, &replicas); -#if NANOBENCHMARK_ENABLE_CHECKS - const uint64_t t1 = Stop64(); -#endif - // Ensure the 32-bit timer didn't and won't overflow. - NANOBENCHMARK_CHECK((t1 - t0) < (1ULL << 30)); - - if (elapsed >= min_elapsed) { - return replicas; - } - } - } - - // Appends all values in "distribution" to "replicas". - static void AppendReplica(const std::vector<FuncInput>& distribution, - std::vector<FuncInput>* replicas) { - replicas->reserve(replicas->size() + distribution.size()); - for (const FuncInput input : distribution) { - replicas->push_back(input); - } - } - - const std::vector<FuncInput> unique_; - - // Modified by caller (shuffled in-place) => non-const. - std::vector<FuncInput> replicas_; - - // Initialized from replicas_. 
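 - // E.g. if InitReplicas appended the 8-element distribution 100 times,
 - // replicas_.size() == 800 and num_replicas_ == 800 / 8 == 100.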
- const size_t num_replicas_; -}; - -// Holds samples of measured durations, and (robustly) reduces them to a -// single result for each unique input value. -class DurationSamples { - public: - DurationSamples(const std::vector<FuncInput>& unique_inputs, - const size_t num_samples) - : num_samples_(num_samples) { - // Preallocate storage. - for (const FuncInput input : unique_inputs) { - samples_for_input_[input].reserve(num_samples); - } - } - - void Add(const FuncInput input, const Duration sample) { - // "input" should be one of the values passed to the ctor. - NANOBENCHMARK_CHECK(samples_for_input_.find(input) != - samples_for_input_.end()); - - samples_for_input_[input].push_back(sample); - } - - // Invokes "lambda" for each (input, duration) pair. The per-call duration - // is the central tendency (the mode) of the samples. - template <class Lambda> - void Reduce(const Lambda& lambda) { - for (auto& input_and_samples : samples_for_input_) { - const FuncInput input = input_and_samples.first; - std::vector<Duration>& samples = input_and_samples.second; - - NANOBENCHMARK_CHECK(samples.size() <= num_samples_); - std::sort(samples.begin(), samples.end()); - const Duration duration = Mode(samples.data(), samples.size()); - lambda(input, duration); - } - } - - private: - const size_t num_samples_; - std::map<FuncInput, std::vector<Duration>> samples_for_input_; -}; - -// Gathers "num_samples" durations via repeated leave-one-out measurements. -DurationSamples GatherDurationSamples(const Duration resolution, Inputs& inputs, - const Func func, - const size_t num_samples) { - DurationSamples samples(inputs.Unique(), num_samples); - for (size_t i = 0; i < num_samples; ++i) { - // Total duration for all shuffled input values. This may change over time, - // so recompute it for each sample. - const Duration total = CyclesElapsed(resolution, func, &inputs.Replicas()); - - for (const FuncInput input : inputs.Unique()) { - // To isolate the durations of the calls with this input value, - // we measure the duration without those values and subtract that - // from the total, and later divide by NumReplicas. 
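 - // Hypothetical numbers: total == 10000 cycles, elapsed without this
 - // input == 9000 cycles, NumReplicas() == 100. The 100 removed calls
 - // then account for 1000 cycles, i.e. 10 cycles per call after
 - // MeasureDurations applies the 1.0 / NumReplicas() factor.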
- std::vector<FuncInput> without = inputs.Without(input); - for (int rep = 0; rep < 3; ++rep) { - const Duration elapsed = CyclesElapsed(resolution, func, &without); - if (elapsed < total) { - samples.Add(input, total - elapsed); - break; - } - } - } - } - return samples; -} - -} // namespace - -DurationsForInputs::DurationsForInputs(const FuncInput* inputs, - const size_t num_inputs, - const size_t max_durations) - : num_items(0), - inputs_(inputs), - num_inputs_(num_inputs), - max_durations_(max_durations), - all_durations_(new float[num_inputs * max_durations]) { - NANOBENCHMARK_CHECK(num_inputs != 0); - NANOBENCHMARK_CHECK(max_durations != 0); - - items = new Item[num_inputs]; - for (size_t i = 0; i < num_inputs_; ++i) { - items[i].input = 0; // initialized later - items[i].num_durations = 0; - items[i].durations = all_durations_ + i * max_durations; - } -} - -DurationsForInputs::~DurationsForInputs() { - delete[] all_durations_; - delete[] items; -} - -void DurationsForInputs::AddItem(const FuncInput input, const float sample) { - for (size_t i = 0; i < num_items; ++i) { - NANOBENCHMARK_CHECK(items[i].input != input); - } - Item& item = items[num_items]; - item.input = input; - item.num_durations = 1; - item.durations[0] = sample; - ++num_items; -} - -void DurationsForInputs::AddSample(const FuncInput input, const float sample) { - for (size_t i = 0; i < num_items; ++i) { - Item& item = items[i]; - if (item.input == input) { - item.durations[item.num_durations] = sample; - ++item.num_durations; - return; - } - } - NANOBENCHMARK_CHECK(!"Item not found"); -} - -void DurationsForInputs::Item::PrintMedianAndVariability() { - // Copy so that Median can modify. - std::vector<float> duration_vec(durations, durations + num_durations); - const float median = Median(&duration_vec); - const float variability = MedianAbsoluteDeviation(duration_vec, median); - printf("%5zu: median=%5.1f cycles; median abs. deviation=%4.1f cycles\n", - input, median, variability); -} - -void MeasureDurations(const Func func, DurationsForInputs* input_map) { - const Duration resolution = Resolution(func); - - // Adds enough 'replicas' of the distribution to measure "func" given - // the timer resolution. - const std::vector<FuncInput> distribution( - input_map->inputs_, input_map->inputs_ + input_map->num_inputs_); - Inputs inputs(resolution, distribution, func); - const double per_call = 1.0 / static_cast<int>(inputs.NumReplicas()); - - // First iteration: populate input_map items. - auto samples = GatherDurationSamples(resolution, inputs, func, 512); - samples.Reduce( - [per_call, input_map](const FuncInput input, const Duration duration) { - const float sample = static_cast<float>(duration * per_call); - input_map->AddItem(input, sample); - }); - - // Subsequent iteration(s): append to input_map items' array. - for (size_t rep = 1; rep < input_map->max_durations_; ++rep) { - auto samples = GatherDurationSamples(resolution, inputs, func, 512); - samples.Reduce( - [per_call, input_map](const FuncInput input, const Duration duration) { - const float sample = static_cast<float>(duration * per_call); - input_map->AddSample(input, sample); - }); - } -} - -} // namespace highwayhash +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "highwayhash/nanobenchmark.h" + +#include <algorithm> +#include <cmath> +#include <cstdio> +#include <map> +#include <random> +#include <vector> + +#include <stddef.h> + +#include "highwayhash/os_specific.h" +#include "highwayhash/robust_statistics.h" +#include "highwayhash/tsc_timer.h" + +namespace highwayhash { +namespace { + +// Enables sanity checks that verify correct operation at the cost of +// longer benchmark runs. +#ifndef NANOBENCHMARK_ENABLE_CHECKS +#define NANOBENCHMARK_ENABLE_CHECKS 0 +#endif + +#define NANOBENCHMARK_CHECK_ALWAYS(condition) \ + while (!(condition)) { \ + printf("Nanobenchmark check failed at line %d\n", __LINE__); \ + abort(); \ + } + +#if NANOBENCHMARK_ENABLE_CHECKS +#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition) +#else +#define NANOBENCHMARK_CHECK(condition) +#endif + +#if HH_MSC_VERSION + +// MSVC does not support inline assembly anymore (and never supported GCC's +// RTL constraints used below). +#pragma optimize("", off) +// Self-assignment with #pragma optimize("off") might be expected to prevent +// elision, but it does not with MSVC 2015. +void UseCharPointer(volatile const char*) {} +#pragma optimize("", on) + +template <class T> +inline void PreventElision(T&& output) { + UseCharPointer(reinterpret_cast<volatile const char*>(&output)); +} + +#else + +// Prevents the compiler from eliding the computations that led to "output". +// Works by indicating to the compiler that "output" is being read and modified. +// The +r constraint avoids unnecessary writes to memory, but only works for +// FuncOutput. +template <class T> +inline void PreventElision(T&& output) { + asm volatile("" : "+r"(output) : : "memory"); +} + +#endif + +HH_NOINLINE FuncOutput Func1(const FuncInput input) { return input + 1; } +HH_NOINLINE FuncOutput Func2(const FuncInput input) { return input + 2; } + +// Cycles elapsed = difference between two cycle counts. Must be unsigned to +// ensure wraparound on overflow. +using Duration = uint32_t; + +// Even with high-priority pinned threads and frequency throttling disabled, +// elapsed times are noisy due to interrupts or SMM operations. It might help +// to detect such events via transactions and omit affected measurements. +// Unfortunately, TSX is currently unavailable due to a bug. We achieve +// repeatable results with a robust measure of the central tendency ("mode"). + +// Returns time elapsed between timer Start/Stop. +Duration EstimateResolutionOnCurrentCPU(const Func func) { + // Even 128K samples are not enough to achieve repeatable results when + // throttling is enabled; the caller must perform additional aggregation. 
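 + // The "volatile" qualifiers on t0/t1 below are presumably there to keep
 + // the compiler from eliding or reordering the timer reads around the
 + // measured call.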
+ const size_t kNumSamples = 512; + Duration samples[kNumSamples]; + for (size_t i = 0; i < kNumSamples; ++i) { + const volatile Duration t0 = Start<Duration>(); + PreventElision(func(i)); + const volatile Duration t1 = Stop<Duration>(); + NANOBENCHMARK_CHECK(t0 <= t1); + samples[i] = t1 - t0; + } + CountingSort(samples, samples + kNumSamples); + const Duration resolution = Mode(samples, kNumSamples); + NANOBENCHMARK_CHECK(resolution != 0); + return resolution; +} + +// Returns mode of EstimateResolutionOnCurrentCPU across all CPUs. This +// increases repeatability because some CPUs may be throttled or slowed down by +// interrupts. +Duration EstimateResolution(const Func func_to_measure) { + Func func = (func_to_measure == &Func2) ? &Func1 : &Func2; + + const size_t kNumSamples = 512; + std::vector<Duration> resolutions; + resolutions.reserve(kNumSamples); + + const auto cpus = AvailableCPUs(); + const size_t repetitions_per_cpu = kNumSamples / cpus.size(); + + auto affinity = GetThreadAffinity(); + for (const int cpu : cpus) { + PinThreadToCPU(cpu); + for (size_t i = 0; i < repetitions_per_cpu; ++i) { + resolutions.push_back(EstimateResolutionOnCurrentCPU(func)); + } + } + SetThreadAffinity(affinity); + free(affinity); + + Duration* const begin = resolutions.data(); + CountingSort(begin, begin + resolutions.size()); + const Duration resolution = Mode(begin, resolutions.size()); + printf("Resolution %lu\n", long(resolution)); + return resolution; +} + +// Returns cycles elapsed when running an empty region, i.e. the timer +// resolution/overhead, which will be deducted from other measurements and +// also used by InitReplicas. +Duration Resolution(const Func func) { + // Initialization is expensive and should only happen once. + static const Duration resolution = EstimateResolution(func); + return resolution; +} + +// Returns cycles elapsed when passing each of "inputs" (after in-place +// shuffling) to "func", which must return something it has computed +// so the compiler does not optimize it away. +Duration CyclesElapsed(const Duration resolution, const Func func, + std::vector<FuncInput>* inputs) { + // This benchmark attempts to measure the performance of "func" when + // called with realistic inputs, which we assume are randomly drawn + // from the given "inputs" distribution, so we shuffle those values. + std::random_shuffle(inputs->begin(), inputs->end()); + + const Duration t0 = Start<Duration>(); + for (const FuncInput input : *inputs) { + PreventElision(func(input)); + } + const Duration t1 = Stop<Duration>(); + const Duration elapsed = t1 - t0; + NANOBENCHMARK_CHECK(elapsed > resolution); + return elapsed - resolution; +} + +// Stores input values for a series of calls to the function to measure. +// We assume inputs are drawn from a known discrete probability distribution, +// modeled as a vector<FuncInput> v. The probability of a value X +// in v is count(v.begin(), v.end(), X) / v.size(). +class Inputs { + Inputs(const Inputs&) = delete; + Inputs& operator=(const Inputs&) = delete; + + public: + Inputs(const Duration resolution, const std::vector<FuncInput>& distribution, + const Func func) + : unique_(InitUnique(distribution)), + replicas_(InitReplicas(distribution, resolution, func)), + num_replicas_(replicas_.size() / distribution.size()) { + printf("NumReplicas %zu\n", num_replicas_); + } + + // Returns vector of the unique values from the input distribution. 
+ const std::vector<FuncInput>& Unique() const { return unique_; } + + // Returns how many instances of "distribution" are in "replicas_", i.e. + // the number of occurrences of an input value that occurred only once + // in the distribution. This is the divisor for computing the duration + // of a single call. + size_t NumReplicas() const { return num_replicas_; } + + // Returns the (replicated) input distribution. Modified by caller + // (shuffled in-place) => not thread-safe. + std::vector<FuncInput>& Replicas() { return replicas_; } + + // Returns a copy of Replicas() with NumReplicas() occurrences of "input" + // removed. Used for the leave-one-out measurement. + std::vector<FuncInput> Without(const FuncInput input_to_remove) const { + // "input_to_remove" should be in the original distribution. + NANOBENCHMARK_CHECK(std::find(unique_.begin(), unique_.end(), + input_to_remove) != unique_.end()); + + std::vector<FuncInput> copy = replicas_; + auto pos = std::partition(copy.begin(), copy.end(), + [input_to_remove](const FuncInput input) { + return input_to_remove != input; + }); + // Must occur at least num_replicas_ times. + NANOBENCHMARK_CHECK(copy.end() - pos >= num_replicas_); + // (Avoids unused-variable warning.) + PreventElision(&*pos); + copy.resize(copy.size() - num_replicas_); + return copy; + } + + private: + // Returns a copy with any duplicate values removed. Initializing unique_ + // through this function allows it to be const. + static std::vector<FuncInput> InitUnique( + const std::vector<FuncInput>& distribution) { + std::vector<FuncInput> unique = distribution; + std::sort(unique.begin(), unique.end()); + unique.erase(std::unique(unique.begin(), unique.end()), unique.end()); + // Our leave-one-out measurement technique only makes sense when + // there are multiple input values. + NANOBENCHMARK_CHECK(unique.size() >= 2); + return unique; + } + + // Returns how many replicas of "distribution" are required before + // CyclesElapsed is large enough compared to the timer resolution. + static std::vector<FuncInput> InitReplicas( + const std::vector<FuncInput>& distribution, const Duration resolution, + const Func func) { + // We compute the difference in duration for inputs = Replicas() vs. + // Without(). Dividing this by num_replicas must yield a value where the + // quantization error (from the timer resolution) is sufficiently small. + const uint64_t min_elapsed = distribution.size() * resolution * 400; + + std::vector<FuncInput> replicas; + for (;;) { + AppendReplica(distribution, &replicas); + +#if NANOBENCHMARK_ENABLE_CHECKS + const uint64_t t0 = Start64(); +#endif + const Duration elapsed = CyclesElapsed(resolution, func, &replicas); +#if NANOBENCHMARK_ENABLE_CHECKS + const uint64_t t1 = Stop64(); +#endif + // Ensure the 32-bit timer didn't and won't overflow. + NANOBENCHMARK_CHECK((t1 - t0) < (1ULL << 30)); + + if (elapsed >= min_elapsed) { + return replicas; + } + } + } + + // Appends all values in "distribution" to "replicas". + static void AppendReplica(const std::vector<FuncInput>& distribution, + std::vector<FuncInput>* replicas) { + replicas->reserve(replicas->size() + distribution.size()); + for (const FuncInput input : distribution) { + replicas->push_back(input); + } + } + + const std::vector<FuncInput> unique_; + + // Modified by caller (shuffled in-place) => non-const. + std::vector<FuncInput> replicas_; + + // Initialized from replicas_. 
+ const size_t num_replicas_; +}; + +// Holds samples of measured durations, and (robustly) reduces them to a +// single result for each unique input value. +class DurationSamples { + public: + DurationSamples(const std::vector<FuncInput>& unique_inputs, + const size_t num_samples) + : num_samples_(num_samples) { + // Preallocate storage. + for (const FuncInput input : unique_inputs) { + samples_for_input_[input].reserve(num_samples); + } + } + + void Add(const FuncInput input, const Duration sample) { + // "input" should be one of the values passed to the ctor. + NANOBENCHMARK_CHECK(samples_for_input_.find(input) != + samples_for_input_.end()); + + samples_for_input_[input].push_back(sample); + } + + // Invokes "lambda" for each (input, duration) pair. The per-call duration + // is the central tendency (the mode) of the samples. + template <class Lambda> + void Reduce(const Lambda& lambda) { + for (auto& input_and_samples : samples_for_input_) { + const FuncInput input = input_and_samples.first; + std::vector<Duration>& samples = input_and_samples.second; + + NANOBENCHMARK_CHECK(samples.size() <= num_samples_); + std::sort(samples.begin(), samples.end()); + const Duration duration = Mode(samples.data(), samples.size()); + lambda(input, duration); + } + } + + private: + const size_t num_samples_; + std::map<FuncInput, std::vector<Duration>> samples_for_input_; +}; + +// Gathers "num_samples" durations via repeated leave-one-out measurements. +DurationSamples GatherDurationSamples(const Duration resolution, Inputs& inputs, + const Func func, + const size_t num_samples) { + DurationSamples samples(inputs.Unique(), num_samples); + for (size_t i = 0; i < num_samples; ++i) { + // Total duration for all shuffled input values. This may change over time, + // so recompute it for each sample. + const Duration total = CyclesElapsed(resolution, func, &inputs.Replicas()); + + for (const FuncInput input : inputs.Unique()) { + // To isolate the durations of the calls with this input value, + // we measure the duration without those values and subtract that + // from the total, and later divide by NumReplicas. 
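 + // The three-iteration retry loop below guards against noise: if the
 + // leave-one-out measurement comes out >= total (implying a non-positive
 + // duration), it re-measures; after three failures the sample is
 + // silently dropped.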
+ std::vector<FuncInput> without = inputs.Without(input); + for (int rep = 0; rep < 3; ++rep) { + const Duration elapsed = CyclesElapsed(resolution, func, &without); + if (elapsed < total) { + samples.Add(input, total - elapsed); + break; + } + } + } + } + return samples; +} + +} // namespace + +DurationsForInputs::DurationsForInputs(const FuncInput* inputs, + const size_t num_inputs, + const size_t max_durations) + : num_items(0), + inputs_(inputs), + num_inputs_(num_inputs), + max_durations_(max_durations), + all_durations_(new float[num_inputs * max_durations]) { + NANOBENCHMARK_CHECK(num_inputs != 0); + NANOBENCHMARK_CHECK(max_durations != 0); + + items = new Item[num_inputs]; + for (size_t i = 0; i < num_inputs_; ++i) { + items[i].input = 0; // initialized later + items[i].num_durations = 0; + items[i].durations = all_durations_ + i * max_durations; + } +} + +DurationsForInputs::~DurationsForInputs() { + delete[] all_durations_; + delete[] items; +} + +void DurationsForInputs::AddItem(const FuncInput input, const float sample) { + for (size_t i = 0; i < num_items; ++i) { + NANOBENCHMARK_CHECK(items[i].input != input); + } + Item& item = items[num_items]; + item.input = input; + item.num_durations = 1; + item.durations[0] = sample; + ++num_items; +} + +void DurationsForInputs::AddSample(const FuncInput input, const float sample) { + for (size_t i = 0; i < num_items; ++i) { + Item& item = items[i]; + if (item.input == input) { + item.durations[item.num_durations] = sample; + ++item.num_durations; + return; + } + } + NANOBENCHMARK_CHECK(!"Item not found"); +} + +void DurationsForInputs::Item::PrintMedianAndVariability() { + // Copy so that Median can modify. + std::vector<float> duration_vec(durations, durations + num_durations); + const float median = Median(&duration_vec); + const float variability = MedianAbsoluteDeviation(duration_vec, median); + printf("%5zu: median=%5.1f cycles; median abs. deviation=%4.1f cycles\n", + input, median, variability); +} + +void MeasureDurations(const Func func, DurationsForInputs* input_map) { + const Duration resolution = Resolution(func); + + // Adds enough 'replicas' of the distribution to measure "func" given + // the timer resolution. + const std::vector<FuncInput> distribution( + input_map->inputs_, input_map->inputs_ + input_map->num_inputs_); + Inputs inputs(resolution, distribution, func); + const double per_call = 1.0 / static_cast<int>(inputs.NumReplicas()); + + // First iteration: populate input_map items. + auto samples = GatherDurationSamples(resolution, inputs, func, 512); + samples.Reduce( + [per_call, input_map](const FuncInput input, const Duration duration) { + const float sample = static_cast<float>(duration * per_call); + input_map->AddItem(input, sample); + }); + + // Subsequent iteration(s): append to input_map items' array. + for (size_t rep = 1; rep < input_map->max_durations_; ++rep) { + auto samples = GatherDurationSamples(resolution, inputs, func, 512); + samples.Reduce( + [per_call, input_map](const FuncInput input, const Duration duration) { + const float sample = static_cast<float>(duration * per_call); + input_map->AddSample(input, sample); + }); + } +} + +} // namespace highwayhash diff --git a/contrib/libs/highwayhash/highwayhash/nanobenchmark.h b/contrib/libs/highwayhash/highwayhash/nanobenchmark.h index 1cf6426e0f..ba4ca5a9bb 100644 --- a/contrib/libs/highwayhash/highwayhash/nanobenchmark.h +++ b/contrib/libs/highwayhash/highwayhash/nanobenchmark.h @@ -1,158 +1,158 @@ -// Copyright 2017 Google Inc. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_NANOBENCHMARK_H_ -#define HIGHWAYHASH_NANOBENCHMARK_H_ - -// Benchmarks functions of a single integer argument with realistic branch -// prediction hit rates. Uses a robust estimator to summarize the measurements. -// Measurements are precise to about 0.2 cycles. -// -// Example: -// #include "highwayhash/nanobenchmark.h" -// using namespace highwayhash; -// -// uint64_t RegionToMeasure(size_t size) { -// char from[8] = {static_cast<char>(size)}; -// char to[8]; -// memcpy(to, from, size); -// return to[0]; -// } -// -// PinThreadToRandomCPU(); -// -// static const size_t distribution[] = {3, 3, 4, 4, 7, 7, 8, 8}; -// DurationsForInputs input_map = MakeDurationsForInputs(distribution, 10); -// MeasureDurations(&RegionToMeasure, &input_map); -// for (size_t i = 0; i < input_map.num_items; ++i) { -// input_map.items[i].PrintMedianAndVariability(); -// } -// -// Output: -// 3: median= 25.2 cycles; median abs. deviation= 0.1 cycles -// 4: median= 13.5 cycles; median abs. deviation= 0.1 cycles -// 7: median= 13.5 cycles; median abs. deviation= 0.1 cycles -// 8: median= 27.5 cycles; median abs. deviation= 0.2 cycles -// (7 is presumably faster because it can use two unaligned 32-bit load/stores.) -// -// Background: Microbenchmarks such as http://github.com/google/benchmark -// can measure elapsed times on the order of a microsecond. Shorter functions -// are typically measured by repeating them thousands of times and dividing -// the total elapsed time by this count. Unfortunately, repetition (especially -// with the same input parameter!) influences the runtime. In time-critical -// code, it is reasonable to expect warm instruction/data caches and TLBs, -// but a perfect record of which branches will be taken is unrealistic. -// Unless the application also repeatedly invokes the measured function with -// the same parameter, the benchmark is measuring something very different - -// a best-case result, almost as if the parameter were made a compile-time -// constant. This may lead to erroneous conclusions about branch-heavy -// algorithms outperforming branch-free alternatives. -// -// Our approach differs in three ways. Adding fences to the timer functions -// reduces variability due to instruction reordering, improving the timer -// resolution to about 10 nanoseconds. However, shorter functions must still -// be invoked repeatedly. For more realistic branch prediction performance, -// we vary the input parameter according to a user-specified distribution. -// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the -// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the -// central tendency of the measurement samples with the "half sample mode", -// which is more robust to outliers and skewed data than the mean or median. - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. 
This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include <stddef.h> -#include <stdint.h> -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" - -namespace highwayhash { - -// Argument to the function being measured (e.g. number of bytes to copy). -using FuncInput = size_t; - -// "Proof of work" returned by the function to ensure it is not elided. -using FuncOutput = uint64_t; - -// Function to measure (cannot use std::function in a restricted header). -using Func = FuncOutput (*)(FuncInput); - -// Flat map of input -> durations[]. -class DurationsForInputs { - public: - struct Item { - void PrintMedianAndVariability(); - - FuncInput input; // read-only (set by AddItem). - size_t num_durations; // written so far: [0, max_durations). - float* durations; // max_durations entries; points into all_durations. - }; - - // "inputs" is an array of "num_inputs" (not necessarily unique) arguments to - // "func". The values are chosen to maximize coverage of "func". The pointer - // must remain valid until after MeasureDurations. This represents a - // distribution, so a value's frequency should reflect its probability in the - // real application. Order does not matter; for example, a uniform - // distribution over [0, 4) could be represented as {3,0,2,1}. Repeating each - // value at least once ensures the leave-one-out distribution is closer to the - // original distribution, leading to more realistic results. - // - // "max_durations" is the number of duration samples to measure for each - // unique input value. Larger values decrease variability. - // - // Runtime is proportional to "num_inputs" * #unique * "max_durations". - DurationsForInputs(const FuncInput* inputs, const size_t num_inputs, - const size_t max_durations); - ~DurationsForInputs(); - - // Adds an item with the given "input" and "sample". Must only be called once - // per unique "input" value. - void AddItem(const FuncInput input, const float sample); - - // Adds "sample" to an already existing Item with the given "input". - void AddSample(const FuncInput input, const float sample); - - // Allow direct inspection of items[0..num_items-1] because accessor or - // ForeachItem functions are unsafe in a restricted header. - Item* items; // owned by this class, do not allocate/free. - size_t num_items; // safe to reset to zero. - - private: - friend void MeasureDurations(Func, DurationsForInputs*); - - const FuncInput* const inputs_; - const size_t num_inputs_; - const size_t max_durations_; - float* const all_durations_; -}; - -// Helper function to detect num_inputs from arrays. -template <size_t N> -static HH_INLINE DurationsForInputs MakeDurationsForInputs( - const FuncInput (&inputs)[N], const size_t max_durations) { - return DurationsForInputs(&inputs[0], N, max_durations); -} - -// Returns precise measurements of the cycles elapsed when calling "func" with -// each unique input value in "input_map", taking special care to maintain -// realistic branch prediction hit rates. -// -// "func" returns a 'proof of work' to ensure its computations are not elided. -void MeasureDurations(const Func func, DurationsForInputs* input_map); - -} // namespace highwayhash - -#endif // HIGHWAYHASH_NANOBENCHMARK_H_ +// Copyright 2017 Google Inc. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_NANOBENCHMARK_H_ +#define HIGHWAYHASH_NANOBENCHMARK_H_ + +// Benchmarks functions of a single integer argument with realistic branch +// prediction hit rates. Uses a robust estimator to summarize the measurements. +// Measurements are precise to about 0.2 cycles. +// +// Example: +// #include "highwayhash/nanobenchmark.h" +// using namespace highwayhash; +// +// uint64_t RegionToMeasure(size_t size) { +// char from[8] = {static_cast<char>(size)}; +// char to[8]; +// memcpy(to, from, size); +// return to[0]; +// } +// +// PinThreadToRandomCPU(); +// +// static const size_t distribution[] = {3, 3, 4, 4, 7, 7, 8, 8}; +// DurationsForInputs input_map = MakeDurationsForInputs(distribution, 10); +// MeasureDurations(&RegionToMeasure, &input_map); +// for (size_t i = 0; i < input_map.num_items; ++i) { +// input_map.items[i].PrintMedianAndVariability(); +// } +// +// Output: +// 3: median= 25.2 cycles; median abs. deviation= 0.1 cycles +// 4: median= 13.5 cycles; median abs. deviation= 0.1 cycles +// 7: median= 13.5 cycles; median abs. deviation= 0.1 cycles +// 8: median= 27.5 cycles; median abs. deviation= 0.2 cycles +// (7 is presumably faster because it can use two unaligned 32-bit load/stores.) +// +// Background: Microbenchmarks such as http://github.com/google/benchmark +// can measure elapsed times on the order of a microsecond. Shorter functions +// are typically measured by repeating them thousands of times and dividing +// the total elapsed time by this count. Unfortunately, repetition (especially +// with the same input parameter!) influences the runtime. In time-critical +// code, it is reasonable to expect warm instruction/data caches and TLBs, +// but a perfect record of which branches will be taken is unrealistic. +// Unless the application also repeatedly invokes the measured function with +// the same parameter, the benchmark is measuring something very different - +// a best-case result, almost as if the parameter were made a compile-time +// constant. This may lead to erroneous conclusions about branch-heavy +// algorithms outperforming branch-free alternatives. +// +// Our approach differs in three ways. Adding fences to the timer functions +// reduces variability due to instruction reordering, improving the timer +// resolution to about 10 nanoseconds. However, shorter functions must still +// be invoked repeatedly. For more realistic branch prediction performance, +// we vary the input parameter according to a user-specified distribution. +// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the +// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the +// central tendency of the measurement samples with the "half sample mode", +// which is more robust to outliers and skewed data than the mean or median. + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. 
This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include <stddef.h> +#include <stdint.h> +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" + +namespace highwayhash { + +// Argument to the function being measured (e.g. number of bytes to copy). +using FuncInput = size_t; + +// "Proof of work" returned by the function to ensure it is not elided. +using FuncOutput = uint64_t; + +// Function to measure (cannot use std::function in a restricted header). +using Func = FuncOutput (*)(FuncInput); + +// Flat map of input -> durations[]. +class DurationsForInputs { + public: + struct Item { + void PrintMedianAndVariability(); + + FuncInput input; // read-only (set by AddItem). + size_t num_durations; // written so far: [0, max_durations). + float* durations; // max_durations entries; points into all_durations. + }; + + // "inputs" is an array of "num_inputs" (not necessarily unique) arguments to + // "func". The values are chosen to maximize coverage of "func". The pointer + // must remain valid until after MeasureDurations. This represents a + // distribution, so a value's frequency should reflect its probability in the + // real application. Order does not matter; for example, a uniform + // distribution over [0, 4) could be represented as {3,0,2,1}. Repeating each + // value at least once ensures the leave-one-out distribution is closer to the + // original distribution, leading to more realistic results. + // + // "max_durations" is the number of duration samples to measure for each + // unique input value. Larger values decrease variability. + // + // Runtime is proportional to "num_inputs" * #unique * "max_durations". + DurationsForInputs(const FuncInput* inputs, const size_t num_inputs, + const size_t max_durations); + ~DurationsForInputs(); + + // Adds an item with the given "input" and "sample". Must only be called once + // per unique "input" value. + void AddItem(const FuncInput input, const float sample); + + // Adds "sample" to an already existing Item with the given "input". + void AddSample(const FuncInput input, const float sample); + + // Allow direct inspection of items[0..num_items-1] because accessor or + // ForeachItem functions are unsafe in a restricted header. + Item* items; // owned by this class, do not allocate/free. + size_t num_items; // safe to reset to zero. + + private: + friend void MeasureDurations(Func, DurationsForInputs*); + + const FuncInput* const inputs_; + const size_t num_inputs_; + const size_t max_durations_; + float* const all_durations_; +}; + +// Helper function to detect num_inputs from arrays. +template <size_t N> +static HH_INLINE DurationsForInputs MakeDurationsForInputs( + const FuncInput (&inputs)[N], const size_t max_durations) { + return DurationsForInputs(&inputs[0], N, max_durations); +} + +// Returns precise measurements of the cycles elapsed when calling "func" with +// each unique input value in "input_map", taking special care to maintain +// realistic branch prediction hit rates. +// +// "func" returns a 'proof of work' to ensure its computations are not elided. 
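+//
+// A minimal sketch of such a function (hypothetical; any computation whose
+// result depends on the input and is returned to the caller will do):
+//   FuncOutput Square(FuncInput x) { return static_cast<FuncOutput>(x) * x; }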
+void MeasureDurations(const Func func, DurationsForInputs* input_map); + +} // namespace highwayhash + +#endif // HIGHWAYHASH_NANOBENCHMARK_H_ diff --git a/contrib/libs/highwayhash/highwayhash/nanobenchmark_example.cc b/contrib/libs/highwayhash/highwayhash/nanobenchmark_example.cc index d95acf144a..f7b2269311 100644 --- a/contrib/libs/highwayhash/highwayhash/nanobenchmark_example.cc +++ b/contrib/libs/highwayhash/highwayhash/nanobenchmark_example.cc @@ -1,48 +1,48 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <cstddef> -#include <cstring> -#include <vector> - -#include "highwayhash/nanobenchmark.h" -#include "highwayhash/os_specific.h" - -namespace highwayhash { -namespace { - -uint64_t RegionToMeasure(FuncInput size) { - char from[8] = {static_cast<char>(size)}; - char to[8]; - memcpy(to, from, size); - return to[0]; -} - -void TestMemcpy() { - PinThreadToRandomCPU(); - static const size_t distribution[] = {3, 3, 4, 4, 7, 7, 8, 8}; - DurationsForInputs input_map = MakeDurationsForInputs(distribution, 10); - MeasureDurations(&RegionToMeasure, &input_map); - for (size_t i = 0; i < input_map.num_items; ++i) { - input_map.items[i].PrintMedianAndVariability(); - } -} - -} // namespace -} // namespace highwayhash - -int main(int argc, char* argv[]) { - highwayhash::TestMemcpy(); - return 0; -} +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <cstddef> +#include <cstring> +#include <vector> + +#include "highwayhash/nanobenchmark.h" +#include "highwayhash/os_specific.h" + +namespace highwayhash { +namespace { + +uint64_t RegionToMeasure(FuncInput size) { + char from[8] = {static_cast<char>(size)}; + char to[8]; + memcpy(to, from, size); + return to[0]; +} + +void TestMemcpy() { + PinThreadToRandomCPU(); + static const size_t distribution[] = {3, 3, 4, 4, 7, 7, 8, 8}; + DurationsForInputs input_map = MakeDurationsForInputs(distribution, 10); + MeasureDurations(&RegionToMeasure, &input_map); + for (size_t i = 0; i < input_map.num_items; ++i) { + input_map.items[i].PrintMedianAndVariability(); + } +} + +} // namespace +} // namespace highwayhash + +int main(int argc, char* argv[]) { + highwayhash::TestMemcpy(); + return 0; +} diff --git a/contrib/libs/highwayhash/highwayhash/os_specific.cc b/contrib/libs/highwayhash/highwayhash/os_specific.cc index c28b2c1ae3..5c877bc709 100644 --- a/contrib/libs/highwayhash/highwayhash/os_specific.cc +++ b/contrib/libs/highwayhash/highwayhash/os_specific.cc @@ -1,244 +1,244 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "highwayhash/os_specific.h" - -#include <algorithm> -#include <cstddef> -#include <cstdint> -#include <cstdio> -#include <cstdlib> -#include <ctime> -#include <random> - -#include "highwayhash/arch_specific.h" - -#if defined(_WIN32) || defined(_WIN64) -#define OS_WIN 1 -#define NOMINMAX -#include <windows.h> -#else -#define OS_WIN 0 -#endif - -#ifdef __linux__ -#define OS_LINUX 1 -#include <sched.h> -#include <sys/time.h> -#else -#define OS_LINUX 0 -#endif - +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "highwayhash/os_specific.h" + +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <cstdio> +#include <cstdlib> +#include <ctime> +#include <random> + +#include "highwayhash/arch_specific.h" + +#if defined(_WIN32) || defined(_WIN64) +#define OS_WIN 1 +#define NOMINMAX +#include <windows.h> +#else +#define OS_WIN 0 +#endif + +#ifdef __linux__ +#define OS_LINUX 1 +#include <sched.h> +#include <sys/time.h> +#else +#define OS_LINUX 0 +#endif + #if defined(__MACH__) || defined(__APPLE__) -#define OS_MAC 1 -#include <mach/mach.h> -#include <mach/mach_time.h> -#else -#define OS_MAC 0 -#endif - -#ifdef __FreeBSD__ -#define OS_FREEBSD 1 -#include <sys/cpuset.h> -#include <sys/param.h> -#include <unistd.h> -#else -#define OS_FREEBSD 0 -#endif - -namespace highwayhash { - -#define CHECK(condition) \ - while (!(condition)) { \ - printf("os_specific CHECK failed at line %d\n", __LINE__); \ - abort(); \ - } - -double Now() { -#if OS_WIN - LARGE_INTEGER counter; - (void)QueryPerformanceCounter(&counter); - static const double rcp_freq = []() { - LARGE_INTEGER freq; - (void)QueryPerformanceFrequency(&freq); - return 1.0 / freq.QuadPart; - }(); - return counter.QuadPart * rcp_freq; +#define OS_MAC 1 +#include <mach/mach.h> +#include <mach/mach_time.h> +#else +#define OS_MAC 0 +#endif + +#ifdef __FreeBSD__ +#define OS_FREEBSD 1 +#include <sys/cpuset.h> +#include <sys/param.h> +#include <unistd.h> +#else +#define OS_FREEBSD 0 +#endif + +namespace highwayhash { + +#define CHECK(condition) \ + while (!(condition)) { \ + printf("os_specific CHECK failed at line %d\n", __LINE__); \ + abort(); \ + } + +double Now() { +#if OS_WIN + LARGE_INTEGER counter; + (void)QueryPerformanceCounter(&counter); + static const double rcp_freq = []() { + LARGE_INTEGER freq; + (void)QueryPerformanceFrequency(&freq); + return 1.0 / freq.QuadPart; + }(); + return counter.QuadPart * rcp_freq; +#elif OS_MAC + const auto t = mach_absolute_time(); + // On OSX/iOS platform the elapsed time is cpu time unit + // We have to query the time base information to convert it back + // See https://developer.apple.com/library/mac/qa/qa1398/_index.html + static mach_timebase_info_data_t timebase; + if (timebase.denom == 0) { + (void)mach_timebase_info(&timebase); + } + return double(t) * timebase.numer / timebase.denom * 1E-9; +#else + timespec t; + clock_gettime(CLOCK_REALTIME, &t); + return t.tv_sec + t.tv_nsec * 1E-9; +#endif +} + +void RaiseThreadPriority() { +#if OS_WIN + BOOL ok = SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS); + CHECK(ok); + SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST); + CHECK(ok); +#elif OS_LINUX + // omit: SCHED_RR and SCHED_FIFO with sched_priority max, max-1 and max/2 + // lead to 2-3x runtime and higher variability! 
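+  // One way to request a realtime policy (hypothetical sketch; intentionally
+  // not done here, see the note above):
+  //   sched_param param;
+  //   param.sched_priority = sched_get_priority_max(SCHED_FIFO);
+  //   sched_setscheduler(0, SCHED_FIFO, &param);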
+#elif OS_FREEBSD #elif OS_MAC - const auto t = mach_absolute_time(); - // On OSX/iOS platform the elapsed time is cpu time unit - // We have to query the time base information to convert it back - // See https://developer.apple.com/library/mac/qa/qa1398/_index.html - static mach_timebase_info_data_t timebase; - if (timebase.denom == 0) { - (void)mach_timebase_info(&timebase); - } - return double(t) * timebase.numer / timebase.denom * 1E-9; -#else - timespec t; - clock_gettime(CLOCK_REALTIME, &t); - return t.tv_sec + t.tv_nsec * 1E-9; -#endif -} - -void RaiseThreadPriority() { -#if OS_WIN - BOOL ok = SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS); - CHECK(ok); - SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST); - CHECK(ok); -#elif OS_LINUX - // omit: SCHED_RR and SCHED_FIFO with sched_priority max, max-1 and max/2 - // lead to 2-3x runtime and higher variability! -#elif OS_FREEBSD +#else +#error "port" +#endif +} + +struct ThreadAffinity { +#if OS_WIN + DWORD_PTR mask; +#elif OS_LINUX + cpu_set_t set; +#elif OS_FREEBSD + cpuset_t set; +#endif +}; + +ThreadAffinity* GetThreadAffinity() { + ThreadAffinity* affinity = + static_cast<ThreadAffinity*>(malloc(sizeof(ThreadAffinity))); +#if OS_WIN + DWORD_PTR system_affinity; + const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &affinity->mask, + &system_affinity); + CHECK(ok); +#elif OS_LINUX + const pid_t pid = 0; // current thread + const int err = sched_getaffinity(pid, sizeof(cpu_set_t), &affinity->set); + CHECK(err == 0); +#elif OS_FREEBSD + const pid_t pid = getpid(); // current thread + const int err = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, + sizeof(cpuset_t), &affinity->set); + CHECK(err == 0); +#endif + return affinity; +} + +namespace { + +ThreadAffinity* OriginalThreadAffinity() { + static ThreadAffinity* original = GetThreadAffinity(); + return original; +} + +} // namespace + +void SetThreadAffinity(ThreadAffinity* affinity) { + // Ensure original is initialized before changing. 
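+  // (If its first call happened only after the affinity had been modified,
+  // AvailableCPUs would report the new mask rather than the initial one.)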
+ const ThreadAffinity* const original = OriginalThreadAffinity(); + CHECK(original != nullptr); + +#if OS_WIN + const HANDLE hThread = GetCurrentThread(); + const DWORD_PTR prev = SetThreadAffinityMask(hThread, affinity->mask); + CHECK(prev != 0); +#elif OS_LINUX + const pid_t pid = 0; // current thread + const int err = sched_setaffinity(pid, sizeof(cpu_set_t), &affinity->set); + CHECK(err == 0); +#elif OS_FREEBSD + const pid_t pid = getpid(); // current thread + const int err = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, + sizeof(cpuset_t), &affinity->set); + CHECK(err == 0); #elif OS_MAC -#else -#error "port" -#endif -} - -struct ThreadAffinity { -#if OS_WIN - DWORD_PTR mask; -#elif OS_LINUX - cpu_set_t set; -#elif OS_FREEBSD - cpuset_t set; -#endif -}; - -ThreadAffinity* GetThreadAffinity() { - ThreadAffinity* affinity = - static_cast<ThreadAffinity*>(malloc(sizeof(ThreadAffinity))); -#if OS_WIN - DWORD_PTR system_affinity; - const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &affinity->mask, - &system_affinity); - CHECK(ok); -#elif OS_LINUX - const pid_t pid = 0; // current thread - const int err = sched_getaffinity(pid, sizeof(cpu_set_t), &affinity->set); - CHECK(err == 0); -#elif OS_FREEBSD - const pid_t pid = getpid(); // current thread - const int err = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, - sizeof(cpuset_t), &affinity->set); - CHECK(err == 0); -#endif - return affinity; -} - -namespace { - -ThreadAffinity* OriginalThreadAffinity() { - static ThreadAffinity* original = GetThreadAffinity(); - return original; -} - -} // namespace - -void SetThreadAffinity(ThreadAffinity* affinity) { - // Ensure original is initialized before changing. - const ThreadAffinity* const original = OriginalThreadAffinity(); - CHECK(original != nullptr); - -#if OS_WIN - const HANDLE hThread = GetCurrentThread(); - const DWORD_PTR prev = SetThreadAffinityMask(hThread, affinity->mask); - CHECK(prev != 0); -#elif OS_LINUX - const pid_t pid = 0; // current thread - const int err = sched_setaffinity(pid, sizeof(cpu_set_t), &affinity->set); - CHECK(err == 0); -#elif OS_FREEBSD - const pid_t pid = getpid(); // current thread - const int err = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, - sizeof(cpuset_t), &affinity->set); - CHECK(err == 0); +#else +#error "port" +#endif +} + +std::vector<int> AvailableCPUs() { + std::vector<int> cpus; + cpus.reserve(64); + const ThreadAffinity* const affinity = OriginalThreadAffinity(); +#if OS_WIN + for (int cpu = 0; cpu < 64; ++cpu) { + if (affinity->mask & (1ULL << cpu)) { + cpus.push_back(cpu); + } + } +#elif OS_LINUX + for (size_t cpu = 0; cpu < sizeof(cpu_set_t) * 8; ++cpu) { + if (CPU_ISSET(cpu, &affinity->set)) { + cpus.push_back(cpu); + } + } +#elif OS_FREEBSD + for (size_t cpu = 0; cpu < sizeof(cpuset_t) * 8; ++cpu) { + if (CPU_ISSET(cpu, &affinity->set)) { + cpus.push_back(cpu); + } + } #elif OS_MAC -#else -#error "port" -#endif -} - -std::vector<int> AvailableCPUs() { - std::vector<int> cpus; - cpus.reserve(64); - const ThreadAffinity* const affinity = OriginalThreadAffinity(); -#if OS_WIN - for (int cpu = 0; cpu < 64; ++cpu) { - if (affinity->mask & (1ULL << cpu)) { - cpus.push_back(cpu); - } - } -#elif OS_LINUX - for (size_t cpu = 0; cpu < sizeof(cpu_set_t) * 8; ++cpu) { - if (CPU_ISSET(cpu, &affinity->set)) { - cpus.push_back(cpu); - } - } -#elif OS_FREEBSD - for (size_t cpu = 0; cpu < sizeof(cpuset_t) * 8; ++cpu) { - if (CPU_ISSET(cpu, &affinity->set)) { - cpus.push_back(cpu); - } - } +#else +#error "port" 
+#endif + return cpus; +} + +void PinThreadToCPU(const int cpu) { + ThreadAffinity affinity; +#if OS_WIN + affinity.mask = 1ULL << cpu; +#elif OS_LINUX + CPU_ZERO(&affinity.set); + CPU_SET(cpu, &affinity.set); +#elif OS_FREEBSD + CPU_ZERO(&affinity.set); + CPU_SET(cpu, &affinity.set); #elif OS_MAC -#else -#error "port" -#endif - return cpus; -} - -void PinThreadToCPU(const int cpu) { - ThreadAffinity affinity; -#if OS_WIN - affinity.mask = 1ULL << cpu; -#elif OS_LINUX - CPU_ZERO(&affinity.set); - CPU_SET(cpu, &affinity.set); -#elif OS_FREEBSD - CPU_ZERO(&affinity.set); - CPU_SET(cpu, &affinity.set); -#elif OS_MAC -#else -#error "port" -#endif - SetThreadAffinity(&affinity); -} - -void PinThreadToRandomCPU() { - std::vector<int> cpus = AvailableCPUs(); - - // Remove first two CPUs because interrupts are often pinned to them. - CHECK(cpus.size() > 2); - cpus.erase(cpus.begin(), cpus.begin() + 2); - - // Random choice to prevent burning up the same core. - std::random_device device; - std::ranlux48 generator(device()); - std::shuffle(cpus.begin(), cpus.end(), generator); - const int cpu = cpus.front(); - - PinThreadToCPU(cpu); - -#if HH_ARCH_X64 - // After setting affinity, we should be running on the desired CPU. - printf("Running on CPU #%d, APIC ID %02x\n", cpu, ApicId()); -#else - printf("Running on CPU #%d\n", cpu); -#endif -} - -} // namespace highwayhash +#else +#error "port" +#endif + SetThreadAffinity(&affinity); +} + +void PinThreadToRandomCPU() { + std::vector<int> cpus = AvailableCPUs(); + + // Remove first two CPUs because interrupts are often pinned to them. + CHECK(cpus.size() > 2); + cpus.erase(cpus.begin(), cpus.begin() + 2); + + // Random choice to prevent burning up the same core. + std::random_device device; + std::ranlux48 generator(device()); + std::shuffle(cpus.begin(), cpus.end(), generator); + const int cpu = cpus.front(); + + PinThreadToCPU(cpu); + +#if HH_ARCH_X64 + // After setting affinity, we should be running on the desired CPU. + printf("Running on CPU #%d, APIC ID %02x\n", cpu, ApicId()); +#else + printf("Running on CPU #%d\n", cpu); +#endif +} + +} // namespace highwayhash diff --git a/contrib/libs/highwayhash/highwayhash/os_specific.h b/contrib/libs/highwayhash/highwayhash/os_specific.h index 46f3c3e3ef..cefd3628e4 100644 --- a/contrib/libs/highwayhash/highwayhash/os_specific.h +++ b/contrib/libs/highwayhash/highwayhash/os_specific.h @@ -1,54 +1,54 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_OS_SPECIFIC_H_ -#define HIGHWAYHASH_OS_SPECIFIC_H_ - -#include <vector> - -namespace highwayhash { - -// Returns current wall-clock time [seconds]. -double Now(); - -// Sets this thread's priority to the maximum. This should not be called on -// single-core systems. Requires elevated permissions. No effect on Linux -// because it increases runtime and variability (issue #19). 
-void RaiseThreadPriority(); - -// Returns CPU numbers in [0, N), where N is the number of bits in the -// thread's initial affinity (unaffected by any SetThreadAffinity). -std::vector<int> AvailableCPUs(); - -// Opaque. -struct ThreadAffinity; - -// Caller must free() the return value. -ThreadAffinity* GetThreadAffinity(); - -// Restores a previous affinity returned by GetThreadAffinity. -void SetThreadAffinity(ThreadAffinity* affinity); - -// Ensures the thread is running on the specified cpu, and no others. -// Useful for reducing nanobenchmark variability (fewer context switches). -// Uses SetThreadAffinity. -void PinThreadToCPU(const int cpu); - -// Random choice of CPU avoids overloading any one core. -// Uses SetThreadAffinity. -void PinThreadToRandomCPU(); - -} // namespace highwayhash - -#endif // HIGHWAYHASH_OS_SPECIFIC_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_OS_SPECIFIC_H_ +#define HIGHWAYHASH_OS_SPECIFIC_H_ + +#include <vector> + +namespace highwayhash { + +// Returns current wall-clock time [seconds]. +double Now(); + +// Sets this thread's priority to the maximum. This should not be called on +// single-core systems. Requires elevated permissions. No effect on Linux +// because it increases runtime and variability (issue #19). +void RaiseThreadPriority(); + +// Returns CPU numbers in [0, N), where N is the number of bits in the +// thread's initial affinity (unaffected by any SetThreadAffinity). +std::vector<int> AvailableCPUs(); + +// Opaque. +struct ThreadAffinity; + +// Caller must free() the return value. +ThreadAffinity* GetThreadAffinity(); + +// Restores a previous affinity returned by GetThreadAffinity. +void SetThreadAffinity(ThreadAffinity* affinity); + +// Ensures the thread is running on the specified cpu, and no others. +// Useful for reducing nanobenchmark variability (fewer context switches). +// Uses SetThreadAffinity. +void PinThreadToCPU(const int cpu); + +// Random choice of CPU avoids overloading any one core. +// Uses SetThreadAffinity. +void PinThreadToRandomCPU(); + +} // namespace highwayhash + +#endif // HIGHWAYHASH_OS_SPECIFIC_H_ diff --git a/contrib/libs/highwayhash/highwayhash/profiler.h b/contrib/libs/highwayhash/highwayhash/profiler.h index 09da7e71a5..9e8f5f6958 100644 --- a/contrib/libs/highwayhash/highwayhash/profiler.h +++ b/contrib/libs/highwayhash/highwayhash/profiler.h @@ -1,754 +1,754 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_PROFILER_H_ -#define HIGHWAYHASH_PROFILER_H_ - -// High precision, low overhead time measurements. Returns exact call counts and -// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes). -// -// Usage: add this header to BUILD srcs; instrument regions of interest: -// { PROFILER_ZONE("name"); /*code*/ } or -// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }. -// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to -// print call counts and average durations [CPU cycles] to stdout, sorted in -// descending order of total duration. - -// Configuration settings: - -// If zero, this file has no effect and no measurements will be recorded. -#ifndef PROFILER_ENABLED -#define PROFILER_ENABLED 1 -#endif - -// How many mebibytes to allocate (if PROFILER_ENABLED) per thread that -// enters at least one zone. Once this buffer is full, the thread will analyze -// and discard packets, thus temporarily adding some observer overhead. -// Each zone occupies 16 bytes. -#ifndef PROFILER_THREAD_STORAGE -#define PROFILER_THREAD_STORAGE 200ULL -#endif - -#if PROFILER_ENABLED - -#include <algorithm> // min/max -#include <atomic> -#include <cassert> -#include <cstddef> // ptrdiff_t -#include <cstdint> -#include <cstdio> -#include <cstdlib> -#include <cstring> // memcpy -#include <new> - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" - -// Non-portable aspects: -// - SSE2 128-bit load/store (write-combining, UpdateOrAdd) -// - RDTSCP timestamps (serializing, high-resolution) -// - assumes string literals are stored within an 8 MiB range -// - compiler-specific annotations (restrict, alignment, fences) -#if HH_ARCH_X64 -#include <emmintrin.h> -#if HH_MSC_VERSION -#include <intrin.h> -#else -#include <x86intrin.h> -#endif -#endif - -#include "highwayhash/robust_statistics.h" -#include "highwayhash/tsc_timer.h" - -#define PROFILER_CHECK(condition) \ - while (!(condition)) { \ - printf("Profiler check failed at line %d\n", __LINE__); \ - abort(); \ - } - -namespace highwayhash { - -// Upper bounds for various fixed-size data structures (guarded via assert): - -// How many threads can actually enter a zone (those that don't do not count). -// Memory use is about kMaxThreads * PROFILER_THREAD_STORAGE MiB. -// WARNING: a fiber library can spawn hundreds of threads. -static constexpr size_t kMaxThreads = 128; - -// Maximum nesting of zones. -static constexpr size_t kMaxDepth = 64; - -// Total number of zones. -static constexpr size_t kMaxZones = 256; - -// Functions that depend on the cache line size. -class CacheAligned { - public: - static constexpr size_t kPointerSize = sizeof(void*); - static constexpr size_t kCacheLineSize = 64; - - static void* Allocate(const size_t bytes) { - char* const allocated = static_cast<char*>(malloc(bytes + kCacheLineSize)); - if (allocated == nullptr) { - return nullptr; - } - const uintptr_t misalignment = - reinterpret_cast<uintptr_t>(allocated) & (kCacheLineSize - 1); - // malloc is at least kPointerSize aligned, so we can store the "allocated" - // pointer immediately before the aligned memory. - assert(misalignment % kPointerSize == 0); - char* const aligned = allocated + kCacheLineSize - misalignment; - memcpy(aligned - kPointerSize, &allocated, kPointerSize); - return aligned; - } - - // Template allows freeing pointer-to-const. 
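-  // Free recovers the original pointer that Allocate stored immediately
-  // before the aligned block, then hands it back to free().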
- template <typename T> - static void Free(T* aligned_pointer) { - if (aligned_pointer == nullptr) { - return; - } - const char* const aligned = reinterpret_cast<const char*>(aligned_pointer); - assert(reinterpret_cast<uintptr_t>(aligned) % kCacheLineSize == 0); - char* allocated; - memcpy(&allocated, aligned - kPointerSize, kPointerSize); - assert(allocated <= aligned - kPointerSize); - assert(allocated >= aligned - kCacheLineSize); - free(allocated); - } - -#if HH_ARCH_X64 - // Overwrites "to" without loading it into the cache (read-for-ownership). - template <typename T> - static void StreamCacheLine(const T* from_items, T* to_items) { - const __m128i* const from = reinterpret_cast<const __m128i*>(from_items); - __m128i* const to = reinterpret_cast<__m128i*>(to_items); - HH_COMPILER_FENCE; - const __m128i v0 = _mm_load_si128(from + 0); - const __m128i v1 = _mm_load_si128(from + 1); - const __m128i v2 = _mm_load_si128(from + 2); - const __m128i v3 = _mm_load_si128(from + 3); - // Fences prevent the compiler from reordering loads/stores, which may - // interfere with write-combining. - HH_COMPILER_FENCE; - _mm_stream_si128(to + 0, v0); - _mm_stream_si128(to + 1, v1); - _mm_stream_si128(to + 2, v2); - _mm_stream_si128(to + 3, v3); - HH_COMPILER_FENCE; - } -#endif -}; - -// Represents zone entry/exit events. Stores a full-resolution timestamp plus -// an offset (representing zone name or identifying exit packets). POD. -class Packet { - public: - // If offsets do not fit, UpdateOrAdd will overrun our heap allocation - // (governed by kMaxZones). We have seen multi-megabyte offsets. - static constexpr size_t kOffsetBits = 25; - static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1); - - // We need full-resolution timestamps; at an effective rate of 4 GHz, - // this permits 1 minute zone durations (for longer durations, split into - // multiple zones). Wraparound is handled by masking. - static constexpr size_t kTimestampBits = 64 - kOffsetBits; - static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1; - - static Packet Make(const size_t biased_offset, const uint64_t timestamp) { - assert(biased_offset < (1ULL << kOffsetBits)); - - Packet packet; - packet.bits_ = - (biased_offset << kTimestampBits) + (timestamp & kTimestampMask); - return packet; - } - - uint64_t Timestamp() const { return bits_ & kTimestampMask; } - - size_t BiasedOffset() const { return (bits_ >> kTimestampBits); } - - private: - uint64_t bits_; -}; -static_assert(sizeof(Packet) == 8, "Wrong Packet size"); - -// Returns the address of a string literal. Assuming zone names are also -// literals and stored nearby, we can represent them as offsets, which are -// faster to compute than hashes or even a static index. -// -// This function must not be static - each call (even from other translation -// units) must return the same value. -inline const char* StringOrigin() { - // Chosen such that no zone name is a prefix nor suffix of this string - // to ensure they aren't merged (offset 0 identifies zone-exit packets). - static const char* string_origin = "__#__"; - return string_origin - Packet::kOffsetBias; -} - -// Representation of an active zone, stored in a stack. Used to deduct -// child duration from the parent's self time. POD. -struct Node { - Packet packet; - uint64_t child_total; -}; - -// Holds statistics for all zones with the same name. POD. 
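-// The biased name offset lives in the upper bits of num_calls so that the
-// whole struct is 16 bytes and fits a single vector load/store.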
-struct Accumulator { - static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits; - - uint64_t BiasedOffset() const { return num_calls >> kNumCallBits; } - uint64_t NumCalls() const { return num_calls & ((1ULL << kNumCallBits) - 1); } - - // UpdateOrAdd relies upon this layout. - uint64_t num_calls = 0; // upper bits = biased_offset. - uint64_t total_duration = 0; -}; -#if HH_ARCH_X64 -static_assert(sizeof(Accumulator) == sizeof(__m128i), "Wrong Accumulator size"); -#endif - -template <typename T> -inline T ClampedSubtract(const T minuend, const T subtrahend) { - if (subtrahend > minuend) { - return 0; - } - return minuend - subtrahend; -} - -// Per-thread call graph (stack) and Accumulator for each zone. -class Results { - public: - Results() { - // Zero-initialize first accumulator to avoid a check for num_zones_ == 0. - memset(zones_, 0, sizeof(Accumulator)); - } - - // Used for computing overhead when this thread encounters its first Zone. - // This has no observable effect apart from increasing "analyze_elapsed_". - uint64_t ZoneDuration(const Packet* packets) { - PROFILER_CHECK(depth_ == 0); - PROFILER_CHECK(num_zones_ == 0); - AnalyzePackets(packets, 2); - const uint64_t duration = zones_[0].total_duration; - zones_[0].num_calls = 0; - zones_[0].total_duration = 0; - PROFILER_CHECK(depth_ == 0); - num_zones_ = 0; - return duration; - } - - void SetSelfOverhead(const uint64_t self_overhead) { - self_overhead_ = self_overhead; - } - - void SetChildOverhead(const uint64_t child_overhead) { - child_overhead_ = child_overhead; - } - - // Draw all required information from the packets, which can be discarded - // afterwards. Called whenever this thread's storage is full. - void AnalyzePackets(const Packet* packets, const size_t num_packets) { - const uint64_t t0 = Start<uint64_t>(); - - for (size_t i = 0; i < num_packets; ++i) { - const Packet p = packets[i]; - // Entering a zone - if (p.BiasedOffset() != Packet::kOffsetBias) { - assert(depth_ < kMaxDepth); - nodes_[depth_].packet = p; - nodes_[depth_].child_total = 0; - ++depth_; - continue; - } - - assert(depth_ != 0); - const Node& node = nodes_[depth_ - 1]; - // Masking correctly handles unsigned wraparound. - const uint64_t duration = - (p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask; - const uint64_t self_duration = ClampedSubtract( - duration, self_overhead_ + child_overhead_ + node.child_total); - - UpdateOrAdd(node.packet.BiasedOffset(), self_duration); - --depth_; - - // Deduct this nested node's time from its parent's self_duration. - if (depth_ != 0) { - nodes_[depth_ - 1].child_total += duration + child_overhead_; - } - } - - const uint64_t t1 = Stop<uint64_t>(); - analyze_elapsed_ += t1 - t0; - } - - // Incorporates results from another thread. Call after all threads have - // exited any zones. - void Assimilate(const Results& other) { - const uint64_t t0 = Start<uint64_t>(); - assert(depth_ == 0); - assert(other.depth_ == 0); - - for (size_t i = 0; i < other.num_zones_; ++i) { - const Accumulator& zone = other.zones_[i]; - UpdateOrAdd(zone.BiasedOffset(), zone.total_duration); - } - const uint64_t t1 = Stop<uint64_t>(); - analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_; - } - - // Single-threaded. - void Print() { - const uint64_t t0 = Start<uint64_t>(); - MergeDuplicates(); - - // Sort by decreasing total (self) cost. 
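-    // (total_duration is self time only; AnalyzePackets already deducted
-    // each child zone's duration before calling UpdateOrAdd.)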
- std::sort(zones_, zones_ + num_zones_, - [](const Accumulator& r1, const Accumulator& r2) { - return r1.total_duration > r2.total_duration; - }); - - const char* string_origin = StringOrigin(); - for (size_t i = 0; i < num_zones_; ++i) { - const Accumulator& r = zones_[i]; - const uint64_t num_calls = r.NumCalls(); - printf("%40s: %10zu x %15zu = %15zu\n", string_origin + r.BiasedOffset(), - num_calls, r.total_duration / num_calls, r.total_duration); - } - - const uint64_t t1 = Stop<uint64_t>(); - analyze_elapsed_ += t1 - t0; - printf("Total clocks during analysis: %zu\n", analyze_elapsed_); - } - - private: -#if HH_ARCH_X64 - static bool SameOffset(const __m128i& zone, const size_t biased_offset) { - const uint64_t num_calls = _mm_cvtsi128_si64(zone); - return (num_calls >> Accumulator::kNumCallBits) == biased_offset; - } -#endif - - // Updates an existing Accumulator (uniquely identified by biased_offset) or - // adds one if this is the first time this thread analyzed that zone. - // Uses a self-organizing list data structure, which avoids dynamic memory - // allocations and is far faster than unordered_map. Loads, updates and - // stores the entire Accumulator with vector instructions. - void UpdateOrAdd(const size_t biased_offset, const uint64_t duration) { - assert(biased_offset < (1ULL << Packet::kOffsetBits)); - -#if HH_ARCH_X64 - const __m128i one_64 = _mm_set1_epi64x(1); - const __m128i duration_64 = _mm_cvtsi64_si128(duration); - const __m128i add_duration_call = _mm_unpacklo_epi64(one_64, duration_64); - - __m128i* const HH_RESTRICT zones = reinterpret_cast<__m128i*>(zones_); - - // Special case for first zone: (maybe) update, without swapping. - __m128i prev = _mm_load_si128(zones); - if (SameOffset(prev, biased_offset)) { - prev = _mm_add_epi64(prev, add_duration_call); - assert(SameOffset(prev, biased_offset)); - _mm_store_si128(zones, prev); - return; - } - - // Look for a zone with the same offset. - for (size_t i = 1; i < num_zones_; ++i) { - __m128i zone = _mm_load_si128(zones + i); - if (SameOffset(zone, biased_offset)) { - zone = _mm_add_epi64(zone, add_duration_call); - assert(SameOffset(zone, biased_offset)); - // Swap with predecessor (more conservative than move to front, - // but at least as successful). - _mm_store_si128(zones + i - 1, zone); - _mm_store_si128(zones + i, prev); - return; - } - prev = zone; - } - - // Not found; create a new Accumulator. - const __m128i biased_offset_64 = _mm_slli_epi64( - _mm_cvtsi64_si128(biased_offset), Accumulator::kNumCallBits); - const __m128i zone = _mm_add_epi64(biased_offset_64, add_duration_call); - assert(SameOffset(zone, biased_offset)); - - assert(num_zones_ < kMaxZones); - _mm_store_si128(zones + num_zones_, zone); - ++num_zones_; -#else - // Special case for first zone: (maybe) update, without swapping. - if (zones_[0].BiasedOffset() == biased_offset) { - zones_[0].total_duration += duration; - zones_[0].num_calls += 1; - assert(zones_[0].BiasedOffset() == biased_offset); - return; - } - - // Look for a zone with the same offset. - for (size_t i = 1; i < num_zones_; ++i) { - if (zones_[i].BiasedOffset() == biased_offset) { - zones_[i].total_duration += duration; - zones_[i].num_calls += 1; - assert(zones_[i].BiasedOffset() == biased_offset); - // Swap with predecessor (more conservative than move to front, - // but at least as successful). - const Accumulator prev = zones_[i - 1]; - zones_[i - 1] = zones_[i]; - zones_[i] = prev; - return; - } - } - - // Not found; create a new Accumulator. 
- assert(num_zones_ < kMaxZones); - Accumulator* HH_RESTRICT zone = zones_ + num_zones_; - zone->num_calls = (biased_offset << Accumulator::kNumCallBits) + 1; - zone->total_duration = duration; - assert(zone->BiasedOffset() == biased_offset); - ++num_zones_; -#endif - } - - // Each instantiation of a function template seems to get its own copy of - // __func__ and GCC doesn't merge them. An N^2 search for duplicates is - // acceptable because we only expect a few dozen zones. - void MergeDuplicates() { - const char* string_origin = StringOrigin(); - for (size_t i = 0; i < num_zones_; ++i) { - const size_t biased_offset = zones_[i].BiasedOffset(); - const char* name = string_origin + biased_offset; - // Separate num_calls from biased_offset so we can add them together. - uint64_t num_calls = zones_[i].NumCalls(); - - // Add any subsequent duplicates to num_calls and total_duration. - for (size_t j = i + 1; j < num_zones_;) { - if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) { - num_calls += zones_[j].NumCalls(); - zones_[i].total_duration += zones_[j].total_duration; - // Fill hole with last item. - zones_[j] = zones_[--num_zones_]; - } else { // Name differed, try next Accumulator. - ++j; - } - } - - assert(num_calls < (1ULL << Accumulator::kNumCallBits)); - - // Re-pack regardless of whether any duplicates were found. - zones_[i].num_calls = - (biased_offset << Accumulator::kNumCallBits) + num_calls; - } - } - - uint64_t analyze_elapsed_ = 0; - uint64_t self_overhead_ = 0; - uint64_t child_overhead_ = 0; - - size_t depth_ = 0; // Number of active zones. - size_t num_zones_ = 0; // Number of retired zones. - - HH_ALIGNAS(64) Node nodes_[kMaxDepth]; // Stack - HH_ALIGNAS(64) Accumulator zones_[kMaxZones]; // Self-organizing list -}; - -// Per-thread packet storage, allocated via CacheAligned. -class ThreadSpecific { - static constexpr size_t kBufferCapacity = - CacheAligned::kCacheLineSize / sizeof(Packet); - - public: - // "name" is used to sanity-check offsets fit in kOffsetBits. - explicit ThreadSpecific(const char* name) - : packets_(static_cast<Packet*>( - CacheAligned::Allocate(PROFILER_THREAD_STORAGE << 20))), - num_packets_(0), - max_packets_(PROFILER_THREAD_STORAGE << 17), - string_origin_(StringOrigin()) { - // Even in optimized builds (with NDEBUG), verify that this zone's name - // offset fits within the allotted space. If not, UpdateOrAdd is likely to - // overrun zones_[]. We also assert(), but users often do not run debug - // builds. Checking here on the cold path (only reached once per thread) - // is cheap, but it only covers one zone. - const size_t biased_offset = name - string_origin_; - PROFILER_CHECK(biased_offset <= (1ULL << Packet::kOffsetBits)); - } - - ~ThreadSpecific() { CacheAligned::Free(packets_); } - - // Depends on Zone => defined below. - void ComputeOverhead(); - - void WriteEntry(const char* name, const uint64_t timestamp) { - const size_t biased_offset = name - string_origin_; - Write(Packet::Make(biased_offset, timestamp)); - } - - void WriteExit(const uint64_t timestamp) { - const size_t biased_offset = Packet::kOffsetBias; - Write(Packet::Make(biased_offset, timestamp)); - } - - void AnalyzeRemainingPackets() { -#if HH_ARCH_X64 - // Ensures prior weakly-ordered streaming stores are globally visible. - _mm_sfence(); - - // Storage full => empty it. 
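-    // (Analyzing first guarantees the write-combining buffer's packets,
-    // appended just below, still fit in storage.)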
- if (num_packets_ + buffer_size_ > max_packets_) { - results_.AnalyzePackets(packets_, num_packets_); - num_packets_ = 0; - } - memcpy(packets_ + num_packets_, buffer_, buffer_size_ * sizeof(Packet)); - num_packets_ += buffer_size_; -#endif - - results_.AnalyzePackets(packets_, num_packets_); - num_packets_ = 0; - } - - Results& GetResults() { return results_; } - - private: - // Write packet to buffer/storage, emptying them as needed. - void Write(const Packet packet) { -#if HH_ARCH_X64 - // Buffer full => copy to storage. - if (buffer_size_ == kBufferCapacity) { - // Storage full => empty it. - if (num_packets_ + kBufferCapacity > max_packets_) { - results_.AnalyzePackets(packets_, num_packets_); - num_packets_ = 0; - } - // This buffering halves observer overhead and decreases the overall - // runtime by about 3%. - CacheAligned::StreamCacheLine(buffer_, packets_ + num_packets_); - num_packets_ += kBufferCapacity; - buffer_size_ = 0; - } - buffer_[buffer_size_] = packet; - ++buffer_size_; -#else - // Write directly to storage. - if (num_packets_ >= max_packets_) { - results_.AnalyzePackets(packets_, num_packets_); - num_packets_ = 0; - } - packets_[num_packets_] = packet; - ++num_packets_; -#endif - } - - // Write-combining buffer to avoid cache pollution. Must be the first - // non-static member to ensure cache-line alignment. -#if HH_ARCH_X64 - Packet buffer_[kBufferCapacity]; - size_t buffer_size_ = 0; -#endif - - // Contiguous storage for zone enter/exit packets. - Packet* const HH_RESTRICT packets_; - size_t num_packets_; - const size_t max_packets_; - // Cached here because we already read this cache line on zone entry/exit. - const char* HH_RESTRICT string_origin_; - Results results_; -}; - -class ThreadList { - public: - // Thread-safe. - void Add(ThreadSpecific* const ts) { - const uint32_t index = num_threads_.fetch_add(1); - PROFILER_CHECK(index < kMaxThreads); - threads_[index] = ts; - } - - // Single-threaded. - void PrintResults() { - const uint32_t num_threads = num_threads_.load(); - for (uint32_t i = 0; i < num_threads; ++i) { - threads_[i]->AnalyzeRemainingPackets(); - } - - // Combine all threads into a single Result. - for (uint32_t i = 1; i < num_threads; ++i) { - threads_[0]->GetResults().Assimilate(threads_[i]->GetResults()); - } - - if (num_threads != 0) { - threads_[0]->GetResults().Print(); - } - } - - private: - // Owning pointers. - HH_ALIGNAS(64) ThreadSpecific* threads_[kMaxThreads]; - std::atomic<uint32_t> num_threads_{0}; -}; - -// RAII zone enter/exit recorder constructed by the ZONE macro; also -// responsible for initializing ThreadSpecific. -class Zone { - public: - // "name" must be a string literal (see StringOrigin). - HH_NOINLINE explicit Zone(const char* name) { - HH_COMPILER_FENCE; - ThreadSpecific* HH_RESTRICT thread_specific = StaticThreadSpecific(); - if (HH_UNLIKELY(thread_specific == nullptr)) { - void* mem = CacheAligned::Allocate(sizeof(ThreadSpecific)); - thread_specific = new (mem) ThreadSpecific(name); - // Must happen before ComputeOverhead, which re-enters this ctor. - Threads().Add(thread_specific); - StaticThreadSpecific() = thread_specific; - thread_specific->ComputeOverhead(); - } - - // (Capture timestamp ASAP, not inside WriteEntry.) 
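-    // (Reading the timestamp inside WriteEntry would fold the call prologue
-    // into the zone's start time.)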
- HH_COMPILER_FENCE; - const uint64_t timestamp = Start<uint64_t>(); - thread_specific->WriteEntry(name, timestamp); - } - - HH_NOINLINE ~Zone() { - HH_COMPILER_FENCE; - const uint64_t timestamp = Stop<uint64_t>(); - StaticThreadSpecific()->WriteExit(timestamp); - HH_COMPILER_FENCE; - } - - // Call exactly once after all threads have exited all zones. - static void PrintResults() { Threads().PrintResults(); } - - private: - // Returns reference to the thread's ThreadSpecific pointer (initially null). - // Function-local static avoids needing a separate definition. - static ThreadSpecific*& StaticThreadSpecific() { - static thread_local ThreadSpecific* thread_specific; - return thread_specific; - } - - // Returns the singleton ThreadList. Non time-critical. - static ThreadList& Threads() { - static ThreadList threads_; - return threads_; - } -}; - -// Creates a zone starting from here until the end of the current scope. -// Timestamps will be recorded when entering and exiting the zone. -// "name" must be a string literal, which is ensured by merging with "". -#define PROFILER_ZONE(name) \ - HH_COMPILER_FENCE; \ - const Zone zone("" name); \ - HH_COMPILER_FENCE - -// Creates a zone for an entire function (when placed at its beginning). -// Shorter/more convenient than ZONE. -#define PROFILER_FUNC \ - HH_COMPILER_FENCE; \ - const Zone zone(__func__); \ - HH_COMPILER_FENCE - -#define PROFILER_PRINT_RESULTS Zone::PrintResults - -inline void ThreadSpecific::ComputeOverhead() { - // Delay after capturing timestamps before/after the actual zone runs. Even - // with frequency throttling disabled, this has a multimodal distribution, - // including 32, 34, 48, 52, 59, 62. - uint64_t self_overhead; - { - const size_t kNumSamples = 32; - uint32_t samples[kNumSamples]; - for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { - const size_t kNumDurations = 1024; - uint32_t durations[kNumDurations]; - - for (size_t idx_duration = 0; idx_duration < kNumDurations; - ++idx_duration) { - { PROFILER_ZONE("Dummy Zone (never shown)"); } -#if HH_ARCH_X64 - const uint64_t duration = results_.ZoneDuration(buffer_); - buffer_size_ = 0; -#else - const uint64_t duration = results_.ZoneDuration(packets_); - num_packets_ = 0; -#endif - durations[idx_duration] = static_cast<uint32_t>(duration); - PROFILER_CHECK(num_packets_ == 0); - } - CountingSort(durations, durations + kNumDurations); - samples[idx_sample] = Mode(durations, kNumDurations); - } - // Median. - CountingSort(samples, samples + kNumSamples); - self_overhead = samples[kNumSamples / 2]; - printf("Overhead: %zu\n", self_overhead); - results_.SetSelfOverhead(self_overhead); - } - - // Delay before capturing start timestamp / after end timestamp. - const size_t kNumSamples = 32; - uint32_t samples[kNumSamples]; - for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { - const size_t kNumDurations = 16; - uint32_t durations[kNumDurations]; - for (size_t idx_duration = 0; idx_duration < kNumDurations; - ++idx_duration) { - const size_t kReps = 10000; - // Analysis time should not be included => must fit within buffer. 
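      // (Each PROFILER_ZONE writes one entry and one exit packet, so kReps
      // iterations emit kReps * 2 = 20,000 packets; with the default 200 MiB
      // of per-thread storage, max_packets_ = 200 << 17 = 26,214,400.)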
- PROFILER_CHECK(kReps * 2 < max_packets_); -#if HH_ARCH_X64 - _mm_mfence(); -#endif - const uint64_t t0 = Start<uint64_t>(); - for (size_t i = 0; i < kReps; ++i) { - PROFILER_ZONE("Dummy"); - } -#if HH_ARCH_X64 - _mm_sfence(); -#endif - const uint64_t t1 = Stop<uint64_t>(); -#if HH_ARCH_X64 - PROFILER_CHECK(num_packets_ + buffer_size_ == kReps * 2); - buffer_size_ = 0; -#else - PROFILER_CHECK(num_packets_ == kReps * 2); -#endif - num_packets_ = 0; - const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps; - durations[idx_duration] = - static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead)); - } - CountingSort(durations, durations + kNumDurations); - samples[idx_sample] = Mode(durations, kNumDurations); - } - CountingSort(samples, samples + kNumSamples); - const uint64_t child_overhead = samples[9 * kNumSamples / 10]; - printf("Child overhead: %zu\n", child_overhead); - results_.SetChildOverhead(child_overhead); -} - -} // namespace highwayhash - -#else // !PROFILER_ENABLED -#define PROFILER_ZONE(name) -#define PROFILER_FUNC -#define PROFILER_PRINT_RESULTS() -#endif - -#endif // HIGHWAYHASH_PROFILER_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_PROFILER_H_ +#define HIGHWAYHASH_PROFILER_H_ + +// High precision, low overhead time measurements. Returns exact call counts and +// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes). +// +// Usage: add this header to BUILD srcs; instrument regions of interest: +// { PROFILER_ZONE("name"); /*code*/ } or +// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }. +// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to +// print call counts and average durations [CPU cycles] to stdout, sorted in +// descending order of total duration. + +// Configuration settings: + +// If zero, this file has no effect and no measurements will be recorded. +#ifndef PROFILER_ENABLED +#define PROFILER_ENABLED 1 +#endif + +// How many mebibytes to allocate (if PROFILER_ENABLED) per thread that +// enters at least one zone. Once this buffer is full, the thread will analyze +// and discard packets, thus temporarily adding some observer overhead. +// Each zone occupies 16 bytes. 
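A minimal sketch of an instrumented translation unit, following the usage
notes above (the function body and the override value are illustrative only;
as in profiler_example.cc, the instrumented code sits inside namespace
highwayhash so that the macros' unqualified "Zone" resolves):

    #define PROFILER_THREAD_STORAGE 8  // optional: 8 MiB/thread instead of 200
    #include "highwayhash/profiler.h"

    namespace highwayhash {
    void FuncToMeasure() {
      PROFILER_FUNC;  // zone spanning the whole function
      {
        PROFILER_ZONE("inner");  // nested zone; name must be a string literal
        // ... code of interest ...
      }
    }
    }  // namespace highwayhash

    int main() {
      highwayhash::FuncToMeasure();
      // Call once, after all threads have exited all zones; equivalent to
      // PROFILER_PRINT_RESULTS() inside namespace highwayhash.
      highwayhash::Zone::PrintResults();
      return 0;
    }
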
+#ifndef PROFILER_THREAD_STORAGE +#define PROFILER_THREAD_STORAGE 200ULL +#endif + +#if PROFILER_ENABLED + +#include <algorithm> // min/max +#include <atomic> +#include <cassert> +#include <cstddef> // ptrdiff_t +#include <cstdint> +#include <cstdio> +#include <cstdlib> +#include <cstring> // memcpy +#include <new> + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" + +// Non-portable aspects: +// - SSE2 128-bit load/store (write-combining, UpdateOrAdd) +// - RDTSCP timestamps (serializing, high-resolution) +// - assumes string literals are stored within an 8 MiB range +// - compiler-specific annotations (restrict, alignment, fences) +#if HH_ARCH_X64 +#include <emmintrin.h> +#if HH_MSC_VERSION +#include <intrin.h> +#else +#include <x86intrin.h> +#endif +#endif + +#include "highwayhash/robust_statistics.h" +#include "highwayhash/tsc_timer.h" + +#define PROFILER_CHECK(condition) \ + while (!(condition)) { \ + printf("Profiler check failed at line %d\n", __LINE__); \ + abort(); \ + } + +namespace highwayhash { + +// Upper bounds for various fixed-size data structures (guarded via assert): + +// How many threads can actually enter a zone (those that don't do not count). +// Memory use is about kMaxThreads * PROFILER_THREAD_STORAGE MiB. +// WARNING: a fiber library can spawn hundreds of threads. +static constexpr size_t kMaxThreads = 128; + +// Maximum nesting of zones. +static constexpr size_t kMaxDepth = 64; + +// Total number of zones. +static constexpr size_t kMaxZones = 256; + +// Functions that depend on the cache line size. +class CacheAligned { + public: + static constexpr size_t kPointerSize = sizeof(void*); + static constexpr size_t kCacheLineSize = 64; + + static void* Allocate(const size_t bytes) { + char* const allocated = static_cast<char*>(malloc(bytes + kCacheLineSize)); + if (allocated == nullptr) { + return nullptr; + } + const uintptr_t misalignment = + reinterpret_cast<uintptr_t>(allocated) & (kCacheLineSize - 1); + // malloc is at least kPointerSize aligned, so we can store the "allocated" + // pointer immediately before the aligned memory. + assert(misalignment % kPointerSize == 0); + char* const aligned = allocated + kCacheLineSize - misalignment; + memcpy(aligned - kPointerSize, &allocated, kPointerSize); + return aligned; + } + + // Template allows freeing pointer-to-const. + template <typename T> + static void Free(T* aligned_pointer) { + if (aligned_pointer == nullptr) { + return; + } + const char* const aligned = reinterpret_cast<const char*>(aligned_pointer); + assert(reinterpret_cast<uintptr_t>(aligned) % kCacheLineSize == 0); + char* allocated; + memcpy(&allocated, aligned - kPointerSize, kPointerSize); + assert(allocated <= aligned - kPointerSize); + assert(allocated >= aligned - kCacheLineSize); + free(allocated); + } + +#if HH_ARCH_X64 + // Overwrites "to" without loading it into the cache (read-for-ownership). + template <typename T> + static void StreamCacheLine(const T* from_items, T* to_items) { + const __m128i* const from = reinterpret_cast<const __m128i*>(from_items); + __m128i* const to = reinterpret_cast<__m128i*>(to_items); + HH_COMPILER_FENCE; + const __m128i v0 = _mm_load_si128(from + 0); + const __m128i v1 = _mm_load_si128(from + 1); + const __m128i v2 = _mm_load_si128(from + 2); + const __m128i v3 = _mm_load_si128(from + 3); + // Fences prevent the compiler from reordering loads/stores, which may + // interfere with write-combining. 
+ HH_COMPILER_FENCE; + _mm_stream_si128(to + 0, v0); + _mm_stream_si128(to + 1, v1); + _mm_stream_si128(to + 2, v2); + _mm_stream_si128(to + 3, v3); + HH_COMPILER_FENCE; + } +#endif +}; + +// Represents zone entry/exit events. Stores a full-resolution timestamp plus +// an offset (representing zone name or identifying exit packets). POD. +class Packet { + public: + // If offsets do not fit, UpdateOrAdd will overrun our heap allocation + // (governed by kMaxZones). We have seen multi-megabyte offsets. + static constexpr size_t kOffsetBits = 25; + static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1); + + // We need full-resolution timestamps; at an effective rate of 4 GHz, + // this permits 1 minute zone durations (for longer durations, split into + // multiple zones). Wraparound is handled by masking. + static constexpr size_t kTimestampBits = 64 - kOffsetBits; + static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1; + + static Packet Make(const size_t biased_offset, const uint64_t timestamp) { + assert(biased_offset < (1ULL << kOffsetBits)); + + Packet packet; + packet.bits_ = + (biased_offset << kTimestampBits) + (timestamp & kTimestampMask); + return packet; + } + + uint64_t Timestamp() const { return bits_ & kTimestampMask; } + + size_t BiasedOffset() const { return (bits_ >> kTimestampBits); } + + private: + uint64_t bits_; +}; +static_assert(sizeof(Packet) == 8, "Wrong Packet size"); + +// Returns the address of a string literal. Assuming zone names are also +// literals and stored nearby, we can represent them as offsets, which are +// faster to compute than hashes or even a static index. +// +// This function must not be static - each call (even from other translation +// units) must return the same value. +inline const char* StringOrigin() { + // Chosen such that no zone name is a prefix nor suffix of this string + // to ensure they aren't merged (offset 0 identifies zone-exit packets). + static const char* string_origin = "__#__"; + return string_origin - Packet::kOffsetBias; +} + +// Representation of an active zone, stored in a stack. Used to deduct +// child duration from the parent's self time. POD. +struct Node { + Packet packet; + uint64_t child_total; +}; + +// Holds statistics for all zones with the same name. POD. +struct Accumulator { + static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits; + + uint64_t BiasedOffset() const { return num_calls >> kNumCallBits; } + uint64_t NumCalls() const { return num_calls & ((1ULL << kNumCallBits) - 1); } + + // UpdateOrAdd relies upon this layout. + uint64_t num_calls = 0; // upper bits = biased_offset. + uint64_t total_duration = 0; +}; +#if HH_ARCH_X64 +static_assert(sizeof(Accumulator) == sizeof(__m128i), "Wrong Accumulator size"); +#endif + +template <typename T> +inline T ClampedSubtract(const T minuend, const T subtrahend) { + if (subtrahend > minuend) { + return 0; + } + return minuend - subtrahend; +} + +// Per-thread call graph (stack) and Accumulator for each zone. +class Results { + public: + Results() { + // Zero-initialize first accumulator to avoid a check for num_zones_ == 0. + memset(zones_, 0, sizeof(Accumulator)); + } + + // Used for computing overhead when this thread encounters its first Zone. + // This has no observable effect apart from increasing "analyze_elapsed_". 
+ uint64_t ZoneDuration(const Packet* packets) { + PROFILER_CHECK(depth_ == 0); + PROFILER_CHECK(num_zones_ == 0); + AnalyzePackets(packets, 2); + const uint64_t duration = zones_[0].total_duration; + zones_[0].num_calls = 0; + zones_[0].total_duration = 0; + PROFILER_CHECK(depth_ == 0); + num_zones_ = 0; + return duration; + } + + void SetSelfOverhead(const uint64_t self_overhead) { + self_overhead_ = self_overhead; + } + + void SetChildOverhead(const uint64_t child_overhead) { + child_overhead_ = child_overhead; + } + + // Draw all required information from the packets, which can be discarded + // afterwards. Called whenever this thread's storage is full. + void AnalyzePackets(const Packet* packets, const size_t num_packets) { + const uint64_t t0 = Start<uint64_t>(); + + for (size_t i = 0; i < num_packets; ++i) { + const Packet p = packets[i]; + // Entering a zone + if (p.BiasedOffset() != Packet::kOffsetBias) { + assert(depth_ < kMaxDepth); + nodes_[depth_].packet = p; + nodes_[depth_].child_total = 0; + ++depth_; + continue; + } + + assert(depth_ != 0); + const Node& node = nodes_[depth_ - 1]; + // Masking correctly handles unsigned wraparound. + const uint64_t duration = + (p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask; + const uint64_t self_duration = ClampedSubtract( + duration, self_overhead_ + child_overhead_ + node.child_total); + + UpdateOrAdd(node.packet.BiasedOffset(), self_duration); + --depth_; + + // Deduct this nested node's time from its parent's self_duration. + if (depth_ != 0) { + nodes_[depth_ - 1].child_total += duration + child_overhead_; + } + } + + const uint64_t t1 = Stop<uint64_t>(); + analyze_elapsed_ += t1 - t0; + } + + // Incorporates results from another thread. Call after all threads have + // exited any zones. + void Assimilate(const Results& other) { + const uint64_t t0 = Start<uint64_t>(); + assert(depth_ == 0); + assert(other.depth_ == 0); + + for (size_t i = 0; i < other.num_zones_; ++i) { + const Accumulator& zone = other.zones_[i]; + UpdateOrAdd(zone.BiasedOffset(), zone.total_duration); + } + const uint64_t t1 = Stop<uint64_t>(); + analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_; + } + + // Single-threaded. + void Print() { + const uint64_t t0 = Start<uint64_t>(); + MergeDuplicates(); + + // Sort by decreasing total (self) cost. + std::sort(zones_, zones_ + num_zones_, + [](const Accumulator& r1, const Accumulator& r2) { + return r1.total_duration > r2.total_duration; + }); + + const char* string_origin = StringOrigin(); + for (size_t i = 0; i < num_zones_; ++i) { + const Accumulator& r = zones_[i]; + const uint64_t num_calls = r.NumCalls(); + printf("%40s: %10zu x %15zu = %15zu\n", string_origin + r.BiasedOffset(), + num_calls, r.total_duration / num_calls, r.total_duration); + } + + const uint64_t t1 = Stop<uint64_t>(); + analyze_elapsed_ += t1 - t0; + printf("Total clocks during analysis: %zu\n", analyze_elapsed_); + } + + private: +#if HH_ARCH_X64 + static bool SameOffset(const __m128i& zone, const size_t biased_offset) { + const uint64_t num_calls = _mm_cvtsi128_si64(zone); + return (num_calls >> Accumulator::kNumCallBits) == biased_offset; + } +#endif + + // Updates an existing Accumulator (uniquely identified by biased_offset) or + // adds one if this is the first time this thread analyzed that zone. + // Uses a self-organizing list data structure, which avoids dynamic memory + // allocations and is far faster than unordered_map. Loads, updates and + // stores the entire Accumulator with vector instructions. 
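  // Illustrative trace (not from the source): with zones [A, B, C] in
  // zones_[] and an update hitting C, C swaps with its predecessor to give
  // [A, C, B]; frequently updated zones thus migrate toward the front and
  // are found after fewer probes. Transposing adapts more slowly than
  // move-to-front but never demotes a zone by more than one slot per update.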
+ void UpdateOrAdd(const size_t biased_offset, const uint64_t duration) { + assert(biased_offset < (1ULL << Packet::kOffsetBits)); + +#if HH_ARCH_X64 + const __m128i one_64 = _mm_set1_epi64x(1); + const __m128i duration_64 = _mm_cvtsi64_si128(duration); + const __m128i add_duration_call = _mm_unpacklo_epi64(one_64, duration_64); + + __m128i* const HH_RESTRICT zones = reinterpret_cast<__m128i*>(zones_); + + // Special case for first zone: (maybe) update, without swapping. + __m128i prev = _mm_load_si128(zones); + if (SameOffset(prev, biased_offset)) { + prev = _mm_add_epi64(prev, add_duration_call); + assert(SameOffset(prev, biased_offset)); + _mm_store_si128(zones, prev); + return; + } + + // Look for a zone with the same offset. + for (size_t i = 1; i < num_zones_; ++i) { + __m128i zone = _mm_load_si128(zones + i); + if (SameOffset(zone, biased_offset)) { + zone = _mm_add_epi64(zone, add_duration_call); + assert(SameOffset(zone, biased_offset)); + // Swap with predecessor (more conservative than move to front, + // but at least as successful). + _mm_store_si128(zones + i - 1, zone); + _mm_store_si128(zones + i, prev); + return; + } + prev = zone; + } + + // Not found; create a new Accumulator. + const __m128i biased_offset_64 = _mm_slli_epi64( + _mm_cvtsi64_si128(biased_offset), Accumulator::kNumCallBits); + const __m128i zone = _mm_add_epi64(biased_offset_64, add_duration_call); + assert(SameOffset(zone, biased_offset)); + + assert(num_zones_ < kMaxZones); + _mm_store_si128(zones + num_zones_, zone); + ++num_zones_; +#else + // Special case for first zone: (maybe) update, without swapping. + if (zones_[0].BiasedOffset() == biased_offset) { + zones_[0].total_duration += duration; + zones_[0].num_calls += 1; + assert(zones_[0].BiasedOffset() == biased_offset); + return; + } + + // Look for a zone with the same offset. + for (size_t i = 1; i < num_zones_; ++i) { + if (zones_[i].BiasedOffset() == biased_offset) { + zones_[i].total_duration += duration; + zones_[i].num_calls += 1; + assert(zones_[i].BiasedOffset() == biased_offset); + // Swap with predecessor (more conservative than move to front, + // but at least as successful). + const Accumulator prev = zones_[i - 1]; + zones_[i - 1] = zones_[i]; + zones_[i] = prev; + return; + } + } + + // Not found; create a new Accumulator. + assert(num_zones_ < kMaxZones); + Accumulator* HH_RESTRICT zone = zones_ + num_zones_; + zone->num_calls = (biased_offset << Accumulator::kNumCallBits) + 1; + zone->total_duration = duration; + assert(zone->BiasedOffset() == biased_offset); + ++num_zones_; +#endif + } + + // Each instantiation of a function template seems to get its own copy of + // __func__ and GCC doesn't merge them. An N^2 search for duplicates is + // acceptable because we only expect a few dozen zones. + void MergeDuplicates() { + const char* string_origin = StringOrigin(); + for (size_t i = 0; i < num_zones_; ++i) { + const size_t biased_offset = zones_[i].BiasedOffset(); + const char* name = string_origin + biased_offset; + // Separate num_calls from biased_offset so we can add them together. + uint64_t num_calls = zones_[i].NumCalls(); + + // Add any subsequent duplicates to num_calls and total_duration. + for (size_t j = i + 1; j < num_zones_;) { + if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) { + num_calls += zones_[j].NumCalls(); + zones_[i].total_duration += zones_[j].total_duration; + // Fill hole with last item. + zones_[j] = zones_[--num_zones_]; + } else { // Name differed, try next Accumulator. 
+ ++j; + } + } + + assert(num_calls < (1ULL << Accumulator::kNumCallBits)); + + // Re-pack regardless of whether any duplicates were found. + zones_[i].num_calls = + (biased_offset << Accumulator::kNumCallBits) + num_calls; + } + } + + uint64_t analyze_elapsed_ = 0; + uint64_t self_overhead_ = 0; + uint64_t child_overhead_ = 0; + + size_t depth_ = 0; // Number of active zones. + size_t num_zones_ = 0; // Number of retired zones. + + HH_ALIGNAS(64) Node nodes_[kMaxDepth]; // Stack + HH_ALIGNAS(64) Accumulator zones_[kMaxZones]; // Self-organizing list +}; + +// Per-thread packet storage, allocated via CacheAligned. +class ThreadSpecific { + static constexpr size_t kBufferCapacity = + CacheAligned::kCacheLineSize / sizeof(Packet); + + public: + // "name" is used to sanity-check offsets fit in kOffsetBits. + explicit ThreadSpecific(const char* name) + : packets_(static_cast<Packet*>( + CacheAligned::Allocate(PROFILER_THREAD_STORAGE << 20))), + num_packets_(0), + max_packets_(PROFILER_THREAD_STORAGE << 17), + string_origin_(StringOrigin()) { + // Even in optimized builds (with NDEBUG), verify that this zone's name + // offset fits within the allotted space. If not, UpdateOrAdd is likely to + // overrun zones_[]. We also assert(), but users often do not run debug + // builds. Checking here on the cold path (only reached once per thread) + // is cheap, but it only covers one zone. + const size_t biased_offset = name - string_origin_; + PROFILER_CHECK(biased_offset <= (1ULL << Packet::kOffsetBits)); + } + + ~ThreadSpecific() { CacheAligned::Free(packets_); } + + // Depends on Zone => defined below. + void ComputeOverhead(); + + void WriteEntry(const char* name, const uint64_t timestamp) { + const size_t biased_offset = name - string_origin_; + Write(Packet::Make(biased_offset, timestamp)); + } + + void WriteExit(const uint64_t timestamp) { + const size_t biased_offset = Packet::kOffsetBias; + Write(Packet::Make(biased_offset, timestamp)); + } + + void AnalyzeRemainingPackets() { +#if HH_ARCH_X64 + // Ensures prior weakly-ordered streaming stores are globally visible. + _mm_sfence(); + + // Storage full => empty it. + if (num_packets_ + buffer_size_ > max_packets_) { + results_.AnalyzePackets(packets_, num_packets_); + num_packets_ = 0; + } + memcpy(packets_ + num_packets_, buffer_, buffer_size_ * sizeof(Packet)); + num_packets_ += buffer_size_; +#endif + + results_.AnalyzePackets(packets_, num_packets_); + num_packets_ = 0; + } + + Results& GetResults() { return results_; } + + private: + // Write packet to buffer/storage, emptying them as needed. + void Write(const Packet packet) { +#if HH_ARCH_X64 + // Buffer full => copy to storage. + if (buffer_size_ == kBufferCapacity) { + // Storage full => empty it. + if (num_packets_ + kBufferCapacity > max_packets_) { + results_.AnalyzePackets(packets_, num_packets_); + num_packets_ = 0; + } + // This buffering halves observer overhead and decreases the overall + // runtime by about 3%. + CacheAligned::StreamCacheLine(buffer_, packets_ + num_packets_); + num_packets_ += kBufferCapacity; + buffer_size_ = 0; + } + buffer_[buffer_size_] = packet; + ++buffer_size_; +#else + // Write directly to storage. + if (num_packets_ >= max_packets_) { + results_.AnalyzePackets(packets_, num_packets_); + num_packets_ = 0; + } + packets_[num_packets_] = packet; + ++num_packets_; +#endif + } + + // Write-combining buffer to avoid cache pollution. Must be the first + // non-static member to ensure cache-line alignment. 
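  // (kBufferCapacity = kCacheLineSize / sizeof(Packet) = 64 / 8 = 8 packets;
  // each flush therefore streams exactly one 64-byte line via the four
  // 16-byte non-temporal stores in StreamCacheLine.)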
+#if HH_ARCH_X64 + Packet buffer_[kBufferCapacity]; + size_t buffer_size_ = 0; +#endif + + // Contiguous storage for zone enter/exit packets. + Packet* const HH_RESTRICT packets_; + size_t num_packets_; + const size_t max_packets_; + // Cached here because we already read this cache line on zone entry/exit. + const char* HH_RESTRICT string_origin_; + Results results_; +}; + +class ThreadList { + public: + // Thread-safe. + void Add(ThreadSpecific* const ts) { + const uint32_t index = num_threads_.fetch_add(1); + PROFILER_CHECK(index < kMaxThreads); + threads_[index] = ts; + } + + // Single-threaded. + void PrintResults() { + const uint32_t num_threads = num_threads_.load(); + for (uint32_t i = 0; i < num_threads; ++i) { + threads_[i]->AnalyzeRemainingPackets(); + } + + // Combine all threads into a single Result. + for (uint32_t i = 1; i < num_threads; ++i) { + threads_[0]->GetResults().Assimilate(threads_[i]->GetResults()); + } + + if (num_threads != 0) { + threads_[0]->GetResults().Print(); + } + } + + private: + // Owning pointers. + HH_ALIGNAS(64) ThreadSpecific* threads_[kMaxThreads]; + std::atomic<uint32_t> num_threads_{0}; +}; + +// RAII zone enter/exit recorder constructed by the ZONE macro; also +// responsible for initializing ThreadSpecific. +class Zone { + public: + // "name" must be a string literal (see StringOrigin). + HH_NOINLINE explicit Zone(const char* name) { + HH_COMPILER_FENCE; + ThreadSpecific* HH_RESTRICT thread_specific = StaticThreadSpecific(); + if (HH_UNLIKELY(thread_specific == nullptr)) { + void* mem = CacheAligned::Allocate(sizeof(ThreadSpecific)); + thread_specific = new (mem) ThreadSpecific(name); + // Must happen before ComputeOverhead, which re-enters this ctor. + Threads().Add(thread_specific); + StaticThreadSpecific() = thread_specific; + thread_specific->ComputeOverhead(); + } + + // (Capture timestamp ASAP, not inside WriteEntry.) + HH_COMPILER_FENCE; + const uint64_t timestamp = Start<uint64_t>(); + thread_specific->WriteEntry(name, timestamp); + } + + HH_NOINLINE ~Zone() { + HH_COMPILER_FENCE; + const uint64_t timestamp = Stop<uint64_t>(); + StaticThreadSpecific()->WriteExit(timestamp); + HH_COMPILER_FENCE; + } + + // Call exactly once after all threads have exited all zones. + static void PrintResults() { Threads().PrintResults(); } + + private: + // Returns reference to the thread's ThreadSpecific pointer (initially null). + // Function-local static avoids needing a separate definition. + static ThreadSpecific*& StaticThreadSpecific() { + static thread_local ThreadSpecific* thread_specific; + return thread_specific; + } + + // Returns the singleton ThreadList. Non time-critical. + static ThreadList& Threads() { + static ThreadList threads_; + return threads_; + } +}; + +// Creates a zone starting from here until the end of the current scope. +// Timestamps will be recorded when entering and exiting the zone. +// "name" must be a string literal, which is ensured by merging with "". +#define PROFILER_ZONE(name) \ + HH_COMPILER_FENCE; \ + const Zone zone("" name); \ + HH_COMPILER_FENCE + +// Creates a zone for an entire function (when placed at its beginning). +// Shorter/more convenient than ZONE. +#define PROFILER_FUNC \ + HH_COMPILER_FENCE; \ + const Zone zone(__func__); \ + HH_COMPILER_FENCE + +#define PROFILER_PRINT_RESULTS Zone::PrintResults + +inline void ThreadSpecific::ComputeOverhead() { + // Delay after capturing timestamps before/after the actual zone runs. 
Even + // with frequency throttling disabled, this has a multimodal distribution, + // including 32, 34, 48, 52, 59, 62. + uint64_t self_overhead; + { + const size_t kNumSamples = 32; + uint32_t samples[kNumSamples]; + for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { + const size_t kNumDurations = 1024; + uint32_t durations[kNumDurations]; + + for (size_t idx_duration = 0; idx_duration < kNumDurations; + ++idx_duration) { + { PROFILER_ZONE("Dummy Zone (never shown)"); } +#if HH_ARCH_X64 + const uint64_t duration = results_.ZoneDuration(buffer_); + buffer_size_ = 0; +#else + const uint64_t duration = results_.ZoneDuration(packets_); + num_packets_ = 0; +#endif + durations[idx_duration] = static_cast<uint32_t>(duration); + PROFILER_CHECK(num_packets_ == 0); + } + CountingSort(durations, durations + kNumDurations); + samples[idx_sample] = Mode(durations, kNumDurations); + } + // Median. + CountingSort(samples, samples + kNumSamples); + self_overhead = samples[kNumSamples / 2]; + printf("Overhead: %zu\n", self_overhead); + results_.SetSelfOverhead(self_overhead); + } + + // Delay before capturing start timestamp / after end timestamp. + const size_t kNumSamples = 32; + uint32_t samples[kNumSamples]; + for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { + const size_t kNumDurations = 16; + uint32_t durations[kNumDurations]; + for (size_t idx_duration = 0; idx_duration < kNumDurations; + ++idx_duration) { + const size_t kReps = 10000; + // Analysis time should not be included => must fit within buffer. + PROFILER_CHECK(kReps * 2 < max_packets_); +#if HH_ARCH_X64 + _mm_mfence(); +#endif + const uint64_t t0 = Start<uint64_t>(); + for (size_t i = 0; i < kReps; ++i) { + PROFILER_ZONE("Dummy"); + } +#if HH_ARCH_X64 + _mm_sfence(); +#endif + const uint64_t t1 = Stop<uint64_t>(); +#if HH_ARCH_X64 + PROFILER_CHECK(num_packets_ + buffer_size_ == kReps * 2); + buffer_size_ = 0; +#else + PROFILER_CHECK(num_packets_ == kReps * 2); +#endif + num_packets_ = 0; + const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps; + durations[idx_duration] = + static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead)); + } + CountingSort(durations, durations + kNumDurations); + samples[idx_sample] = Mode(durations, kNumDurations); + } + CountingSort(samples, samples + kNumSamples); + const uint64_t child_overhead = samples[9 * kNumSamples / 10]; + printf("Child overhead: %zu\n", child_overhead); + results_.SetChildOverhead(child_overhead); +} + +} // namespace highwayhash + +#else // !PROFILER_ENABLED +#define PROFILER_ZONE(name) +#define PROFILER_FUNC +#define PROFILER_PRINT_RESULTS() +#endif + +#endif // HIGHWAYHASH_PROFILER_H_ diff --git a/contrib/libs/highwayhash/highwayhash/profiler_example.cc b/contrib/libs/highwayhash/highwayhash/profiler_example.cc index 999cc4581f..9d97066ec9 100644 --- a/contrib/libs/highwayhash/highwayhash/profiler_example.cc +++ b/contrib/libs/highwayhash/highwayhash/profiler_example.cc @@ -1,97 +1,97 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include <cassert> -#include <cmath> -#include <cstdlib> - -#include "highwayhash/os_specific.h" -#include "highwayhash/profiler.h" - -namespace highwayhash { -namespace { - -void Spin(const double min_time) { - const double t0 = Now(); - for (;;) { - const double elapsed = Now() - t0; - if (elapsed > min_time) { - break; - } - } -} - -void Spin10() { - PROFILER_FUNC; - Spin(10E-6); -} - -void Spin20() { - PROFILER_FUNC; - Spin(20E-6); -} - -void Spin3060() { - { - PROFILER_ZONE("spin30"); - Spin(30E-6); - } - { - PROFILER_ZONE("spin60"); - Spin(60E-6); - } -} - -void Level3() { - PROFILER_FUNC; - for (int rep = 0; rep < 10; ++rep) { - double total = 0.0; - for (int i = 0; i < 100 - rep; ++i) { - total += pow(0.9, i); - } - if (std::abs(total - 9.999) > 1E-2) { - abort(); - } - } -} - -void Level2() { - PROFILER_FUNC; - Level3(); -} - -void Level1() { - PROFILER_FUNC; - Level2(); -} - -void ProfilerExample() { - PinThreadToRandomCPU(); - { - PROFILER_FUNC; - Spin10(); - Spin20(); - Spin3060(); - Level1(); - } - PROFILER_PRINT_RESULTS(); -} - -} // namespace -} // namespace highwayhash - -int main(int argc, char* argv[]) { - highwayhash::ProfilerExample(); - return 0; -} +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cassert> +#include <cmath> +#include <cstdlib> + +#include "highwayhash/os_specific.h" +#include "highwayhash/profiler.h" + +namespace highwayhash { +namespace { + +void Spin(const double min_time) { + const double t0 = Now(); + for (;;) { + const double elapsed = Now() - t0; + if (elapsed > min_time) { + break; + } + } +} + +void Spin10() { + PROFILER_FUNC; + Spin(10E-6); +} + +void Spin20() { + PROFILER_FUNC; + Spin(20E-6); +} + +void Spin3060() { + { + PROFILER_ZONE("spin30"); + Spin(30E-6); + } + { + PROFILER_ZONE("spin60"); + Spin(60E-6); + } +} + +void Level3() { + PROFILER_FUNC; + for (int rep = 0; rep < 10; ++rep) { + double total = 0.0; + for (int i = 0; i < 100 - rep; ++i) { + total += pow(0.9, i); + } + if (std::abs(total - 9.999) > 1E-2) { + abort(); + } + } +} + +void Level2() { + PROFILER_FUNC; + Level3(); +} + +void Level1() { + PROFILER_FUNC; + Level2(); +} + +void ProfilerExample() { + PinThreadToRandomCPU(); + { + PROFILER_FUNC; + Spin10(); + Spin20(); + Spin3060(); + Level1(); + } + PROFILER_PRINT_RESULTS(); +} + +} // namespace +} // namespace highwayhash + +int main(int argc, char* argv[]) { + highwayhash::ProfilerExample(); + return 0; +} diff --git a/contrib/libs/highwayhash/highwayhash/robust_statistics.h b/contrib/libs/highwayhash/highwayhash/robust_statistics.h index 4e45494f9b..9c4a0b4cd5 100644 --- a/contrib/libs/highwayhash/highwayhash/robust_statistics.h +++ b/contrib/libs/highwayhash/highwayhash/robust_statistics.h @@ -1,135 +1,135 @@ -// Copyright 2017 Google Inc. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_ROBUST_STATISTICS_H_ -#define HIGHWAYHASH_ROBUST_STATISTICS_H_ - -// Robust statistics: Mode, Median, MedianAbsoluteDeviation. - -#include <stddef.h> -#include <algorithm> -#include <cassert> -#include <cmath> -#include <limits> -#include <vector> - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" - -namespace highwayhash { - -// @return i in [idx_begin, idx_begin + half_count) that minimizes -// sorted[i + half_count] - sorted[i]. -template <typename T> -size_t MinRange(const T* const HH_RESTRICT sorted, const size_t idx_begin, - const size_t half_count) { - T min_range = std::numeric_limits<T>::max(); - size_t min_idx = 0; - - for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) { - assert(sorted[idx] <= sorted[idx + half_count]); - const T range = sorted[idx + half_count] - sorted[idx]; - if (range < min_range) { - min_range = range; - min_idx = idx; - } - } - - return min_idx; -} - -// Returns an estimate of the mode by calling MinRange on successively -// halved intervals. "sorted" must be in ascending order. This is the -// Half Sample Mode estimator proposed by Bickel in "On a fast, robust -// estimator of the mode", with complexity O(N log N). The mode is less -// affected by outliers in highly-skewed distributions than the median. -// The averaging operation below assumes "T" is an unsigned integer type. -template <typename T> -T Mode(const T* const HH_RESTRICT sorted, const size_t num_values) { - size_t idx_begin = 0; - size_t half_count = num_values / 2; - while (half_count > 1) { - idx_begin = MinRange(sorted, idx_begin, half_count); - half_count >>= 1; - } - - const T x = sorted[idx_begin + 0]; - if (half_count == 0) { - return x; - } - assert(half_count == 1); - const T average = (x + sorted[idx_begin + 1] + 1) / 2; - return average; -} - -// Sorts integral values in ascending order. About 3x faster than std::sort for -// input distributions with very few unique values. -template <class T> -void CountingSort(T* begin, T* end) { - // Unique values and their frequency (similar to flat_map). - using Unique = std::pair<T, int>; - std::vector<Unique> unique; - for (const T* p = begin; p != end; ++p) { - const T value = *p; - const auto pos = - std::find_if(unique.begin(), unique.end(), - [value](const Unique& u) { return u.first == value; }); - if (pos == unique.end()) { - unique.push_back(std::make_pair(*p, 1)); - } else { - ++pos->second; - } - } - - // Sort in ascending order of value (pair.first). - std::sort(unique.begin(), unique.end()); - - // Write that many copies of each unique value to the array. - T* HH_RESTRICT p = begin; - for (const auto& value_count : unique) { - std::fill(p, p + value_count.second, value_count.first); - p += value_count.second; - } - assert(p == end); -} - -// Returns the median value. Side effect: sorts "samples". 
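A short usage sketch with hypothetical values (note how little the outlier
30 moves either statistic):

    std::vector<int> samples = {9, 5, 7, 30, 8};
    const int median = Median(&samples);  // sorts; odd count, returns 8
    const int mad = MedianAbsoluteDeviation(samples, median);
    // absolute deviations {3, 1, 0, 1, 22}, whose median is 1
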
-template <typename T> -T Median(std::vector<T>* samples) { - assert(!samples->empty()); - std::sort(samples->begin(), samples->end()); - const size_t half = samples->size() / 2; - // Odd count: return middle - if (samples->size() % 2) { - return (*samples)[half]; - } - // Even count: return average of middle two. - return ((*samples)[half] + (*samples)[half - 1]) / 2; -} - -// Returns a robust measure of variability. -template <typename T> -T MedianAbsoluteDeviation(const std::vector<T>& samples, const T median) { - assert(!samples.empty()); - std::vector<T> abs_deviations; - abs_deviations.reserve(samples.size()); - for (const T sample : samples) { - abs_deviations.push_back(std::abs(sample - median)); - } - return Median(&abs_deviations); -} - -} // namespace highwayhash - -#endif // HIGHWAYHASH_ROBUST_STATISTICS_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_ROBUST_STATISTICS_H_ +#define HIGHWAYHASH_ROBUST_STATISTICS_H_ + +// Robust statistics: Mode, Median, MedianAbsoluteDeviation. + +#include <stddef.h> +#include <algorithm> +#include <cassert> +#include <cmath> +#include <limits> +#include <vector> + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" + +namespace highwayhash { + +// @return i in [idx_begin, idx_begin + half_count) that minimizes +// sorted[i + half_count] - sorted[i]. +template <typename T> +size_t MinRange(const T* const HH_RESTRICT sorted, const size_t idx_begin, + const size_t half_count) { + T min_range = std::numeric_limits<T>::max(); + size_t min_idx = 0; + + for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) { + assert(sorted[idx] <= sorted[idx + half_count]); + const T range = sorted[idx + half_count] - sorted[idx]; + if (range < min_range) { + min_range = range; + min_idx = idx; + } + } + + return min_idx; +} + +// Returns an estimate of the mode by calling MinRange on successively +// halved intervals. "sorted" must be in ascending order. This is the +// Half Sample Mode estimator proposed by Bickel in "On a fast, robust +// estimator of the mode", with complexity O(N log N). The mode is less +// affected by outliers in highly-skewed distributions than the median. +// The averaging operation below assumes "T" is an unsigned integer type. +template <typename T> +T Mode(const T* const HH_RESTRICT sorted, const size_t num_values) { + size_t idx_begin = 0; + size_t half_count = num_values / 2; + while (half_count > 1) { + idx_begin = MinRange(sorted, idx_begin, half_count); + half_count >>= 1; + } + + const T x = sorted[idx_begin + 0]; + if (half_count == 0) { + return x; + } + assert(half_count == 1); + const T average = (x + sorted[idx_begin + 1] + 1) / 2; + return average; +} + +// Sorts integral values in ascending order. About 3x faster than std::sort for +// input distributions with very few unique values. 
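Such distributions arise here: the duration samples sorted in
ComputeOverhead cluster around a handful of modes (e.g. 32, 34, 48, ...).
A hypothetical call:

    uint32_t d[6] = {52, 48, 52, 34, 48, 52};
    CountingSort(d, d + 6);  // d is now {34, 48, 48, 52, 52, 52}

Since the scan over "unique" is linear, the whole sort is O(n * u) for u
unique values, a win only while u stays small relative to n.
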
+template <class T> +void CountingSort(T* begin, T* end) { + // Unique values and their frequency (similar to flat_map). + using Unique = std::pair<T, int>; + std::vector<Unique> unique; + for (const T* p = begin; p != end; ++p) { + const T value = *p; + const auto pos = + std::find_if(unique.begin(), unique.end(), + [value](const Unique& u) { return u.first == value; }); + if (pos == unique.end()) { + unique.push_back(std::make_pair(*p, 1)); + } else { + ++pos->second; + } + } + + // Sort in ascending order of value (pair.first). + std::sort(unique.begin(), unique.end()); + + // Write that many copies of each unique value to the array. + T* HH_RESTRICT p = begin; + for (const auto& value_count : unique) { + std::fill(p, p + value_count.second, value_count.first); + p += value_count.second; + } + assert(p == end); +} + +// Returns the median value. Side effect: sorts "samples". +template <typename T> +T Median(std::vector<T>* samples) { + assert(!samples->empty()); + std::sort(samples->begin(), samples->end()); + const size_t half = samples->size() / 2; + // Odd count: return middle + if (samples->size() % 2) { + return (*samples)[half]; + } + // Even count: return average of middle two. + return ((*samples)[half] + (*samples)[half - 1]) / 2; +} + +// Returns a robust measure of variability. +template <typename T> +T MedianAbsoluteDeviation(const std::vector<T>& samples, const T median) { + assert(!samples.empty()); + std::vector<T> abs_deviations; + abs_deviations.reserve(samples.size()); + for (const T sample : samples) { + abs_deviations.push_back(std::abs(sample - median)); + } + return Median(&abs_deviations); +} + +} // namespace highwayhash + +#endif // HIGHWAYHASH_ROBUST_STATISTICS_H_ diff --git a/contrib/libs/highwayhash/highwayhash/scalar.h b/contrib/libs/highwayhash/highwayhash/scalar.h index 72ccae727e..eb7bac9c1d 100644 --- a/contrib/libs/highwayhash/highwayhash/scalar.h +++ b/contrib/libs/highwayhash/highwayhash/scalar.h @@ -1,352 +1,352 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_SCALAR_H_ -#define HIGHWAYHASH_SCALAR_H_ - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include <stddef.h> // size_t -#include <stdint.h> - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" - -namespace highwayhash { -// To prevent ODR violations when including this from multiple translation -// units (TU) that are compiled with different flags, the contents must reside -// in a namespace whose name is unique to the TU. NOTE: this behavior is -// incompatible with precompiled modules and requires textual inclusion instead. -namespace HH_TARGET_NAME { - -// Single-lane "vector" type with the same interface as V128/Scalar. 
Allows the -// same client template to generate both SIMD and portable code. -template <typename Type> -class Scalar { - public: - struct Intrinsic { - Type t; - }; - - using T = Type; - static constexpr size_t N = 1; - - // Leaves v_ uninitialized - typically used for output parameters. - HH_INLINE Scalar() {} - - HH_INLINE explicit Scalar(const T t) : v_(t) {} - - HH_INLINE Scalar(const Scalar<T>& other) : v_(other.v_) {} - - HH_INLINE Scalar& operator=(const Scalar<T>& other) { - v_ = other.v_; - return *this; - } - - // Convert from/to intrinsics. - HH_INLINE Scalar(const Intrinsic& v) : v_(v.t) {} - HH_INLINE Scalar& operator=(const Intrinsic& v) { - v_ = v.t; - return *this; - } - HH_INLINE operator Intrinsic() const { return {v_}; } - - HH_INLINE Scalar operator==(const Scalar& other) const { - Scalar eq; - eq.FillWithByte(v_ == other.v_ ? 0xFF : 0x00); - return eq; - } - HH_INLINE Scalar operator<(const Scalar& other) const { - Scalar lt; - lt.FillWithByte(v_ < other.v_ ? 0xFF : 0x00); - return lt; - } - HH_INLINE Scalar operator>(const Scalar& other) const { - Scalar gt; - gt.FillWithByte(v_ > other.v_ ? 0xFF : 0x00); - return gt; - } - - HH_INLINE Scalar& operator*=(const Scalar& other) { - v_ *= other.v_; - return *this; - } - HH_INLINE Scalar& operator/=(const Scalar& other) { - v_ /= other.v_; - return *this; - } - HH_INLINE Scalar& operator+=(const Scalar& other) { - v_ += other.v_; - return *this; - } - HH_INLINE Scalar& operator-=(const Scalar& other) { - v_ -= other.v_; - return *this; - } - - HH_INLINE Scalar& operator&=(const Scalar& other) { - v_ &= other.v_; - return *this; - } - HH_INLINE Scalar& operator|=(const Scalar& other) { - v_ |= other.v_; - return *this; - } - HH_INLINE Scalar& operator^=(const Scalar& other) { - v_ ^= other.v_; - return *this; - } - - HH_INLINE Scalar& operator<<=(const int count) { - // In C, int64_t << 64 is undefined, but we want to match the sensible - // behavior of SSE2 (zeroing). - if (count >= sizeof(T) * 8) { - v_ = 0; - } else { - v_ <<= count; - } - return *this; - } - - HH_INLINE Scalar& operator>>=(const int count) { - if (count >= sizeof(T) * 8) { - v_ = 0; - } else { - v_ >>= count; - } - return *this; - } - - // For internal use only. We need to avoid memcpy/memset because this is a - // restricted header. - void FillWithByte(const unsigned char value) { - unsigned char* bytes = reinterpret_cast<unsigned char*>(&v_); - for (size_t i = 0; i < sizeof(T); ++i) { - bytes[i] = value; - } - } - - void CopyTo(unsigned char* HH_RESTRICT to_bytes) const { - const unsigned char* from_bytes = - reinterpret_cast<const unsigned char*>(&v_); - for (size_t i = 0; i < sizeof(T); ++i) { - to_bytes[i] = from_bytes[i]; - } - } - - private: - T v_; -}; - -// Non-member operators. 
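These let client templates use ordinary expression syntax for both the
scalar and SIMD targets, e.g. (hypothetical snippet; V1x64U is the
Scalar<uint64_t> alias defined below):

    V1x64U a(2), b(3);
    const V1x64U sum = a + b;               // holds 5
    const V1x64U masked = sum & V1x64U(1);  // holds 1
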
- -template <typename T> -HH_INLINE Scalar<T> operator*(const Scalar<T>& left, const Scalar<T>& right) { - Scalar<T> t(left); - return t *= right; -} - -template <typename T> -HH_INLINE Scalar<T> operator/(const Scalar<T>& left, const Scalar<T>& right) { - Scalar<T> t(left); - return t /= right; -} - -template <typename T> -HH_INLINE Scalar<T> operator+(const Scalar<T>& left, const Scalar<T>& right) { - Scalar<T> t(left); - return t += right; -} - -template <typename T> -HH_INLINE Scalar<T> operator-(const Scalar<T>& left, const Scalar<T>& right) { - Scalar<T> t(left); - return t -= right; -} - -template <typename T> -HH_INLINE Scalar<T> operator&(const Scalar<T>& left, const Scalar<T>& right) { - Scalar<T> t(left); - return t &= right; -} - -template <typename T> -HH_INLINE Scalar<T> operator|(const Scalar<T> left, const Scalar<T>& right) { - Scalar<T> t(left); - return t |= right; -} - -template <typename T> -HH_INLINE Scalar<T> operator^(const Scalar<T>& left, const Scalar<T>& right) { - Scalar<T> t(left); - return t ^= right; -} - -template <typename T> -HH_INLINE Scalar<T> operator<<(const Scalar<T>& v, const int count) { - Scalar<T> t(v); - return t <<= count; -} - -template <typename T> -HH_INLINE Scalar<T> operator>>(const Scalar<T>& v, const int count) { - Scalar<T> t(v); - return t >>= count; -} - -using V1x8U = Scalar<uint8_t>; -using V1x16U = Scalar<uint16_t>; -using V1x16I = Scalar<int16_t>; -using V1x32U = Scalar<uint32_t>; -using V1x32I = Scalar<int32_t>; -using V1x64U = Scalar<uint64_t>; -using V1x32F = Scalar<float>; -using V1x64F = Scalar<double>; - -// Load/Store. - -// We differentiate between targets' vector types via template specialization. -// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may -// generate better code in unoptimized builds. Only declare the primary -// templates to avoid needing mutual exclusion with vector128/256. 
-template <class V> -HH_INLINE V Load(const typename V::T* const HH_RESTRICT from); -template <class V> -HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from); - -template <> -HH_INLINE V1x8U Load<V1x8U>(const V1x8U::T* const HH_RESTRICT from) { - return V1x8U(*from); -} -template <> -HH_INLINE V1x16U Load<V1x16U>(const V1x16U::T* const HH_RESTRICT from) { - return V1x16U(*from); -} -template <> -HH_INLINE V1x16I Load<V1x16I>(const V1x16I::T* const HH_RESTRICT from) { - return V1x16I(*from); -} -template <> -HH_INLINE V1x32U Load<V1x32U>(const V1x32U::T* const HH_RESTRICT from) { - return V1x32U(*from); -} -template <> -HH_INLINE V1x32I Load<V1x32I>(const V1x32I::T* const HH_RESTRICT from) { - return V1x32I(*from); -} -template <> -HH_INLINE V1x64U Load<V1x64U>(const V1x64U::T* const HH_RESTRICT from) { - return V1x64U(*from); -} -template <> -HH_INLINE V1x32F Load<V1x32F>(const V1x32F::T* const HH_RESTRICT from) { - return V1x32F(*from); -} -template <> -HH_INLINE V1x64F Load<V1x64F>(const V1x64F::T* const HH_RESTRICT from) { - return V1x64F(*from); -} - -template <> -HH_INLINE V1x8U LoadUnaligned<V1x8U>(const V1x8U::T* const HH_RESTRICT from) { - return V1x8U(*from); -} -template <> -HH_INLINE V1x16U -LoadUnaligned<V1x16U>(const V1x16U::T* const HH_RESTRICT from) { - return V1x16U(*from); -} -template <> -HH_INLINE V1x16I -LoadUnaligned<V1x16I>(const V1x16I::T* const HH_RESTRICT from) { - return V1x16I(*from); -} -template <> -HH_INLINE V1x32U -LoadUnaligned<V1x32U>(const V1x32U::T* const HH_RESTRICT from) { - return V1x32U(*from); -} -template <> -HH_INLINE V1x32I -LoadUnaligned<V1x32I>(const V1x32I::T* const HH_RESTRICT from) { - return V1x32I(*from); -} -template <> -HH_INLINE V1x64U -LoadUnaligned<V1x64U>(const V1x64U::T* const HH_RESTRICT from) { - return V1x64U(*from); -} -template <> -HH_INLINE V1x32F -LoadUnaligned<V1x32F>(const V1x32F::T* const HH_RESTRICT from) { - return V1x32F(*from); -} -template <> -HH_INLINE V1x64F -LoadUnaligned<V1x64F>(const V1x64F::T* const HH_RESTRICT from) { - return V1x64F(*from); -} - -template <typename T> -HH_INLINE void Store(const Scalar<T>& v, T* const HH_RESTRICT to) { - v.CopyTo(reinterpret_cast<unsigned char*>(to)); -} - -template <typename T> -HH_INLINE void StoreUnaligned(const Scalar<T>& v, T* const HH_RESTRICT to) { - v.CopyTo(reinterpret_cast<unsigned char*>(to)); -} - -template <typename T> -HH_INLINE void Stream(const Scalar<T>& v, T* const HH_RESTRICT to) { - v.CopyTo(reinterpret_cast<unsigned char*>(to)); -} - -// Miscellaneous functions. - -template <typename T> -HH_INLINE Scalar<T> RotateLeft(const Scalar<T>& v, const int count) { - constexpr size_t num_bits = sizeof(T) * 8; - return (v << count) | (v >> (num_bits - count)); -} - -template <typename T> -HH_INLINE Scalar<T> AndNot(const Scalar<T>& neg_mask, const Scalar<T>& values) { - return values & ~neg_mask; -} - -template <typename T> -HH_INLINE Scalar<T> Select(const Scalar<T>& a, const Scalar<T>& b, - const Scalar<T>& mask) { - const char* mask_bytes = reinterpret_cast<const char*>(&mask); - return (mask_bytes[sizeof(T) - 1] & 0x80) ? b : a; -} - -template <typename T> -HH_INLINE Scalar<T> Min(const Scalar<T>& v0, const Scalar<T>& v1) { - return (v0 < v1) ? v0 : v1; -} - -template <typename T> -HH_INLINE Scalar<T> Max(const Scalar<T>& v0, const Scalar<T>& v1) { - return (v0 < v1) ? v1 : v0; -} - -} // namespace HH_TARGET_NAME -} // namespace highwayhash - -#endif // HIGHWAYHASH_SCALAR_H_ +// Copyright 2017 Google Inc. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_SCALAR_H_ +#define HIGHWAYHASH_SCALAR_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include <stddef.h> // size_t +#include <stdint.h> + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" + +namespace highwayhash { +// To prevent ODR violations when including this from multiple translation +// units (TU) that are compiled with different flags, the contents must reside +// in a namespace whose name is unique to the TU. NOTE: this behavior is +// incompatible with precompiled modules and requires textual inclusion instead. +namespace HH_TARGET_NAME { + +// Single-lane "vector" type with the same interface as V128/Scalar. Allows the +// same client template to generate both SIMD and portable code. +template <typename Type> +class Scalar { + public: + struct Intrinsic { + Type t; + }; + + using T = Type; + static constexpr size_t N = 1; + + // Leaves v_ uninitialized - typically used for output parameters. + HH_INLINE Scalar() {} + + HH_INLINE explicit Scalar(const T t) : v_(t) {} + + HH_INLINE Scalar(const Scalar<T>& other) : v_(other.v_) {} + + HH_INLINE Scalar& operator=(const Scalar<T>& other) { + v_ = other.v_; + return *this; + } + + // Convert from/to intrinsics. + HH_INLINE Scalar(const Intrinsic& v) : v_(v.t) {} + HH_INLINE Scalar& operator=(const Intrinsic& v) { + v_ = v.t; + return *this; + } + HH_INLINE operator Intrinsic() const { return {v_}; } + + HH_INLINE Scalar operator==(const Scalar& other) const { + Scalar eq; + eq.FillWithByte(v_ == other.v_ ? 0xFF : 0x00); + return eq; + } + HH_INLINE Scalar operator<(const Scalar& other) const { + Scalar lt; + lt.FillWithByte(v_ < other.v_ ? 0xFF : 0x00); + return lt; + } + HH_INLINE Scalar operator>(const Scalar& other) const { + Scalar gt; + gt.FillWithByte(v_ > other.v_ ? 0xFF : 0x00); + return gt; + } + + HH_INLINE Scalar& operator*=(const Scalar& other) { + v_ *= other.v_; + return *this; + } + HH_INLINE Scalar& operator/=(const Scalar& other) { + v_ /= other.v_; + return *this; + } + HH_INLINE Scalar& operator+=(const Scalar& other) { + v_ += other.v_; + return *this; + } + HH_INLINE Scalar& operator-=(const Scalar& other) { + v_ -= other.v_; + return *this; + } + + HH_INLINE Scalar& operator&=(const Scalar& other) { + v_ &= other.v_; + return *this; + } + HH_INLINE Scalar& operator|=(const Scalar& other) { + v_ |= other.v_; + return *this; + } + HH_INLINE Scalar& operator^=(const Scalar& other) { + v_ ^= other.v_; + return *this; + } + + HH_INLINE Scalar& operator<<=(const int count) { + // In C, int64_t << 64 is undefined, but we want to match the sensible + // behavior of SSE2 (zeroing). 
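    // E.g. a Scalar<uint64_t> holding 1, shifted left by 64, yields 0 here,
    // whereas the raw expression uint64_t{1} << 64 is undefined behavior.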
+ if (count >= sizeof(T) * 8) { + v_ = 0; + } else { + v_ <<= count; + } + return *this; + } + + HH_INLINE Scalar& operator>>=(const int count) { + if (count >= sizeof(T) * 8) { + v_ = 0; + } else { + v_ >>= count; + } + return *this; + } + + // For internal use only. We need to avoid memcpy/memset because this is a + // restricted header. + void FillWithByte(const unsigned char value) { + unsigned char* bytes = reinterpret_cast<unsigned char*>(&v_); + for (size_t i = 0; i < sizeof(T); ++i) { + bytes[i] = value; + } + } + + void CopyTo(unsigned char* HH_RESTRICT to_bytes) const { + const unsigned char* from_bytes = + reinterpret_cast<const unsigned char*>(&v_); + for (size_t i = 0; i < sizeof(T); ++i) { + to_bytes[i] = from_bytes[i]; + } + } + + private: + T v_; +}; + +// Non-member operators. + +template <typename T> +HH_INLINE Scalar<T> operator*(const Scalar<T>& left, const Scalar<T>& right) { + Scalar<T> t(left); + return t *= right; +} + +template <typename T> +HH_INLINE Scalar<T> operator/(const Scalar<T>& left, const Scalar<T>& right) { + Scalar<T> t(left); + return t /= right; +} + +template <typename T> +HH_INLINE Scalar<T> operator+(const Scalar<T>& left, const Scalar<T>& right) { + Scalar<T> t(left); + return t += right; +} + +template <typename T> +HH_INLINE Scalar<T> operator-(const Scalar<T>& left, const Scalar<T>& right) { + Scalar<T> t(left); + return t -= right; +} + +template <typename T> +HH_INLINE Scalar<T> operator&(const Scalar<T>& left, const Scalar<T>& right) { + Scalar<T> t(left); + return t &= right; +} + +template <typename T> +HH_INLINE Scalar<T> operator|(const Scalar<T> left, const Scalar<T>& right) { + Scalar<T> t(left); + return t |= right; +} + +template <typename T> +HH_INLINE Scalar<T> operator^(const Scalar<T>& left, const Scalar<T>& right) { + Scalar<T> t(left); + return t ^= right; +} + +template <typename T> +HH_INLINE Scalar<T> operator<<(const Scalar<T>& v, const int count) { + Scalar<T> t(v); + return t <<= count; +} + +template <typename T> +HH_INLINE Scalar<T> operator>>(const Scalar<T>& v, const int count) { + Scalar<T> t(v); + return t >>= count; +} + +using V1x8U = Scalar<uint8_t>; +using V1x16U = Scalar<uint16_t>; +using V1x16I = Scalar<int16_t>; +using V1x32U = Scalar<uint32_t>; +using V1x32I = Scalar<int32_t>; +using V1x64U = Scalar<uint64_t>; +using V1x32F = Scalar<float>; +using V1x64F = Scalar<double>; + +// Load/Store. + +// We differentiate between targets' vector types via template specialization. +// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may +// generate better code in unoptimized builds. Only declare the primary +// templates to avoid needing mutual exclusion with vector128/256. 
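A usage sketch (hypothetical buffer):

    uint64_t buf[1] = {42};
    const V1x64U v = Load<V1x64U>(buf);  // picks the V1x64U specialization
    uint64_t out;
    Store(v, &out);                      // out == 42
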
+template <class V> +HH_INLINE V Load(const typename V::T* const HH_RESTRICT from); +template <class V> +HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from); + +template <> +HH_INLINE V1x8U Load<V1x8U>(const V1x8U::T* const HH_RESTRICT from) { + return V1x8U(*from); +} +template <> +HH_INLINE V1x16U Load<V1x16U>(const V1x16U::T* const HH_RESTRICT from) { + return V1x16U(*from); +} +template <> +HH_INLINE V1x16I Load<V1x16I>(const V1x16I::T* const HH_RESTRICT from) { + return V1x16I(*from); +} +template <> +HH_INLINE V1x32U Load<V1x32U>(const V1x32U::T* const HH_RESTRICT from) { + return V1x32U(*from); +} +template <> +HH_INLINE V1x32I Load<V1x32I>(const V1x32I::T* const HH_RESTRICT from) { + return V1x32I(*from); +} +template <> +HH_INLINE V1x64U Load<V1x64U>(const V1x64U::T* const HH_RESTRICT from) { + return V1x64U(*from); +} +template <> +HH_INLINE V1x32F Load<V1x32F>(const V1x32F::T* const HH_RESTRICT from) { + return V1x32F(*from); +} +template <> +HH_INLINE V1x64F Load<V1x64F>(const V1x64F::T* const HH_RESTRICT from) { + return V1x64F(*from); +} + +template <> +HH_INLINE V1x8U LoadUnaligned<V1x8U>(const V1x8U::T* const HH_RESTRICT from) { + return V1x8U(*from); +} +template <> +HH_INLINE V1x16U +LoadUnaligned<V1x16U>(const V1x16U::T* const HH_RESTRICT from) { + return V1x16U(*from); +} +template <> +HH_INLINE V1x16I +LoadUnaligned<V1x16I>(const V1x16I::T* const HH_RESTRICT from) { + return V1x16I(*from); +} +template <> +HH_INLINE V1x32U +LoadUnaligned<V1x32U>(const V1x32U::T* const HH_RESTRICT from) { + return V1x32U(*from); +} +template <> +HH_INLINE V1x32I +LoadUnaligned<V1x32I>(const V1x32I::T* const HH_RESTRICT from) { + return V1x32I(*from); +} +template <> +HH_INLINE V1x64U +LoadUnaligned<V1x64U>(const V1x64U::T* const HH_RESTRICT from) { + return V1x64U(*from); +} +template <> +HH_INLINE V1x32F +LoadUnaligned<V1x32F>(const V1x32F::T* const HH_RESTRICT from) { + return V1x32F(*from); +} +template <> +HH_INLINE V1x64F +LoadUnaligned<V1x64F>(const V1x64F::T* const HH_RESTRICT from) { + return V1x64F(*from); +} + +template <typename T> +HH_INLINE void Store(const Scalar<T>& v, T* const HH_RESTRICT to) { + v.CopyTo(reinterpret_cast<unsigned char*>(to)); +} + +template <typename T> +HH_INLINE void StoreUnaligned(const Scalar<T>& v, T* const HH_RESTRICT to) { + v.CopyTo(reinterpret_cast<unsigned char*>(to)); +} + +template <typename T> +HH_INLINE void Stream(const Scalar<T>& v, T* const HH_RESTRICT to) { + v.CopyTo(reinterpret_cast<unsigned char*>(to)); +} + +// Miscellaneous functions. + +template <typename T> +HH_INLINE Scalar<T> RotateLeft(const Scalar<T>& v, const int count) { + constexpr size_t num_bits = sizeof(T) * 8; + return (v << count) | (v >> (num_bits - count)); +} + +template <typename T> +HH_INLINE Scalar<T> AndNot(const Scalar<T>& neg_mask, const Scalar<T>& values) { + return values & ~neg_mask; +} + +template <typename T> +HH_INLINE Scalar<T> Select(const Scalar<T>& a, const Scalar<T>& b, + const Scalar<T>& mask) { + const char* mask_bytes = reinterpret_cast<const char*>(&mask); + return (mask_bytes[sizeof(T) - 1] & 0x80) ? b : a; +} + +template <typename T> +HH_INLINE Scalar<T> Min(const Scalar<T>& v0, const Scalar<T>& v1) { + return (v0 < v1) ? v0 : v1; +} + +template <typename T> +HH_INLINE Scalar<T> Max(const Scalar<T>& v0, const Scalar<T>& v1) { + return (v0 < v1) ? 
v1 : v0; +} + +} // namespace HH_TARGET_NAME +} // namespace highwayhash + +#endif // HIGHWAYHASH_SCALAR_H_ diff --git a/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.cc b/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.cc index 9ddeca64e6..136f2769a1 100644 --- a/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.cc +++ b/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.cc @@ -1,183 +1,183 @@ -// Copyright 2015 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "highwayhash/scalar_sip_tree_hash.h" - -#include <cstddef> -#include <cstring> // memcpy - -#include "highwayhash/compiler_specific.h" -#include "highwayhash/sip_hash.h" - -namespace highwayhash { -namespace { - -// Paper: https://www.131002.net/siphash/siphash.pdf -// SSE41 implementation: https://goo.gl/80GBSD -// Tree hash extension: http://dx.doi.org/10.4236/jis.2014.53010 - -// The hash state is updated by injecting 4x8-byte packets; -// XORing together all state vectors yields 32 bytes that are -// reduced to 64 bits via 8-byte SipHash. - -const int kNumLanes = 4; -using Lanes = HH_U64[kNumLanes]; -const int kPacketSize = sizeof(Lanes); - -template <int kUpdateRounds, int kFinalizeRounds> -class ScalarSipTreeHashState { - public: - HH_INLINE ScalarSipTreeHashState(const Lanes& keys, const int lane) { - const HH_U64 key = keys[lane] ^ (kNumLanes | lane); - v0 = 0x736f6d6570736575ull ^ key; - v1 = 0x646f72616e646f6dull ^ key; - v2 = 0x6c7967656e657261ull ^ key; - v3 = 0x7465646279746573ull ^ key; - } - - HH_INLINE void Update(const HH_U64& packet) { - v3 ^= packet; - - Compress<kUpdateRounds>(); - - v0 ^= packet; - } - - HH_INLINE HH_U64 Finalize() { - // Mix in bits to avoid leaking the key if all packets were zero. - v2 ^= 0xFF; - - Compress<kFinalizeRounds>(); - - return (v0 ^ v1) ^ (v2 ^ v3); - } - - private: - // Rotate a 64-bit value "v" left by N bits. - template <HH_U64 bits> - static HH_INLINE HH_U64 RotateLeft(const HH_U64 v) { - const HH_U64 left = v << bits; - const HH_U64 right = v >> (64 - bits); - return left | right; - } - - template <int kRounds> - HH_INLINE void Compress() { - for (int i = 0; i < kRounds; ++i) { - // ARX network: add, rotate, exclusive-or. - v0 += v1; - v2 += v3; - v1 = RotateLeft<13>(v1); - v3 = RotateLeft<16>(v3); - v1 ^= v0; - v3 ^= v2; - - v0 = RotateLeft<32>(v0); - - v2 += v1; - v0 += v3; - v1 = RotateLeft<17>(v1); - v3 = RotateLeft<21>(v3); - v1 ^= v2; - v3 ^= v0; - - v2 = RotateLeft<32>(v2); - } - } - - HH_U64 v0; - HH_U64 v1; - HH_U64 v2; - HH_U64 v3; -}; - -} // namespace - -template <size_t kUpdateRounds, size_t kFinalizeRounds> -HH_U64 ScalarSipTreeHashT(const Lanes& key, const char* bytes, - const HH_U64 size) { - // "j-lanes" tree hashing interleaves 8-byte input packets. - using State = ScalarSipTreeHashState<kUpdateRounds, kFinalizeRounds>; - State state[kNumLanes] = {State(key, 0), State(key, 1), State(key, 2), - State(key, 3)}; - - // Hash entire 32-byte packets. 
- const size_t remainder = size & (kPacketSize - 1); - const size_t truncated_size = size - remainder; - const HH_U64* packets = reinterpret_cast<const HH_U64*>(bytes); - for (size_t i = 0; i < truncated_size / kPacketSize; ++i) { - for (int lane = 0; lane < kNumLanes; ++lane) { - const HH_U64 packet = *packets++; - state[lane].Update(packet); - } - } - - // Update with final 32-byte packet. - const size_t remainder_mod4 = remainder & 3; - uint32_t packet4 = static_cast<uint32_t>(remainder << 24); - const char* final_bytes = bytes + size - remainder_mod4; - for (size_t i = 0; i < remainder_mod4; ++i) { - const uint32_t byte = static_cast<unsigned char>(final_bytes[i]); - packet4 += byte << (i * 8); - } - - char final_packet[kPacketSize] = {0}; - memcpy(final_packet, bytes + truncated_size, remainder - remainder_mod4); - memcpy(final_packet + kPacketSize - 4, &packet4, sizeof(packet4)); - packets = reinterpret_cast<const HH_U64*>(final_packet); - for (int lane = 0; lane < kNumLanes; ++lane) { - state[lane].Update(packets[lane]); - } - - // Store the resulting hashes. - uint64_t hashes[4]; - for (int lane = 0; lane < kNumLanes; ++lane) { - hashes[lane] = state[lane].Finalize(); - } - - typename SipHashStateT<kUpdateRounds, kFinalizeRounds>::Key reduce_key; - memcpy(&reduce_key, &key, sizeof(reduce_key)); - return ReduceSipTreeHash<kNumLanes, kUpdateRounds, kFinalizeRounds>( - reduce_key, hashes); -} - -HH_U64 ScalarSipTreeHash(const Lanes& key, const char* bytes, - const HH_U64 size) { - return ScalarSipTreeHashT<2, 4>(key, bytes, size); -} - -HH_U64 ScalarSipTreeHash13(const Lanes& key, const char* bytes, - const HH_U64 size) { - return ScalarSipTreeHashT<1, 3>(key, bytes, size); -} -} // namespace highwayhash - -using highwayhash::HH_U64; -using highwayhash::ScalarSipTreeHash; -using highwayhash::ScalarSipTreeHash13; -using Key = HH_U64[4]; - -extern "C" { - -HH_U64 ScalarSipTreeHashC(const HH_U64* key, const char* bytes, - const HH_U64 size) { - return ScalarSipTreeHash(*reinterpret_cast<const Key*>(key), bytes, size); -} - -HH_U64 ScalarSipTreeHash13C(const HH_U64* key, const char* bytes, - const HH_U64 size) { - return ScalarSipTreeHash13(*reinterpret_cast<const Key*>(key), bytes, size); -} - -} // extern "C" +// Copyright 2015 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "highwayhash/scalar_sip_tree_hash.h" + +#include <cstddef> +#include <cstring> // memcpy + +#include "highwayhash/compiler_specific.h" +#include "highwayhash/sip_hash.h" + +namespace highwayhash { +namespace { + +// Paper: https://www.131002.net/siphash/siphash.pdf +// SSE41 implementation: https://goo.gl/80GBSD +// Tree hash extension: http://dx.doi.org/10.4236/jis.2014.53010 + +// The hash state is updated by injecting 4x8-byte packets; +// XORing together all state vectors yields 32 bytes that are +// reduced to 64 bits via 8-byte SipHash. 
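+//
+// Sketch of the data flow for a 64-byte input (two full packets), as an
+// illustration of the j-lanes scheme implemented below:
+//   words 0..3 go to lanes 0..3 (one Update each), then words 4..7 again
+//   go to lanes 0..3; Finalize() yields four 64-bit digests, which
+//   ReduceSipTreeHash combines via an ordinary 8-byte SipHash.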
+ +const int kNumLanes = 4; +using Lanes = HH_U64[kNumLanes]; +const int kPacketSize = sizeof(Lanes); + +template <int kUpdateRounds, int kFinalizeRounds> +class ScalarSipTreeHashState { + public: + HH_INLINE ScalarSipTreeHashState(const Lanes& keys, const int lane) { + const HH_U64 key = keys[lane] ^ (kNumLanes | lane); + v0 = 0x736f6d6570736575ull ^ key; + v1 = 0x646f72616e646f6dull ^ key; + v2 = 0x6c7967656e657261ull ^ key; + v3 = 0x7465646279746573ull ^ key; + } + + HH_INLINE void Update(const HH_U64& packet) { + v3 ^= packet; + + Compress<kUpdateRounds>(); + + v0 ^= packet; + } + + HH_INLINE HH_U64 Finalize() { + // Mix in bits to avoid leaking the key if all packets were zero. + v2 ^= 0xFF; + + Compress<kFinalizeRounds>(); + + return (v0 ^ v1) ^ (v2 ^ v3); + } + + private: + // Rotate a 64-bit value "v" left by N bits. + template <HH_U64 bits> + static HH_INLINE HH_U64 RotateLeft(const HH_U64 v) { + const HH_U64 left = v << bits; + const HH_U64 right = v >> (64 - bits); + return left | right; + } + + template <int kRounds> + HH_INLINE void Compress() { + for (int i = 0; i < kRounds; ++i) { + // ARX network: add, rotate, exclusive-or. + v0 += v1; + v2 += v3; + v1 = RotateLeft<13>(v1); + v3 = RotateLeft<16>(v3); + v1 ^= v0; + v3 ^= v2; + + v0 = RotateLeft<32>(v0); + + v2 += v1; + v0 += v3; + v1 = RotateLeft<17>(v1); + v3 = RotateLeft<21>(v3); + v1 ^= v2; + v3 ^= v0; + + v2 = RotateLeft<32>(v2); + } + } + + HH_U64 v0; + HH_U64 v1; + HH_U64 v2; + HH_U64 v3; +}; + +} // namespace + +template <size_t kUpdateRounds, size_t kFinalizeRounds> +HH_U64 ScalarSipTreeHashT(const Lanes& key, const char* bytes, + const HH_U64 size) { + // "j-lanes" tree hashing interleaves 8-byte input packets. + using State = ScalarSipTreeHashState<kUpdateRounds, kFinalizeRounds>; + State state[kNumLanes] = {State(key, 0), State(key, 1), State(key, 2), + State(key, 3)}; + + // Hash entire 32-byte packets. + const size_t remainder = size & (kPacketSize - 1); + const size_t truncated_size = size - remainder; + const HH_U64* packets = reinterpret_cast<const HH_U64*>(bytes); + for (size_t i = 0; i < truncated_size / kPacketSize; ++i) { + for (int lane = 0; lane < kNumLanes; ++lane) { + const HH_U64 packet = *packets++; + state[lane].Update(packet); + } + } + + // Update with final 32-byte packet. + const size_t remainder_mod4 = remainder & 3; + uint32_t packet4 = static_cast<uint32_t>(remainder << 24); + const char* final_bytes = bytes + size - remainder_mod4; + for (size_t i = 0; i < remainder_mod4; ++i) { + const uint32_t byte = static_cast<unsigned char>(final_bytes[i]); + packet4 += byte << (i * 8); + } + + char final_packet[kPacketSize] = {0}; + memcpy(final_packet, bytes + truncated_size, remainder - remainder_mod4); + memcpy(final_packet + kPacketSize - 4, &packet4, sizeof(packet4)); + packets = reinterpret_cast<const HH_U64*>(final_packet); + for (int lane = 0; lane < kNumLanes; ++lane) { + state[lane].Update(packets[lane]); + } + + // Store the resulting hashes. 
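+  // (One 64-bit digest per lane; the reduction below hashes these 32 bytes
+  // with plain SipHash, keyed by the first 16 bytes of "key".)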
+ uint64_t hashes[4]; + for (int lane = 0; lane < kNumLanes; ++lane) { + hashes[lane] = state[lane].Finalize(); + } + + typename SipHashStateT<kUpdateRounds, kFinalizeRounds>::Key reduce_key; + memcpy(&reduce_key, &key, sizeof(reduce_key)); + return ReduceSipTreeHash<kNumLanes, kUpdateRounds, kFinalizeRounds>( + reduce_key, hashes); +} + +HH_U64 ScalarSipTreeHash(const Lanes& key, const char* bytes, + const HH_U64 size) { + return ScalarSipTreeHashT<2, 4>(key, bytes, size); +} + +HH_U64 ScalarSipTreeHash13(const Lanes& key, const char* bytes, + const HH_U64 size) { + return ScalarSipTreeHashT<1, 3>(key, bytes, size); +} +} // namespace highwayhash + +using highwayhash::HH_U64; +using highwayhash::ScalarSipTreeHash; +using highwayhash::ScalarSipTreeHash13; +using Key = HH_U64[4]; + +extern "C" { + +HH_U64 ScalarSipTreeHashC(const HH_U64* key, const char* bytes, + const HH_U64 size) { + return ScalarSipTreeHash(*reinterpret_cast<const Key*>(key), bytes, size); +} + +HH_U64 ScalarSipTreeHash13C(const HH_U64* key, const char* bytes, + const HH_U64 size) { + return ScalarSipTreeHash13(*reinterpret_cast<const Key*>(key), bytes, size); +} + +} // extern "C" diff --git a/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.h b/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.h index 2f79f3a010..f882be89d2 100644 --- a/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.h +++ b/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.h @@ -1,37 +1,37 @@ -// Copyright 2015 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_SCALAR_SIP_TREE_HASH_H_ -#define HIGHWAYHASH_SCALAR_SIP_TREE_HASH_H_ - -// Scalar (non-vector/SIMD) version for comparison purposes. - -#include "highwayhash/state_helpers.h" - -#ifdef __cplusplus -namespace highwayhash { -extern "C" { -#endif - -HH_U64 ScalarSipTreeHash(const HH_U64 (&key)[4], const char* bytes, - const HH_U64 size); -HH_U64 ScalarSipTreeHash13(const HH_U64 (&key)[4], const char* bytes, - const HH_U64 size); - -#ifdef __cplusplus -} // extern "C" -} // namespace highwayhash -#endif - -#endif // HIGHWAYHASH_SCALAR_SIP_TREE_HASH_H_ +// Copyright 2015 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_SCALAR_SIP_TREE_HASH_H_ +#define HIGHWAYHASH_SCALAR_SIP_TREE_HASH_H_ + +// Scalar (non-vector/SIMD) version for comparison purposes. 
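+//
+// Example call (hypothetical key and input, for illustration only):
+//   const HH_U64 key[4] = {1, 2, 3, 4};
+//   const HH_U64 h = ScalarSipTreeHash(key, "hello", 5);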
+ +#include "highwayhash/state_helpers.h" + +#ifdef __cplusplus +namespace highwayhash { +extern "C" { +#endif + +HH_U64 ScalarSipTreeHash(const HH_U64 (&key)[4], const char* bytes, + const HH_U64 size); +HH_U64 ScalarSipTreeHash13(const HH_U64 (&key)[4], const char* bytes, + const HH_U64 size); + +#ifdef __cplusplus +} // extern "C" +} // namespace highwayhash +#endif + +#endif // HIGHWAYHASH_SCALAR_SIP_TREE_HASH_H_ diff --git a/contrib/libs/highwayhash/highwayhash/sip_hash.cc b/contrib/libs/highwayhash/highwayhash/sip_hash.cc index 1c08533544..3d73a0bcdd 100644 --- a/contrib/libs/highwayhash/highwayhash/sip_hash.cc +++ b/contrib/libs/highwayhash/highwayhash/sip_hash.cc @@ -1,33 +1,33 @@ -// Copyright 2016 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "highwayhash/sip_hash.h" - -using highwayhash::HH_U64; -using highwayhash::SipHash; -using highwayhash::SipHash13; -using Key = highwayhash::SipHashState::Key; -using Key13 = highwayhash::SipHash13State::Key; - -extern "C" { - -HH_U64 SipHashC(const HH_U64* key, const char* bytes, const HH_U64 size) { - return SipHash(*reinterpret_cast<const Key*>(key), bytes, size); -} - -HH_U64 SipHash13C(const HH_U64* key, const char* bytes, const HH_U64 size) { - return SipHash13(*reinterpret_cast<const Key13*>(key), bytes, size); -} - -} // extern "C" +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "highwayhash/sip_hash.h" + +using highwayhash::HH_U64; +using highwayhash::SipHash; +using highwayhash::SipHash13; +using Key = highwayhash::SipHashState::Key; +using Key13 = highwayhash::SipHash13State::Key; + +extern "C" { + +HH_U64 SipHashC(const HH_U64* key, const char* bytes, const HH_U64 size) { + return SipHash(*reinterpret_cast<const Key*>(key), bytes, size); +} + +HH_U64 SipHash13C(const HH_U64* key, const char* bytes, const HH_U64 size) { + return SipHash13(*reinterpret_cast<const Key13*>(key), bytes, size); +} + +} // extern "C" diff --git a/contrib/libs/highwayhash/highwayhash/sip_hash.h b/contrib/libs/highwayhash/highwayhash/sip_hash.h index eebe3dc944..24a5cf4f22 100644 --- a/contrib/libs/highwayhash/highwayhash/sip_hash.h +++ b/contrib/libs/highwayhash/highwayhash/sip_hash.h @@ -1,171 +1,171 @@ -// Copyright 2016 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_SIP_HASH_H_ -#define HIGHWAYHASH_SIP_HASH_H_ - -// Portable but fast SipHash implementation. - -#include <cstddef> -#include <cstring> // memcpy - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" -#include "highwayhash/endianess.h" -#include "highwayhash/state_helpers.h" - -namespace highwayhash { - -// Paper: https://www.131002.net/siphash/siphash.pdf -template <int kUpdateIters, int kFinalizeIters> -class SipHashStateT { - public: - using Key = HH_U64[2]; - static const size_t kPacketSize = sizeof(HH_U64); - - explicit HH_INLINE SipHashStateT(const Key& key) { - v0 = 0x736f6d6570736575ull ^ key[0]; - v1 = 0x646f72616e646f6dull ^ key[1]; - v2 = 0x6c7967656e657261ull ^ key[0]; - v3 = 0x7465646279746573ull ^ key[1]; - } - - HH_INLINE void Update(const char* bytes) { - HH_U64 packet; - memcpy(&packet, bytes, sizeof(packet)); - packet = host_from_le64(packet); - - v3 ^= packet; - - Compress<kUpdateIters>(); - - v0 ^= packet; - } - - HH_INLINE HH_U64 Finalize() { - // Mix in bits to avoid leaking the key if all packets were zero. - v2 ^= 0xFF; - - Compress<kFinalizeIters>(); - - return (v0 ^ v1) ^ (v2 ^ v3); - } - private: - // Rotate a 64-bit value "v" left by N bits. - template <HH_U64 bits> - static HH_INLINE HH_U64 RotateLeft(const HH_U64 v) { - const HH_U64 left = v << bits; - const HH_U64 right = v >> (64 - bits); - return left | right; - } - - template <size_t rounds> - HH_INLINE void Compress() { - for (size_t i = 0; i < rounds; ++i) { - // ARX network: add, rotate, exclusive-or. - v0 += v1; - v2 += v3; - v1 = RotateLeft<13>(v1); - v3 = RotateLeft<16>(v3); - v1 ^= v0; - v3 ^= v2; - - v0 = RotateLeft<32>(v0); - - v2 += v1; - v0 += v3; - v1 = RotateLeft<17>(v1); - v3 = RotateLeft<21>(v3); - v1 ^= v2; - v3 ^= v0; - - v2 = RotateLeft<32>(v2); - } - } - - HH_U64 v0; - HH_U64 v1; - HH_U64 v2; - HH_U64 v3; -}; - -using SipHashState = SipHashStateT<2, 4>; -using SipHash13State = SipHashStateT<1, 3>; - -// Override the HighwayTreeHash padding scheme with that of SipHash so that -// the hash output matches the known-good values in sip_hash_test. -template <> -HH_INLINE void PaddedUpdate<SipHashState>(const HH_U64 size, - const char* remaining_bytes, - const HH_U64 remaining_size, - SipHashState* state) { - // Copy to avoid overrunning the input buffer. - char final_packet[SipHashState::kPacketSize] = {0}; - memcpy(final_packet, remaining_bytes, remaining_size); - final_packet[SipHashState::kPacketSize - 1] = static_cast<char>(size & 0xFF); - state->Update(final_packet); -} - -template <> -HH_INLINE void PaddedUpdate<SipHash13State>(const HH_U64 size, - const char* remaining_bytes, - const HH_U64 remaining_size, - SipHash13State* state) { - // Copy to avoid overrunning the input buffer. - char final_packet[SipHash13State::kPacketSize] = {0}; - memcpy(final_packet, remaining_bytes, remaining_size); - final_packet[SipHash13State::kPacketSize - 1] = - static_cast<char>(size & 0xFF); - state->Update(final_packet); -} - -// Fast, cryptographically strong pseudo-random function, e.g. 
for -// deterministic/idempotent 'random' number generation. See also -// README.md for information on resisting hash flooding attacks. -// -// Robust versus timing attacks because memory accesses are sequential -// and the algorithm is branch-free. Compute time is proportional to the -// number of 8-byte packets and about twice as fast as an sse41 implementation. -// -// "key" is a secret 128-bit key unknown to attackers. -// "bytes" is the data to hash; ceil(size / 8) * 8 bytes are read. -// Returns a 64-bit hash of the given data bytes, which are swapped on -// big-endian CPUs so the return value is the same as on little-endian CPUs. -static HH_INLINE HH_U64 SipHash(const SipHashState::Key& key, const char* bytes, - const HH_U64 size) { - return ComputeHash<SipHashState>(key, bytes, size); -} - -// Round-reduced SipHash version (1 update and 3 finalization rounds). -static HH_INLINE HH_U64 SipHash13(const SipHash13State::Key& key, - const char* bytes, const HH_U64 size) { - return ComputeHash<SipHash13State>(key, bytes, size); -} - -template <int kNumLanes, int kUpdateIters, int kFinalizeIters> -static HH_INLINE HH_U64 ReduceSipTreeHash( - const typename SipHashStateT<kUpdateIters, kFinalizeIters>::Key& key, - const uint64_t (&hashes)[kNumLanes]) { - SipHashStateT<kUpdateIters, kFinalizeIters> state(key); - - for (int i = 0; i < kNumLanes; ++i) { - state.Update(reinterpret_cast<const char*>(&hashes[i])); - } - - return state.Finalize(); -} - -} // namespace highwayhash - -#endif // HIGHWAYHASH_SIP_HASH_H_ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_SIP_HASH_H_ +#define HIGHWAYHASH_SIP_HASH_H_ + +// Portable but fast SipHash implementation. + +#include <cstddef> +#include <cstring> // memcpy + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" +#include "highwayhash/endianess.h" +#include "highwayhash/state_helpers.h" + +namespace highwayhash { + +// Paper: https://www.131002.net/siphash/siphash.pdf +template <int kUpdateIters, int kFinalizeIters> +class SipHashStateT { + public: + using Key = HH_U64[2]; + static const size_t kPacketSize = sizeof(HH_U64); + + explicit HH_INLINE SipHashStateT(const Key& key) { + v0 = 0x736f6d6570736575ull ^ key[0]; + v1 = 0x646f72616e646f6dull ^ key[1]; + v2 = 0x6c7967656e657261ull ^ key[0]; + v3 = 0x7465646279746573ull ^ key[1]; + } + + HH_INLINE void Update(const char* bytes) { + HH_U64 packet; + memcpy(&packet, bytes, sizeof(packet)); + packet = host_from_le64(packet); + + v3 ^= packet; + + Compress<kUpdateIters>(); + + v0 ^= packet; + } + + HH_INLINE HH_U64 Finalize() { + // Mix in bits to avoid leaking the key if all packets were zero. + v2 ^= 0xFF; + + Compress<kFinalizeIters>(); + + return (v0 ^ v1) ^ (v2 ^ v3); + } + private: + // Rotate a 64-bit value "v" left by N bits. 
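+  // Compilers usually recognize the (v << bits) | (v >> (64 - bits)) idiom
+  // and emit a single rotate instruction, e.g. ROL on x86 in optimized
+  // builds (an expectation, not a guarantee).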
+ template <HH_U64 bits> + static HH_INLINE HH_U64 RotateLeft(const HH_U64 v) { + const HH_U64 left = v << bits; + const HH_U64 right = v >> (64 - bits); + return left | right; + } + + template <size_t rounds> + HH_INLINE void Compress() { + for (size_t i = 0; i < rounds; ++i) { + // ARX network: add, rotate, exclusive-or. + v0 += v1; + v2 += v3; + v1 = RotateLeft<13>(v1); + v3 = RotateLeft<16>(v3); + v1 ^= v0; + v3 ^= v2; + + v0 = RotateLeft<32>(v0); + + v2 += v1; + v0 += v3; + v1 = RotateLeft<17>(v1); + v3 = RotateLeft<21>(v3); + v1 ^= v2; + v3 ^= v0; + + v2 = RotateLeft<32>(v2); + } + } + + HH_U64 v0; + HH_U64 v1; + HH_U64 v2; + HH_U64 v3; +}; + +using SipHashState = SipHashStateT<2, 4>; +using SipHash13State = SipHashStateT<1, 3>; + +// Override the HighwayTreeHash padding scheme with that of SipHash so that +// the hash output matches the known-good values in sip_hash_test. +template <> +HH_INLINE void PaddedUpdate<SipHashState>(const HH_U64 size, + const char* remaining_bytes, + const HH_U64 remaining_size, + SipHashState* state) { + // Copy to avoid overrunning the input buffer. + char final_packet[SipHashState::kPacketSize] = {0}; + memcpy(final_packet, remaining_bytes, remaining_size); + final_packet[SipHashState::kPacketSize - 1] = static_cast<char>(size & 0xFF); + state->Update(final_packet); +} + +template <> +HH_INLINE void PaddedUpdate<SipHash13State>(const HH_U64 size, + const char* remaining_bytes, + const HH_U64 remaining_size, + SipHash13State* state) { + // Copy to avoid overrunning the input buffer. + char final_packet[SipHash13State::kPacketSize] = {0}; + memcpy(final_packet, remaining_bytes, remaining_size); + final_packet[SipHash13State::kPacketSize - 1] = + static_cast<char>(size & 0xFF); + state->Update(final_packet); +} + +// Fast, cryptographically strong pseudo-random function, e.g. for +// deterministic/idempotent 'random' number generation. See also +// README.md for information on resisting hash flooding attacks. +// +// Robust versus timing attacks because memory accesses are sequential +// and the algorithm is branch-free. Compute time is proportional to the +// number of 8-byte packets and about twice as fast as an sse41 implementation. +// +// "key" is a secret 128-bit key unknown to attackers. +// "bytes" is the data to hash; ceil(size / 8) * 8 bytes are read. +// Returns a 64-bit hash of the given data bytes, which are swapped on +// big-endian CPUs so the return value is the same as on little-endian CPUs. +static HH_INLINE HH_U64 SipHash(const SipHashState::Key& key, const char* bytes, + const HH_U64 size) { + return ComputeHash<SipHashState>(key, bytes, size); +} + +// Round-reduced SipHash version (1 update and 3 finalization rounds). 
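+// SipHash13 trades security margin for speed and produces different digests
+// than SipHash-2-4 for the same key and input, so the two variants must not
+// be mixed within one data set.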
+static HH_INLINE HH_U64 SipHash13(const SipHash13State::Key& key, + const char* bytes, const HH_U64 size) { + return ComputeHash<SipHash13State>(key, bytes, size); +} + +template <int kNumLanes, int kUpdateIters, int kFinalizeIters> +static HH_INLINE HH_U64 ReduceSipTreeHash( + const typename SipHashStateT<kUpdateIters, kFinalizeIters>::Key& key, + const uint64_t (&hashes)[kNumLanes]) { + SipHashStateT<kUpdateIters, kFinalizeIters> state(key); + + for (int i = 0; i < kNumLanes; ++i) { + state.Update(reinterpret_cast<const char*>(&hashes[i])); + } + + return state.Finalize(); +} + +} // namespace highwayhash + +#endif // HIGHWAYHASH_SIP_HASH_H_ diff --git a/contrib/libs/highwayhash/highwayhash/sip_hash_test.cc b/contrib/libs/highwayhash/highwayhash/sip_hash_test.cc index 425dfea93c..8da79cf058 100644 --- a/contrib/libs/highwayhash/highwayhash/sip_hash_test.cc +++ b/contrib/libs/highwayhash/highwayhash/sip_hash_test.cc @@ -1,150 +1,150 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "highwayhash/sip_hash.h" - -#include <cassert> -#include <numeric> -#include <stdio.h> -#include <stdlib.h> - -#ifdef HH_GOOGLETEST -#include "base/integral_types.h" -#include "testing/base/public/benchmark.h" -#include "testing/base/public/gunit.h" -#endif -#include "highwayhash/scalar_sip_tree_hash.h" -#include "highwayhash/sip_tree_hash.h" - -namespace highwayhash { -namespace { - -void VerifySipHash() { - const int kMaxSize = 64; - char in[kMaxSize]; // empty string, 00, 00 01, ... - const HH_U64 key[2] = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL}; - - // Known-good SipHash-2-4 output from D. Bernstein. 
- const HH_U64 kSipHashOutput[64] = { - 0x726FDB47DD0E0E31, 0x74F839C593DC67FD, 0x0D6C8009D9A94F5A, - 0x85676696D7FB7E2D, 0xCF2794E0277187B7, 0x18765564CD99A68D, - 0xCBC9466E58FEE3CE, 0xAB0200F58B01D137, 0x93F5F5799A932462, - 0x9E0082DF0BA9E4B0, 0x7A5DBBC594DDB9F3, 0xF4B32F46226BADA7, - 0x751E8FBC860EE5FB, 0x14EA5627C0843D90, 0xF723CA908E7AF2EE, - 0xA129CA6149BE45E5, 0x3F2ACC7F57C29BDB, 0x699AE9F52CBE4794, - 0x4BC1B3F0968DD39C, 0xBB6DC91DA77961BD, 0xBED65CF21AA2EE98, - 0xD0F2CBB02E3B67C7, 0x93536795E3A33E88, 0xA80C038CCD5CCEC8, - 0xB8AD50C6F649AF94, 0xBCE192DE8A85B8EA, 0x17D835B85BBB15F3, - 0x2F2E6163076BCFAD, 0xDE4DAAACA71DC9A5, 0xA6A2506687956571, - 0xAD87A3535C49EF28, 0x32D892FAD841C342, 0x7127512F72F27CCE, - 0xA7F32346F95978E3, 0x12E0B01ABB051238, 0x15E034D40FA197AE, - 0x314DFFBE0815A3B4, 0x027990F029623981, 0xCADCD4E59EF40C4D, - 0x9ABFD8766A33735C, 0x0E3EA96B5304A7D0, 0xAD0C42D6FC585992, - 0x187306C89BC215A9, 0xD4A60ABCF3792B95, 0xF935451DE4F21DF2, - 0xA9538F0419755787, 0xDB9ACDDFF56CA510, 0xD06C98CD5C0975EB, - 0xE612A3CB9ECBA951, 0xC766E62CFCADAF96, 0xEE64435A9752FE72, - 0xA192D576B245165A, 0x0A8787BF8ECB74B2, 0x81B3E73D20B49B6F, - 0x7FA8220BA3B2ECEA, 0x245731C13CA42499, 0xB78DBFAF3A8D83BD, - 0xEA1AD565322A1A0B, 0x60E61C23A3795013, 0x6606D7E446282B93, - 0x6CA4ECB15C5F91E1, 0x9F626DA15C9625F3, 0xE51B38608EF25F57, - 0x958A324CEB064572}; - - for (int size = 0; size < kMaxSize; ++size) { - in[size] = static_cast<char>(size); - const HH_U64 hash = highwayhash::SipHash(key, in, size); -#ifdef HH_GOOGLETEST - EXPECT_EQ(kSipHashOutput[size], hash) << "Mismatch at length " << size; -#else - if (hash != kSipHashOutput[size]) { - printf("Mismatch at length %d\n", size); - abort(); - } -#endif - } -} - -#ifdef HH_GOOGLETEST -TEST(SipHashTest, OutputMatchesExpectations) { VerifySipHash(); } - -namespace bm { -/* Run with: - blaze run -c opt --cpu=haswell third_party/highwayhash:sip_hash_test -- \ - --benchmarks=all --benchmark_min_iters=1 --benchmark_min_time=0.25 -*/ - -// Returns a pointer to memory of at least size bytes long to be used as hashing -// input. 
-char* GetInput(size_t size) { - static constexpr size_t kMaxSize = 100 << 20; - assert(size <= kMaxSize); - static auto* res = []() { - auto* res = new char[kMaxSize]; - std::iota(res, res + kMaxSize, 0); - return res; - }(); - return res; -} - -template <class Hasher> -void BM(int iters, int size) { - StopBenchmarkTiming(); - auto* input = GetInput(size); - const HH_U64 keys[4] = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL, - 0x1716151413121110ULL, 0x1F1E1D1C1B1A1918ULL}; - Hasher hasher(keys); - StartBenchmarkTiming(); - for (int i = 0; i < iters; ++i) { - testing::DoNotOptimize(hasher(input, size)); - } - StopBenchmarkTiming(); - SetBenchmarkBytesProcessed(static_cast<int64>(iters) * size); -} - -void Args(::testing::Benchmark* bm) { - bm->DenseRange(1, 16)->Range(32, 100 << 20); -} - -#define DEFINE_HASHER(hashfn, num_keys) \ - struct hashfn##er { \ - hashfn##er(const HH_U64* k) { memcpy(keys, k, sizeof(keys)); } \ - HH_U64 operator()(const char* input, size_t size) { \ - return highwayhash::hashfn(keys, input, size); \ - } \ - HH_U64 keys[num_keys]; \ - } - -DEFINE_HASHER(SipHash, 2); -BENCHMARK(BM<SipHasher>)->Apply(Args); - -DEFINE_HASHER(ScalarSipTreeHash, 4); -BENCHMARK(BM<ScalarSipTreeHasher>)->Apply(Args); - -#ifdef __AVX2__ -DEFINE_HASHER(SipTreeHash, 4); -BENCHMARK(BM<SipTreeHasher>)->Apply(Args); -#endif - -} // namespace bm -#endif // HH_GOOGLETEST - -} // namespace -} // namespace highwayhash - -#ifndef HH_GOOGLETEST -int main(int argc, char* argv[]) { - highwayhash::VerifySipHash(); - printf("VerifySipHash succeeded.\n"); - return 0; -} -#endif +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "highwayhash/sip_hash.h" + +#include <cassert> +#include <numeric> +#include <stdio.h> +#include <stdlib.h> + +#ifdef HH_GOOGLETEST +#include "base/integral_types.h" +#include "testing/base/public/benchmark.h" +#include "testing/base/public/gunit.h" +#endif +#include "highwayhash/scalar_sip_tree_hash.h" +#include "highwayhash/sip_tree_hash.h" + +namespace highwayhash { +namespace { + +void VerifySipHash() { + const int kMaxSize = 64; + char in[kMaxSize]; // empty string, 00, 00 01, ... + const HH_U64 key[2] = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL}; + + // Known-good SipHash-2-4 output from D. Bernstein. 
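+  // (Reference vectors from the SipHash distribution: key bytes 00..0F,
+  // message bytes 00, 01, ...; entry i is the hash of the first i bytes.)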
+ const HH_U64 kSipHashOutput[64] = { + 0x726FDB47DD0E0E31, 0x74F839C593DC67FD, 0x0D6C8009D9A94F5A, + 0x85676696D7FB7E2D, 0xCF2794E0277187B7, 0x18765564CD99A68D, + 0xCBC9466E58FEE3CE, 0xAB0200F58B01D137, 0x93F5F5799A932462, + 0x9E0082DF0BA9E4B0, 0x7A5DBBC594DDB9F3, 0xF4B32F46226BADA7, + 0x751E8FBC860EE5FB, 0x14EA5627C0843D90, 0xF723CA908E7AF2EE, + 0xA129CA6149BE45E5, 0x3F2ACC7F57C29BDB, 0x699AE9F52CBE4794, + 0x4BC1B3F0968DD39C, 0xBB6DC91DA77961BD, 0xBED65CF21AA2EE98, + 0xD0F2CBB02E3B67C7, 0x93536795E3A33E88, 0xA80C038CCD5CCEC8, + 0xB8AD50C6F649AF94, 0xBCE192DE8A85B8EA, 0x17D835B85BBB15F3, + 0x2F2E6163076BCFAD, 0xDE4DAAACA71DC9A5, 0xA6A2506687956571, + 0xAD87A3535C49EF28, 0x32D892FAD841C342, 0x7127512F72F27CCE, + 0xA7F32346F95978E3, 0x12E0B01ABB051238, 0x15E034D40FA197AE, + 0x314DFFBE0815A3B4, 0x027990F029623981, 0xCADCD4E59EF40C4D, + 0x9ABFD8766A33735C, 0x0E3EA96B5304A7D0, 0xAD0C42D6FC585992, + 0x187306C89BC215A9, 0xD4A60ABCF3792B95, 0xF935451DE4F21DF2, + 0xA9538F0419755787, 0xDB9ACDDFF56CA510, 0xD06C98CD5C0975EB, + 0xE612A3CB9ECBA951, 0xC766E62CFCADAF96, 0xEE64435A9752FE72, + 0xA192D576B245165A, 0x0A8787BF8ECB74B2, 0x81B3E73D20B49B6F, + 0x7FA8220BA3B2ECEA, 0x245731C13CA42499, 0xB78DBFAF3A8D83BD, + 0xEA1AD565322A1A0B, 0x60E61C23A3795013, 0x6606D7E446282B93, + 0x6CA4ECB15C5F91E1, 0x9F626DA15C9625F3, 0xE51B38608EF25F57, + 0x958A324CEB064572}; + + for (int size = 0; size < kMaxSize; ++size) { + in[size] = static_cast<char>(size); + const HH_U64 hash = highwayhash::SipHash(key, in, size); +#ifdef HH_GOOGLETEST + EXPECT_EQ(kSipHashOutput[size], hash) << "Mismatch at length " << size; +#else + if (hash != kSipHashOutput[size]) { + printf("Mismatch at length %d\n", size); + abort(); + } +#endif + } +} + +#ifdef HH_GOOGLETEST +TEST(SipHashTest, OutputMatchesExpectations) { VerifySipHash(); } + +namespace bm { +/* Run with: + blaze run -c opt --cpu=haswell third_party/highwayhash:sip_hash_test -- \ + --benchmarks=all --benchmark_min_iters=1 --benchmark_min_time=0.25 +*/ + +// Returns a pointer to memory of at least size bytes long to be used as hashing +// input. 
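+// The 100 MiB buffer is allocated once, filled with 0, 1, 2, ... via
+// std::iota, and intentionally leaked so all benchmark runs share it.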
+char* GetInput(size_t size) { + static constexpr size_t kMaxSize = 100 << 20; + assert(size <= kMaxSize); + static auto* res = []() { + auto* res = new char[kMaxSize]; + std::iota(res, res + kMaxSize, 0); + return res; + }(); + return res; +} + +template <class Hasher> +void BM(int iters, int size) { + StopBenchmarkTiming(); + auto* input = GetInput(size); + const HH_U64 keys[4] = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL, + 0x1716151413121110ULL, 0x1F1E1D1C1B1A1918ULL}; + Hasher hasher(keys); + StartBenchmarkTiming(); + for (int i = 0; i < iters; ++i) { + testing::DoNotOptimize(hasher(input, size)); + } + StopBenchmarkTiming(); + SetBenchmarkBytesProcessed(static_cast<int64>(iters) * size); +} + +void Args(::testing::Benchmark* bm) { + bm->DenseRange(1, 16)->Range(32, 100 << 20); +} + +#define DEFINE_HASHER(hashfn, num_keys) \ + struct hashfn##er { \ + hashfn##er(const HH_U64* k) { memcpy(keys, k, sizeof(keys)); } \ + HH_U64 operator()(const char* input, size_t size) { \ + return highwayhash::hashfn(keys, input, size); \ + } \ + HH_U64 keys[num_keys]; \ + } + +DEFINE_HASHER(SipHash, 2); +BENCHMARK(BM<SipHasher>)->Apply(Args); + +DEFINE_HASHER(ScalarSipTreeHash, 4); +BENCHMARK(BM<ScalarSipTreeHasher>)->Apply(Args); + +#ifdef __AVX2__ +DEFINE_HASHER(SipTreeHash, 4); +BENCHMARK(BM<SipTreeHasher>)->Apply(Args); +#endif + +} // namespace bm +#endif // HH_GOOGLETEST + +} // namespace +} // namespace highwayhash + +#ifndef HH_GOOGLETEST +int main(int argc, char* argv[]) { + highwayhash::VerifySipHash(); + printf("VerifySipHash succeeded.\n"); + return 0; +} +#endif diff --git a/contrib/libs/highwayhash/highwayhash/sip_tree_hash.cc b/contrib/libs/highwayhash/highwayhash/sip_tree_hash.cc index 18c56d7907..2dc4ce78e4 100644 --- a/contrib/libs/highwayhash/highwayhash/sip_tree_hash.cc +++ b/contrib/libs/highwayhash/highwayhash/sip_tree_hash.cc @@ -1,227 +1,227 @@ -// Copyright 2015 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "highwayhash/sip_tree_hash.h" - -#include <cstring> // memcpy - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" -#include "highwayhash/sip_hash.h" - -#if HH_TARGET == HH_TARGET_AVX2 -#include "highwayhash/vector256.h" -namespace highwayhash { -namespace HH_TARGET_NAME { -namespace { - -// Paper: https://www.131002.net/siphash/siphash.pdf -// SSE41 implementation: https://goo.gl/80GBSD -// Tree hash extension: http://dx.doi.org/10.4236/jis.2014.53010 - -// The hash state is updated by injecting 4x8-byte packets; -// XORing together all state vectors yields 32 bytes that are -// reduced to 64 bits via 8-byte SipHash. - -const int kPacketSize = 32; -const int kNumLanes = kPacketSize / sizeof(HH_U64); - -// 32 bytes key. Parameters are hardwired to c=2, d=4 [rounds]. 
-template <int kUpdateRounds, int kFinalizeRounds> -class SipTreeHashStateT { - public: - explicit HH_INLINE SipTreeHashStateT(const HH_U64 (&keys)[kNumLanes]) { - const V4x64U init(0x7465646279746573ull, 0x6c7967656e657261ull, - 0x646f72616e646f6dull, 0x736f6d6570736575ull); - const V4x64U lanes(kNumLanes | 3, kNumLanes | 2, kNumLanes | 1, - kNumLanes | 0); - const V4x64U key = - LoadUnaligned<V4x64U>(reinterpret_cast<const uint64_t*>(keys)) ^ lanes; - v0 = V4x64U(_mm256_permute4x64_epi64(init, 0x00)) ^ key; - v1 = V4x64U(_mm256_permute4x64_epi64(init, 0x55)) ^ key; - v2 = V4x64U(_mm256_permute4x64_epi64(init, 0xAA)) ^ key; - v3 = V4x64U(_mm256_permute4x64_epi64(init, 0xFF)) ^ key; - } - - HH_INLINE void Update(const V4x64U& packet) { - v3 ^= packet; - - Compress<kUpdateRounds>(); - - v0 ^= packet; - } - - HH_INLINE V4x64U Finalize() { - // Mix in bits to avoid leaking the key if all packets were zero. - v2 ^= V4x64U(0xFF); - - Compress<kFinalizeRounds>(); - - return (v0 ^ v1) ^ (v2 ^ v3); - } - - private: - static HH_INLINE V4x64U RotateLeft16(const V4x64U& v) { - const V4x64U control(0x0D0C0B0A09080F0EULL, 0x0504030201000706ULL, - 0x0D0C0B0A09080F0EULL, 0x0504030201000706ULL); - return V4x64U(_mm256_shuffle_epi8(v, control)); - } - - // Rotates each 64-bit element of "v" left by N bits. - template <HH_U64 bits> - static HH_INLINE V4x64U RotateLeft(const V4x64U& v) { - const V4x64U left = v << bits; - const V4x64U right = v >> (64 - bits); - return left | right; - } - - static HH_INLINE V4x64U Rotate32(const V4x64U& v) { - return V4x64U(_mm256_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1))); - } - - template <int kRounds> - HH_INLINE void Compress() { - // Loop is faster than unrolling! - for (int i = 0; i < kRounds; ++i) { - // ARX network: add, rotate, exclusive-or. - v0 += v1; - v2 += v3; - v1 = RotateLeft<13>(v1); - v3 = RotateLeft16(v3); - v1 ^= v0; - v3 ^= v2; - - v0 = Rotate32(v0); - - v2 += v1; - v0 += v3; - v1 = RotateLeft<17>(v1); - v3 = RotateLeft<21>(v3); - v1 ^= v2; - v3 ^= v0; - - v2 = Rotate32(v2); - } - } - - V4x64U v0; - V4x64U v1; - V4x64U v2; - V4x64U v3; -}; - -// Returns 32-byte packet by loading the remaining 0..31 bytes, storing -// "remainder" in the upper byte, and zeroing any intervening bytes. -// "remainder" is the number of accessible/remaining bytes (size % 32). -// Loading past the end of the input risks page fault exceptions which even -// LDDQU cannot prevent. -static HH_INLINE V4x64U LoadFinalPacket32(const char* bytes, const HH_U64 size, - const HH_U64 remainder) { - // Copying into an aligned buffer incurs a store-to-load-forwarding stall. - // Instead, we use masked loads to read any remaining whole uint32_t - // without incurring page faults for the others. - const size_t remaining_32 = remainder >> 2; // 0..7 - - // mask[32*i+31] := uint32_t #i valid/accessible ? 1 : 0. - // To avoid large lookup tables, we pack uint32_t lanes into bytes, - // compute the packed mask by shifting, and then sign-extend 0xFF to - // 0xFFFFFFFF (although only the MSB needs to be set). - // remaining_32 = 0 => mask = 00000000; remaining_32 = 7 => mask = 01111111. - const HH_U64 packed_mask = 0x00FFFFFFFFFFFFFFULL >> ((7 - remaining_32) * 8); - const V4x64U mask(_mm256_cvtepi8_epi32(_mm_cvtsi64_si128(packed_mask))); - // Load 0..7 remaining (potentially unaligned) uint32_t. - const V4x64U packet28( - _mm256_maskload_epi32(reinterpret_cast<const int*>(bytes), mask)); - - // Load any remaining bytes individually and combine into a uint32_t. 
- const int remainder_mod4 = remainder & 3; - // Length padding ensures that zero-valued buffers of different lengths - // result in different hashes. - uint32_t packet4 = static_cast<uint32_t>(remainder << 24); - const char* final_bytes = bytes + (remaining_32 * 4); - for (int i = 0; i < remainder_mod4; ++i) { - const uint32_t byte = static_cast<unsigned char>(final_bytes[i]); - packet4 += byte << (i * 8); - } - - // The upper 4 bytes of packet28 are zero; replace with packet4 to - // obtain the (length-padded) 32-byte packet. - const __m256i v4 = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(packet4)); - const V4x64U packet(_mm256_blend_epi32(packet28, v4, 0x80)); - return packet; -} - -} // namespace -} // namespace HH_TARGET_NAME - -template <size_t kUpdateRounds, size_t kFinalizeRounds> -HH_U64 SipTreeHashT(const HH_U64 (&key)[4], const char* bytes, - const HH_U64 size) { - using namespace HH_TARGET_NAME; - SipTreeHashStateT<kUpdateRounds, kFinalizeRounds> state(key); - - const size_t remainder = size & (kPacketSize - 1); - const size_t truncated_size = size - remainder; - const HH_U64* packets = reinterpret_cast<const HH_U64*>(bytes); - for (size_t i = 0; i < truncated_size / sizeof(HH_U64); i += kNumLanes) { - const V4x64U packet = - LoadUnaligned<V4x64U>(reinterpret_cast<const uint64_t*>(packets) + i); - state.Update(packet); - } - - const V4x64U final_packet = - LoadFinalPacket32(bytes + truncated_size, size, remainder); - - state.Update(final_packet); - - // Faster than passing __m256i and extracting. - HH_ALIGNAS(32) uint64_t hashes[kNumLanes]; - Store(state.Finalize(), hashes); - - typename SipHashStateT<kUpdateRounds, kFinalizeRounds>::Key reduce_key; - memcpy(&reduce_key, &key, sizeof(reduce_key)); - return ReduceSipTreeHash<kNumLanes, kUpdateRounds, kFinalizeRounds>( - reduce_key, hashes); -} - -HH_U64 SipTreeHash(const HH_U64 (&key)[4], const char* bytes, - const HH_U64 size) { - return SipTreeHashT<2, 4>(key, bytes, size); -} - -HH_U64 SipTreeHash13(const HH_U64 (&key)[4], const char* bytes, - const HH_U64 size) { - return SipTreeHashT<1, 3>(key, bytes, size); -} - -} // namespace highwayhash - -using highwayhash::HH_U64; -using highwayhash::SipTreeHash; -using highwayhash::SipTreeHash13; -using Key = HH_U64[4]; - -extern "C" { - -HH_U64 SipTreeHashC(const HH_U64* key, const char* bytes, const HH_U64 size) { - return SipTreeHash(*reinterpret_cast<const Key*>(key), bytes, size); -} - -HH_U64 SipTreeHash13C(const HH_U64* key, const char* bytes, const HH_U64 size) { - return SipTreeHash13(*reinterpret_cast<const Key*>(key), bytes, size); -} - -} // extern "C" - -#endif // HH_TARGET == HH_TARGET_AVX2 +// Copyright 2015 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
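+
+// AVX-2 j-lanes tree hash; the body of this file is compiled only when
+// HH_TARGET == HH_TARGET_AVX2 (see the preprocessor guard below).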
+ +#include "highwayhash/sip_tree_hash.h" + +#include <cstring> // memcpy + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" +#include "highwayhash/sip_hash.h" + +#if HH_TARGET == HH_TARGET_AVX2 +#include "highwayhash/vector256.h" +namespace highwayhash { +namespace HH_TARGET_NAME { +namespace { + +// Paper: https://www.131002.net/siphash/siphash.pdf +// SSE41 implementation: https://goo.gl/80GBSD +// Tree hash extension: http://dx.doi.org/10.4236/jis.2014.53010 + +// The hash state is updated by injecting 4x8-byte packets; +// XORing together all state vectors yields 32 bytes that are +// reduced to 64 bits via 8-byte SipHash. + +const int kPacketSize = 32; +const int kNumLanes = kPacketSize / sizeof(HH_U64); + +// 32 bytes key. Parameters are hardwired to c=2, d=4 [rounds]. +template <int kUpdateRounds, int kFinalizeRounds> +class SipTreeHashStateT { + public: + explicit HH_INLINE SipTreeHashStateT(const HH_U64 (&keys)[kNumLanes]) { + const V4x64U init(0x7465646279746573ull, 0x6c7967656e657261ull, + 0x646f72616e646f6dull, 0x736f6d6570736575ull); + const V4x64U lanes(kNumLanes | 3, kNumLanes | 2, kNumLanes | 1, + kNumLanes | 0); + const V4x64U key = + LoadUnaligned<V4x64U>(reinterpret_cast<const uint64_t*>(keys)) ^ lanes; + v0 = V4x64U(_mm256_permute4x64_epi64(init, 0x00)) ^ key; + v1 = V4x64U(_mm256_permute4x64_epi64(init, 0x55)) ^ key; + v2 = V4x64U(_mm256_permute4x64_epi64(init, 0xAA)) ^ key; + v3 = V4x64U(_mm256_permute4x64_epi64(init, 0xFF)) ^ key; + } + + HH_INLINE void Update(const V4x64U& packet) { + v3 ^= packet; + + Compress<kUpdateRounds>(); + + v0 ^= packet; + } + + HH_INLINE V4x64U Finalize() { + // Mix in bits to avoid leaking the key if all packets were zero. + v2 ^= V4x64U(0xFF); + + Compress<kFinalizeRounds>(); + + return (v0 ^ v1) ^ (v2 ^ v3); + } + + private: + static HH_INLINE V4x64U RotateLeft16(const V4x64U& v) { + const V4x64U control(0x0D0C0B0A09080F0EULL, 0x0504030201000706ULL, + 0x0D0C0B0A09080F0EULL, 0x0504030201000706ULL); + return V4x64U(_mm256_shuffle_epi8(v, control)); + } + + // Rotates each 64-bit element of "v" left by N bits. + template <HH_U64 bits> + static HH_INLINE V4x64U RotateLeft(const V4x64U& v) { + const V4x64U left = v << bits; + const V4x64U right = v >> (64 - bits); + return left | right; + } + + static HH_INLINE V4x64U Rotate32(const V4x64U& v) { + return V4x64U(_mm256_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1))); + } + + template <int kRounds> + HH_INLINE void Compress() { + // Loop is faster than unrolling! + for (int i = 0; i < kRounds; ++i) { + // ARX network: add, rotate, exclusive-or. + v0 += v1; + v2 += v3; + v1 = RotateLeft<13>(v1); + v3 = RotateLeft16(v3); + v1 ^= v0; + v3 ^= v2; + + v0 = Rotate32(v0); + + v2 += v1; + v0 += v3; + v1 = RotateLeft<17>(v1); + v3 = RotateLeft<21>(v3); + v1 ^= v2; + v3 ^= v0; + + v2 = Rotate32(v2); + } + } + + V4x64U v0; + V4x64U v1; + V4x64U v2; + V4x64U v3; +}; + +// Returns 32-byte packet by loading the remaining 0..31 bytes, storing +// "remainder" in the upper byte, and zeroing any intervening bytes. +// "remainder" is the number of accessible/remaining bytes (size % 32). +// Loading past the end of the input risks page fault exceptions which even +// LDDQU cannot prevent. +static HH_INLINE V4x64U LoadFinalPacket32(const char* bytes, const HH_U64 size, + const HH_U64 remainder) { + // Copying into an aligned buffer incurs a store-to-load-forwarding stall. 
+ // Instead, we use masked loads to read any remaining whole uint32_t + // without incurring page faults for the others. + const size_t remaining_32 = remainder >> 2; // 0..7 + + // mask[32*i+31] := uint32_t #i valid/accessible ? 1 : 0. + // To avoid large lookup tables, we pack uint32_t lanes into bytes, + // compute the packed mask by shifting, and then sign-extend 0xFF to + // 0xFFFFFFFF (although only the MSB needs to be set). + // remaining_32 = 0 => mask = 00000000; remaining_32 = 7 => mask = 01111111. + const HH_U64 packed_mask = 0x00FFFFFFFFFFFFFFULL >> ((7 - remaining_32) * 8); + const V4x64U mask(_mm256_cvtepi8_epi32(_mm_cvtsi64_si128(packed_mask))); + // Load 0..7 remaining (potentially unaligned) uint32_t. + const V4x64U packet28( + _mm256_maskload_epi32(reinterpret_cast<const int*>(bytes), mask)); + + // Load any remaining bytes individually and combine into a uint32_t. + const int remainder_mod4 = remainder & 3; + // Length padding ensures that zero-valued buffers of different lengths + // result in different hashes. + uint32_t packet4 = static_cast<uint32_t>(remainder << 24); + const char* final_bytes = bytes + (remaining_32 * 4); + for (int i = 0; i < remainder_mod4; ++i) { + const uint32_t byte = static_cast<unsigned char>(final_bytes[i]); + packet4 += byte << (i * 8); + } + + // The upper 4 bytes of packet28 are zero; replace with packet4 to + // obtain the (length-padded) 32-byte packet. + const __m256i v4 = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(packet4)); + const V4x64U packet(_mm256_blend_epi32(packet28, v4, 0x80)); + return packet; +} + +} // namespace +} // namespace HH_TARGET_NAME + +template <size_t kUpdateRounds, size_t kFinalizeRounds> +HH_U64 SipTreeHashT(const HH_U64 (&key)[4], const char* bytes, + const HH_U64 size) { + using namespace HH_TARGET_NAME; + SipTreeHashStateT<kUpdateRounds, kFinalizeRounds> state(key); + + const size_t remainder = size & (kPacketSize - 1); + const size_t truncated_size = size - remainder; + const HH_U64* packets = reinterpret_cast<const HH_U64*>(bytes); + for (size_t i = 0; i < truncated_size / sizeof(HH_U64); i += kNumLanes) { + const V4x64U packet = + LoadUnaligned<V4x64U>(reinterpret_cast<const uint64_t*>(packets) + i); + state.Update(packet); + } + + const V4x64U final_packet = + LoadFinalPacket32(bytes + truncated_size, size, remainder); + + state.Update(final_packet); + + // Faster than passing __m256i and extracting. 
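+  // HH_ALIGNAS(32) is needed because Store() performs an aligned 256-bit
+  // store into this array (assuming the vector256.h Store specialization).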
+ HH_ALIGNAS(32) uint64_t hashes[kNumLanes]; + Store(state.Finalize(), hashes); + + typename SipHashStateT<kUpdateRounds, kFinalizeRounds>::Key reduce_key; + memcpy(&reduce_key, &key, sizeof(reduce_key)); + return ReduceSipTreeHash<kNumLanes, kUpdateRounds, kFinalizeRounds>( + reduce_key, hashes); +} + +HH_U64 SipTreeHash(const HH_U64 (&key)[4], const char* bytes, + const HH_U64 size) { + return SipTreeHashT<2, 4>(key, bytes, size); +} + +HH_U64 SipTreeHash13(const HH_U64 (&key)[4], const char* bytes, + const HH_U64 size) { + return SipTreeHashT<1, 3>(key, bytes, size); +} + +} // namespace highwayhash + +using highwayhash::HH_U64; +using highwayhash::SipTreeHash; +using highwayhash::SipTreeHash13; +using Key = HH_U64[4]; + +extern "C" { + +HH_U64 SipTreeHashC(const HH_U64* key, const char* bytes, const HH_U64 size) { + return SipTreeHash(*reinterpret_cast<const Key*>(key), bytes, size); +} + +HH_U64 SipTreeHash13C(const HH_U64* key, const char* bytes, const HH_U64 size) { + return SipTreeHash13(*reinterpret_cast<const Key*>(key), bytes, size); +} + +} // extern "C" + +#endif // HH_TARGET == HH_TARGET_AVX2 diff --git a/contrib/libs/highwayhash/highwayhash/sip_tree_hash.h b/contrib/libs/highwayhash/highwayhash/sip_tree_hash.h index ee5a42340e..788aa8025b 100644 --- a/contrib/libs/highwayhash/highwayhash/sip_tree_hash.h +++ b/contrib/libs/highwayhash/highwayhash/sip_tree_hash.h @@ -1,52 +1,52 @@ -// Copyright 2015 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_SIP_TREE_HASH_H_ -#define HIGHWAYHASH_SIP_TREE_HASH_H_ - -#include "highwayhash/state_helpers.h" - -#ifdef __cplusplus -namespace highwayhash { -extern "C" { -#endif - -// Fast, cryptographically strong pseudo-random function. Useful for: -// . hash tables holding attacker-controlled data. This function is -// immune to hash flooding DOS attacks because multi-collisions are -// infeasible to compute, provided the key remains secret. -// . deterministic/idempotent 'random' number generation, e.g. for -// choosing a subset of items based on their contents. -// -// Robust versus timing attacks because memory accesses are sequential -// and the algorithm is branch-free. Compute time is proportional to the -// number of 8-byte packets and 1.5x faster than an sse41 implementation. -// Requires an AVX-2 capable CPU. -// -// "key" is a secret 256-bit key unknown to attackers. -// "bytes" is the data to hash (possibly unaligned). -// "size" is the number of bytes to hash; exactly that many bytes are read. -// Returns a 64-bit hash of the given data bytes. -HH_U64 SipTreeHash(const HH_U64 (&key)[4], const char* bytes, - const HH_U64 size); - -HH_U64 SipTreeHash13(const HH_U64 (&key)[4], const char* bytes, - const HH_U64 size); - -#ifdef __cplusplus -} // extern "C" -} // namespace highwayhash -#endif - -#endif // HIGHWAYHASH_SIP_TREE_HASH_H_ +// Copyright 2015 Google Inc. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_SIP_TREE_HASH_H_ +#define HIGHWAYHASH_SIP_TREE_HASH_H_ + +#include "highwayhash/state_helpers.h" + +#ifdef __cplusplus +namespace highwayhash { +extern "C" { +#endif + +// Fast, cryptographically strong pseudo-random function. Useful for: +// . hash tables holding attacker-controlled data. This function is +// immune to hash flooding DOS attacks because multi-collisions are +// infeasible to compute, provided the key remains secret. +// . deterministic/idempotent 'random' number generation, e.g. for +// choosing a subset of items based on their contents. +// +// Robust versus timing attacks because memory accesses are sequential +// and the algorithm is branch-free. Compute time is proportional to the +// number of 8-byte packets and 1.5x faster than an sse41 implementation. +// Requires an AVX-2 capable CPU. +// +// "key" is a secret 256-bit key unknown to attackers. +// "bytes" is the data to hash (possibly unaligned). +// "size" is the number of bytes to hash; exactly that many bytes are read. +// Returns a 64-bit hash of the given data bytes. +HH_U64 SipTreeHash(const HH_U64 (&key)[4], const char* bytes, + const HH_U64 size); + +HH_U64 SipTreeHash13(const HH_U64 (&key)[4], const char* bytes, + const HH_U64 size); + +#ifdef __cplusplus +} // extern "C" +} // namespace highwayhash +#endif + +#endif // HIGHWAYHASH_SIP_TREE_HASH_H_ diff --git a/contrib/libs/highwayhash/highwayhash/state_helpers.h b/contrib/libs/highwayhash/highwayhash/state_helpers.h index 4dd651260f..88e31a4509 100644 --- a/contrib/libs/highwayhash/highwayhash/state_helpers.h +++ b/contrib/libs/highwayhash/highwayhash/state_helpers.h @@ -1,128 +1,128 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_STATE_H_ -#define HIGHWAYHASH_STATE_H_ - -// Helper functions to split inputs into packets and call State::Update on each. - -#include <stdint.h> -#include <cstddef> -#include <cstring> -#include <memory> - -#include "highwayhash/compiler_specific.h" - -namespace highwayhash { - -// uint64_t is unsigned long on Linux; we need 'unsigned long long' -// for interoperability with TensorFlow. -typedef unsigned long long HH_U64; // NOLINT - -// Copies the remaining bytes to a zero-padded buffer, sets the upper byte to -// size % 256 (always possible because this should only be called if the -// total size is not a multiple of the packet size) and updates hash state. 
-// -// The padding scheme is essentially from SipHash, but permuted for the -// convenience of AVX-2 masked loads. This function must use the same layout so -// that the vector and scalar HighwayTreeHash have the same result. -// -// "remaining_size" is the number of accessible/remaining bytes -// (size % kPacketSize). -// -// Primary template; the specialization for AVX-2 is faster. Intended as an -// implementation detail, do not call directly. -template <class State> -HH_INLINE void PaddedUpdate(const HH_U64 size, const char* remaining_bytes, - const HH_U64 remaining_size, State* state) { +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_STATE_H_ +#define HIGHWAYHASH_STATE_H_ + +// Helper functions to split inputs into packets and call State::Update on each. + +#include <stdint.h> +#include <cstddef> +#include <cstring> +#include <memory> + +#include "highwayhash/compiler_specific.h" + +namespace highwayhash { + +// uint64_t is unsigned long on Linux; we need 'unsigned long long' +// for interoperability with TensorFlow. +typedef unsigned long long HH_U64; // NOLINT + +// Copies the remaining bytes to a zero-padded buffer, sets the upper byte to +// size % 256 (always possible because this should only be called if the +// total size is not a multiple of the packet size) and updates hash state. +// +// The padding scheme is essentially from SipHash, but permuted for the +// convenience of AVX-2 masked loads. This function must use the same layout so +// that the vector and scalar HighwayTreeHash have the same result. +// +// "remaining_size" is the number of accessible/remaining bytes +// (size % kPacketSize). +// +// Primary template; the specialization for AVX-2 is faster. Intended as an +// implementation detail, do not call directly. +template <class State> +HH_INLINE void PaddedUpdate(const HH_U64 size, const char* remaining_bytes, + const HH_U64 remaining_size, State* state) { HH_ALIGNAS(32) char final_packet[State::kPacketSize] = {0}; - - // This layout matches the AVX-2 specialization in highway_tree_hash.h. - uint32_t packet4 = static_cast<uint32_t>(size) << 24; - - const size_t remainder_mod4 = remaining_size & 3; - if (remainder_mod4 != 0) { - const char* final_bytes = remaining_bytes + remaining_size - remainder_mod4; - packet4 += static_cast<uint32_t>(final_bytes[0]); - const int idx1 = remainder_mod4 >> 1; - const int idx2 = remainder_mod4 - 1; - packet4 += static_cast<uint32_t>(final_bytes[idx1]) << 8; - packet4 += static_cast<uint32_t>(final_bytes[idx2]) << 16; - } - - memcpy(final_packet, remaining_bytes, remaining_size - remainder_mod4); - memcpy(final_packet + State::kPacketSize - 4, &packet4, sizeof(packet4)); - - state->Update(final_packet); -} - -// Updates hash state for every whole packet, and once more for the final -// padded packet. -template <class State> -HH_INLINE void UpdateState(const char* bytes, const HH_U64 size, State* state) { - // Feed entire packets. 
- const int kPacketSize = State::kPacketSize; - static_assert((kPacketSize & (kPacketSize - 1)) == 0, "Size must be 2^i."); - const size_t remainder = size & (kPacketSize - 1); - const size_t truncated_size = size - remainder; - for (size_t i = 0; i < truncated_size; i += kPacketSize) { - state->Update(bytes + i); - } - - PaddedUpdate(size, bytes + truncated_size, remainder, state); -} - -// Convenience function for updating with the bytes of a string. -template <class String, class State> -HH_INLINE void UpdateState(const String& s, State* state) { - const char* bytes = reinterpret_cast<const char*>(s.data()); - const size_t size = s.length() * sizeof(typename String::value_type); - UpdateState(bytes, size, state); -} - -// Computes a hash of a byte array using the given hash State class. -// -// Example: const SipHashState::Key key = { 1, 2 }; char data[4]; -// ComputeHash<SipHashState>(key, data, sizeof(data)); -// -// This function avoids duplicating Update/Finalize in every call site. -// Callers wanting to combine multiple hashes should repeatedly UpdateState() -// and only call State::Finalize once. -template <class State> -HH_U64 ComputeHash(const typename State::Key& key, const char* bytes, - const HH_U64 size) { - State state(key); - UpdateState(bytes, size, &state); - return state.Finalize(); -} - -// Computes a hash of a string's bytes using the given hash State class. -// -// Example: const SipHashState::Key key = { 1, 2 }; -// StringHasher<SipHashState>()(key, std::u16string(u"abc")); -// -// A struct with nested function template enables deduction of the String type. -template <class State> -struct StringHasher { - template <class String> - HH_U64 operator()(const typename State::Key& key, const String& s) { - State state(key); - UpdateState(s, &state); - return state.Finalize(); - } -}; - -} // namespace highwayhash - -#endif // HIGHWAYHASH_STATE_H_ + + // This layout matches the AVX-2 specialization in highway_tree_hash.h. + uint32_t packet4 = static_cast<uint32_t>(size) << 24; + + const size_t remainder_mod4 = remaining_size & 3; + if (remainder_mod4 != 0) { + const char* final_bytes = remaining_bytes + remaining_size - remainder_mod4; + packet4 += static_cast<uint32_t>(final_bytes[0]); + const int idx1 = remainder_mod4 >> 1; + const int idx2 = remainder_mod4 - 1; + packet4 += static_cast<uint32_t>(final_bytes[idx1]) << 8; + packet4 += static_cast<uint32_t>(final_bytes[idx2]) << 16; + } + + memcpy(final_packet, remaining_bytes, remaining_size - remainder_mod4); + memcpy(final_packet + State::kPacketSize - 4, &packet4, sizeof(packet4)); + + state->Update(final_packet); +} + +// Updates hash state for every whole packet, and once more for the final +// padded packet. +template <class State> +HH_INLINE void UpdateState(const char* bytes, const HH_U64 size, State* state) { + // Feed entire packets. + const int kPacketSize = State::kPacketSize; + static_assert((kPacketSize & (kPacketSize - 1)) == 0, "Size must be 2^i."); + const size_t remainder = size & (kPacketSize - 1); + const size_t truncated_size = size - remainder; + for (size_t i = 0; i < truncated_size; i += kPacketSize) { + state->Update(bytes + i); + } + + PaddedUpdate(size, bytes + truncated_size, remainder, state); +} + +// Convenience function for updating with the bytes of a string. 
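+// Example (illustrative):
+//   const SipHashState::Key key = {1, 2};
+//   SipHashState state(key);
+//   UpdateState(std::string("abc"), &state);
+//   const HH_U64 hash = state.Finalize();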
+template <class String, class State> +HH_INLINE void UpdateState(const String& s, State* state) { + const char* bytes = reinterpret_cast<const char*>(s.data()); + const size_t size = s.length() * sizeof(typename String::value_type); + UpdateState(bytes, size, state); +} + +// Computes a hash of a byte array using the given hash State class. +// +// Example: const SipHashState::Key key = { 1, 2 }; char data[4]; +// ComputeHash<SipHashState>(key, data, sizeof(data)); +// +// This function avoids duplicating Update/Finalize in every call site. +// Callers wanting to combine multiple hashes should repeatedly UpdateState() +// and only call State::Finalize once. +template <class State> +HH_U64 ComputeHash(const typename State::Key& key, const char* bytes, + const HH_U64 size) { + State state(key); + UpdateState(bytes, size, &state); + return state.Finalize(); +} + +// Computes a hash of a string's bytes using the given hash State class. +// +// Example: const SipHashState::Key key = { 1, 2 }; +// StringHasher<SipHashState>()(key, std::u16string(u"abc")); +// +// A struct with nested function template enables deduction of the String type. +template <class State> +struct StringHasher { + template <class String> + HH_U64 operator()(const typename State::Key& key, const String& s) { + State state(key); + UpdateState(s, &state); + return state.Finalize(); + } +}; + +} // namespace highwayhash + +#endif // HIGHWAYHASH_STATE_H_ diff --git a/contrib/libs/highwayhash/highwayhash/tsc_timer.h b/contrib/libs/highwayhash/highwayhash/tsc_timer.h index 4a88c0f8e6..6a4b4a4bdb 100644 --- a/contrib/libs/highwayhash/highwayhash/tsc_timer.h +++ b/contrib/libs/highwayhash/highwayhash/tsc_timer.h @@ -1,204 +1,204 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_TSC_TIMER_H_ -#define HIGHWAYHASH_TSC_TIMER_H_ - -// High-resolution (~10 ns) timestamps, using fences to prevent reordering and -// ensure exactly the desired regions are measured. - -#include <stdint.h> - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" - -#if HH_ARCH_X64 && HH_MSC_VERSION -#include <emmintrin.h> // _mm_lfence -#include <intrin.h> -#endif - -namespace highwayhash { - -// Start/Stop return absolute timestamps and must be placed immediately before -// and after the region to measure. We provide separate Start/Stop functions -// because they use different fences. -// -// Background: RDTSC is not 'serializing'; earlier instructions may complete -// after it, and/or later instructions may complete before it. 'Fences' ensure -// regions' elapsed times are independent of such reordering. The only -// documented unprivileged serializing instruction is CPUID, which acts as a -// full fence (no reordering across it in either direction). Unfortunately -// the latency of CPUID varies wildly (perhaps made worse by not initializing -// its EAX input). 
Because it cannot reliably be deducted from the region's -// elapsed time, it must not be included in the region to measure (i.e. -// between the two RDTSC). -// -// The newer RDTSCP is sometimes described as serializing, but it actually -// only serves as a half-fence with release semantics. Although all -// instructions in the region will complete before the final timestamp is -// captured, subsequent instructions may leak into the region and increase the -// elapsed time. Inserting another fence after the final RDTSCP would prevent -// such reordering without affecting the measured region. -// -// Fortunately, such a fence exists. The LFENCE instruction is only documented -// to delay later loads until earlier loads are visible. However, Intel's -// reference manual says it acts as a full fence (waiting until all earlier -// instructions have completed, and delaying later instructions until it -// completes). AMD assigns the same behavior to MFENCE. -// -// We need a fence before the initial RDTSC to prevent earlier instructions -// from leaking into the region, and arguably another after RDTSC to avoid -// region instructions from completing before the timestamp is recorded. -// When surrounded by fences, the additional RDTSCP half-fence provides no -// benefit, so the initial timestamp can be recorded via RDTSC, which has -// lower overhead than RDTSCP because it does not read TSC_AUX. In summary, -// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE. -// -// Using Start+Start leads to higher variance and overhead than Stop+Stop. -// However, Stop+Stop includes an LFENCE in the region measurements, which -// adds a delay dependent on earlier loads. The combination of Start+Stop -// is faster than Start+Start and more consistent than Stop+Stop because -// the first LFENCE already delayed subsequent loads before the measured -// region. This combination seems not to have been considered in prior work: -// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c -// -// Note: performance counters can measure 'exact' instructions-retired or -// (unhalted) cycle counts. The RDPMC instruction is not serializing and also -// requires fences. Unfortunately, it is not accessible on all OSes and we -// prefer to avoid kernel-mode drivers. Performance counters are also affected -// by several under/over-count errata, so we use the TSC instead. - -// Primary templates; must use one of the specializations. -template <typename T> -inline T Start(); - -template <typename T> -inline T Stop(); - -template <> -inline uint64_t Start<uint64_t>() { - uint64_t t; -#if HH_ARCH_PPC - asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); -#elif HH_ARCH_AARCH64 - asm volatile("mrs %0, cntvct_el0" : "=r"(t)); -#elif HH_ARCH_X64 && HH_MSC_VERSION - _mm_lfence(); - HH_COMPILER_FENCE; - t = __rdtsc(); - _mm_lfence(); - HH_COMPILER_FENCE; -#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION) - asm volatile( - "lfence\n\t" - "rdtsc\n\t" - "shl $32, %%rdx\n\t" - "or %%rdx, %0\n\t" - "lfence" - : "=a"(t) - : - // "memory" avoids reordering. rdx = TSC >> 32. - // "cc" = flags modified by SHL. 
- : "rdx", "memory", "cc"); -#else -#error "Port" -#endif - return t; -} - -template <> -inline uint64_t Stop<uint64_t>() { - uint64_t t; -#if HH_ARCH_PPC - asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); -#elif HH_ARCH_AARCH64 - asm volatile("mrs %0, cntvct_el0" : "=r"(t)); -#elif HH_ARCH_X64 && HH_MSC_VERSION - HH_COMPILER_FENCE; - unsigned aux; - t = __rdtscp(&aux); - _mm_lfence(); - HH_COMPILER_FENCE; -#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION) - // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). - asm volatile( - "rdtscp\n\t" - "shl $32, %%rdx\n\t" - "or %%rdx, %0\n\t" - "lfence" - : "=a"(t) - : - // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. - // "cc" = flags modified by SHL. - : "rcx", "rdx", "memory", "cc"); -#else -#error "Port" -#endif - return t; -} - -// Returns a 32-bit timestamp with about 4 cycles less overhead than -// Start<uint64_t>. Only suitable for measuring very short regions because the -// timestamp overflows about once a second. -template <> -inline uint32_t Start<uint32_t>() { - uint32_t t; -#if HH_ARCH_X64 && HH_MSC_VERSION - _mm_lfence(); - HH_COMPILER_FENCE; - t = static_cast<uint32_t>(__rdtsc()); - _mm_lfence(); - HH_COMPILER_FENCE; -#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION) - asm volatile( - "lfence\n\t" - "rdtsc\n\t" - "lfence" - : "=a"(t) - : - // "memory" avoids reordering. rdx = TSC >> 32. - : "rdx", "memory"); -#else - t = static_cast<uint32_t>(Start<uint64_t>()); -#endif - return t; -} - -template <> -inline uint32_t Stop<uint32_t>() { - uint32_t t; -#if HH_ARCH_X64 && HH_MSC_VERSION - HH_COMPILER_FENCE; - unsigned aux; - t = static_cast<uint32_t>(__rdtscp(&aux)); - _mm_lfence(); - HH_COMPILER_FENCE; -#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION) - // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). - asm volatile( - "rdtscp\n\t" - "lfence" - : "=a"(t) - : - // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. - : "rcx", "rdx", "memory"); -#else - t = static_cast<uint32_t>(Stop<uint64_t>()); -#endif - return t; -} - -} // namespace highwayhash - -#endif // HIGHWAYHASH_TSC_TIMER_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_TSC_TIMER_H_ +#define HIGHWAYHASH_TSC_TIMER_H_ + +// High-resolution (~10 ns) timestamps, using fences to prevent reordering and +// ensure exactly the desired regions are measured. + +#include <stdint.h> + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" + +#if HH_ARCH_X64 && HH_MSC_VERSION +#include <emmintrin.h> // _mm_lfence +#include <intrin.h> +#endif + +namespace highwayhash { + +// Start/Stop return absolute timestamps and must be placed immediately before +// and after the region to measure. We provide separate Start/Stop functions +// because they use different fences. 
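+//
+// Example (illustrative):
+//   const uint64_t t0 = Start<uint64_t>();
+//   RegionToMeasure();  // hypothetical workload
+//   const uint64_t elapsed_ticks = Stop<uint64_t>() - t0;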
+// +// Background: RDTSC is not 'serializing'; earlier instructions may complete +// after it, and/or later instructions may complete before it. 'Fences' ensure +// regions' elapsed times are independent of such reordering. The only +// documented unprivileged serializing instruction is CPUID, which acts as a +// full fence (no reordering across it in either direction). Unfortunately +// the latency of CPUID varies wildly (perhaps made worse by not initializing +// its EAX input). Because it cannot reliably be deducted from the region's +// elapsed time, it must not be included in the region to measure (i.e. +// between the two RDTSC). +// +// The newer RDTSCP is sometimes described as serializing, but it actually +// only serves as a half-fence with release semantics. Although all +// instructions in the region will complete before the final timestamp is +// captured, subsequent instructions may leak into the region and increase the +// elapsed time. Inserting another fence after the final RDTSCP would prevent +// such reordering without affecting the measured region. +// +// Fortunately, such a fence exists. The LFENCE instruction is only documented +// to delay later loads until earlier loads are visible. However, Intel's +// reference manual says it acts as a full fence (waiting until all earlier +// instructions have completed, and delaying later instructions until it +// completes). AMD assigns the same behavior to MFENCE. +// +// We need a fence before the initial RDTSC to prevent earlier instructions +// from leaking into the region, and arguably another after RDTSC to avoid +// region instructions from completing before the timestamp is recorded. +// When surrounded by fences, the additional RDTSCP half-fence provides no +// benefit, so the initial timestamp can be recorded via RDTSC, which has +// lower overhead than RDTSCP because it does not read TSC_AUX. In summary, +// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE. +// +// Using Start+Start leads to higher variance and overhead than Stop+Stop. +// However, Stop+Stop includes an LFENCE in the region measurements, which +// adds a delay dependent on earlier loads. The combination of Start+Stop +// is faster than Start+Start and more consistent than Stop+Stop because +// the first LFENCE already delayed subsequent loads before the measured +// region. This combination seems not to have been considered in prior work: +// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c +// +// Note: performance counters can measure 'exact' instructions-retired or +// (unhalted) cycle counts. The RDPMC instruction is not serializing and also +// requires fences. Unfortunately, it is not accessible on all OSes and we +// prefer to avoid kernel-mode drivers. Performance counters are also affected +// by several under/over-count errata, so we use the TSC instead. + +// Primary templates; must use one of the specializations. 
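+// For example (illustrative), Start<uint64_t>() and Stop<uint32_t>() link,
+// whereas Start<int>() compiles but fails to link: only the uint32_t and
+// uint64_t specializations below are defined.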
+template <typename T> +inline T Start(); + +template <typename T> +inline T Stop(); + +template <> +inline uint64_t Start<uint64_t>() { + uint64_t t; +#if HH_ARCH_PPC + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); +#elif HH_ARCH_AARCH64 + asm volatile("mrs %0, cntvct_el0" : "=r"(t)); +#elif HH_ARCH_X64 && HH_MSC_VERSION + _mm_lfence(); + HH_COMPILER_FENCE; + t = __rdtsc(); + _mm_lfence(); + HH_COMPILER_FENCE; +#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION) + asm volatile( + "lfence\n\t" + "rdtsc\n\t" + "shl $32, %%rdx\n\t" + "or %%rdx, %0\n\t" + "lfence" + : "=a"(t) + : + // "memory" avoids reordering. rdx = TSC >> 32. + // "cc" = flags modified by SHL. + : "rdx", "memory", "cc"); +#else +#error "Port" +#endif + return t; +} + +template <> +inline uint64_t Stop<uint64_t>() { + uint64_t t; +#if HH_ARCH_PPC + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); +#elif HH_ARCH_AARCH64 + asm volatile("mrs %0, cntvct_el0" : "=r"(t)); +#elif HH_ARCH_X64 && HH_MSC_VERSION + HH_COMPILER_FENCE; + unsigned aux; + t = __rdtscp(&aux); + _mm_lfence(); + HH_COMPILER_FENCE; +#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION) + // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). + asm volatile( + "rdtscp\n\t" + "shl $32, %%rdx\n\t" + "or %%rdx, %0\n\t" + "lfence" + : "=a"(t) + : + // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. + // "cc" = flags modified by SHL. + : "rcx", "rdx", "memory", "cc"); +#else +#error "Port" +#endif + return t; +} + +// Returns a 32-bit timestamp with about 4 cycles less overhead than +// Start<uint64_t>. Only suitable for measuring very short regions because the +// timestamp overflows about once a second. +template <> +inline uint32_t Start<uint32_t>() { + uint32_t t; +#if HH_ARCH_X64 && HH_MSC_VERSION + _mm_lfence(); + HH_COMPILER_FENCE; + t = static_cast<uint32_t>(__rdtsc()); + _mm_lfence(); + HH_COMPILER_FENCE; +#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION) + asm volatile( + "lfence\n\t" + "rdtsc\n\t" + "lfence" + : "=a"(t) + : + // "memory" avoids reordering. rdx = TSC >> 32. + : "rdx", "memory"); +#else + t = static_cast<uint32_t>(Start<uint64_t>()); +#endif + return t; +} + +template <> +inline uint32_t Stop<uint32_t>() { + uint32_t t; +#if HH_ARCH_X64 && HH_MSC_VERSION + HH_COMPILER_FENCE; + unsigned aux; + t = static_cast<uint32_t>(__rdtscp(&aux)); + _mm_lfence(); + HH_COMPILER_FENCE; +#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION) + // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). + asm volatile( + "rdtscp\n\t" + "lfence" + : "=a"(t) + : + // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. + : "rcx", "rdx", "memory"); +#else + t = static_cast<uint32_t>(Stop<uint64_t>()); +#endif + return t; +} + +} // namespace highwayhash + +#endif // HIGHWAYHASH_TSC_TIMER_H_ diff --git a/contrib/libs/highwayhash/highwayhash/vector128.h b/contrib/libs/highwayhash/highwayhash/vector128.h index 53eb9f164c..24c30859cd 100644 --- a/contrib/libs/highwayhash/highwayhash/vector128.h +++ b/contrib/libs/highwayhash/highwayhash/vector128.h @@ -1,796 +1,796 @@ -// Copyright 2016 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_VECTOR128_H_ -#define HIGHWAYHASH_VECTOR128_H_ - -// Defines SIMD vector classes ("V2x64U") with overloaded arithmetic operators: -// const V2x64U masked_sum = (a + b) & m; -// This is shorter and more readable than compiler intrinsics: -// const __m128i masked_sum = _mm_and_si128(_mm_add_epi64(a, b), m); -// There is typically no runtime cost for these abstractions. -// -// The naming convention is VNxBBT where N is the number of lanes, BB the -// number of bits per lane and T is the lane type: unsigned integer (U), -// signed integer (I), or floating-point (F). - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include <stddef.h> -#include <stdint.h> - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" - -// For auto-dependency generation, we need to include all headers but not their -// contents (otherwise compilation fails because -msse4.1 is not specified). -#ifndef HH_DISABLE_TARGET_SPECIFIC - -// WARNING: smmintrin.h will also be included through immintrin.h in the AVX2 -// translation unit, which is compiled with different flags. This risks ODR -// violations, and can cause crashes when functions are not inlined and the -// linker selects the AVX2 version. Unfortunately this include cannot reside -// within a namespace due to conflicts with other system headers. We need to -// assume all the intrinsic functions (defined as static inline by Clang's -// library and as extern inline by GCC) are in fact inlined. targets.bzl -// generates a test that verifies this by detecting duplicate symbols. -#include <smmintrin.h> // SSE4.1 - -namespace highwayhash { -// To prevent ODR violations when including this from multiple translation -// units (TU) that are compiled with different flags, the contents must reside -// in a namespace whose name is unique to the TU. NOTE: this behavior is -// incompatible with precompiled modules and requires textual inclusion instead. -namespace HH_TARGET_NAME { - -// Primary template for 128-bit SSE4.1 vectors; only specializations are used. -template <typename T> -class V128 {}; - -template <> -class V128<uint8_t> { - public: - using Intrinsic = __m128i; - using T = uint8_t; - static constexpr size_t N = 16; - - // Leaves v_ uninitialized - typically used for output parameters. - HH_INLINE V128() {} - - // Broadcasts i to all lanes (usually by loading from memory). - HH_INLINE explicit V128(T i) : v_(_mm_set1_epi8(i)) {} - - // Copy from other vector. - HH_INLINE explicit V128(const V128& other) : v_(other.v_) {} - template <typename U> - HH_INLINE explicit V128(const V128<U>& other) : v_(other) {} - HH_INLINE V128& operator=(const V128& other) { - v_ = other.v_; - return *this; - } - - // Convert from/to intrinsics. 
- HH_INLINE V128(const Intrinsic& v) : v_(v) {} - HH_INLINE V128& operator=(const Intrinsic& v) { - v_ = v; - return *this; - } - HH_INLINE operator Intrinsic() const { return v_; } - - // There are no greater-than comparison instructions for unsigned T. - HH_INLINE V128 operator==(const V128& other) const { - return V128(_mm_cmpeq_epi8(v_, other.v_)); - } - - HH_INLINE V128& operator+=(const V128& other) { - v_ = _mm_add_epi8(v_, other.v_); - return *this; - } - HH_INLINE V128& operator-=(const V128& other) { - v_ = _mm_sub_epi8(v_, other.v_); - return *this; - } - - HH_INLINE V128& operator&=(const V128& other) { - v_ = _mm_and_si128(v_, other.v_); - return *this; - } - HH_INLINE V128& operator|=(const V128& other) { - v_ = _mm_or_si128(v_, other.v_); - return *this; - } - HH_INLINE V128& operator^=(const V128& other) { - v_ = _mm_xor_si128(v_, other.v_); - return *this; - } - - private: - Intrinsic v_; -}; - -template <> -class V128<uint16_t> { - public: - using Intrinsic = __m128i; - using T = uint16_t; - static constexpr size_t N = 8; - - // Leaves v_ uninitialized - typically used for output parameters. - HH_INLINE V128() {} - - // Lane 0 (p_0) is the lowest. - HH_INLINE V128(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0) - : v_(_mm_set_epi16(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {} - - // Broadcasts i to all lanes (usually by loading from memory). - HH_INLINE explicit V128(T i) : v_(_mm_set1_epi16(i)) {} - - // Copy from other vector. - HH_INLINE explicit V128(const V128& other) : v_(other.v_) {} - template <typename U> - HH_INLINE explicit V128(const V128<U>& other) : v_(other) {} - HH_INLINE V128& operator=(const V128& other) { - v_ = other.v_; - return *this; - } - - // Convert from/to intrinsics. - HH_INLINE V128(const Intrinsic& v) : v_(v) {} - HH_INLINE V128& operator=(const Intrinsic& v) { - v_ = v; - return *this; - } - HH_INLINE operator Intrinsic() const { return v_; } - - // There are no greater-than comparison instructions for unsigned T. - HH_INLINE V128 operator==(const V128& other) const { - return V128(_mm_cmpeq_epi16(v_, other.v_)); - } - - HH_INLINE V128& operator+=(const V128& other) { - v_ = _mm_add_epi16(v_, other.v_); - return *this; - } - HH_INLINE V128& operator-=(const V128& other) { - v_ = _mm_sub_epi16(v_, other.v_); - return *this; - } - - HH_INLINE V128& operator&=(const V128& other) { - v_ = _mm_and_si128(v_, other.v_); - return *this; - } - HH_INLINE V128& operator|=(const V128& other) { - v_ = _mm_or_si128(v_, other.v_); - return *this; - } - HH_INLINE V128& operator^=(const V128& other) { - v_ = _mm_xor_si128(v_, other.v_); - return *this; - } - - HH_INLINE V128& operator<<=(const int count) { - v_ = _mm_slli_epi16(v_, count); - return *this; - } - HH_INLINE V128& operator<<=(const Intrinsic& count) { - v_ = _mm_sll_epi16(v_, count); - return *this; - } - - HH_INLINE V128& operator>>=(const int count) { - v_ = _mm_srli_epi16(v_, count); - return *this; - } - HH_INLINE V128& operator>>=(const Intrinsic& count) { - v_ = _mm_srl_epi16(v_, count); - return *this; - } - - private: - Intrinsic v_; -}; - -template <> -class V128<uint32_t> { - public: - using Intrinsic = __m128i; - using T = uint32_t; - static constexpr size_t N = 4; - - // Leaves v_ uninitialized - typically used for output parameters. - HH_INLINE V128() {} - - // Lane 0 (p_0) is the lowest. - HH_INLINE V128(T p_3, T p_2, T p_1, T p_0) - : v_(_mm_set_epi32(p_3, p_2, p_1, p_0)) {} - - // Broadcasts i to all lanes (usually by loading from memory). 
- HH_INLINE explicit V128(T i) : v_(_mm_set1_epi32(i)) {} - - // Copy from other vector. - HH_INLINE explicit V128(const V128& other) : v_(other.v_) {} - template <typename U> - HH_INLINE explicit V128(const V128<U>& other) : v_(other) {} - HH_INLINE V128& operator=(const V128& other) { - v_ = other.v_; - return *this; - } - - // Convert from/to intrinsics. - HH_INLINE V128(const Intrinsic& v) : v_(v) {} - HH_INLINE V128& operator=(const Intrinsic& v) { - v_ = v; - return *this; - } - HH_INLINE operator Intrinsic() const { return v_; } - - // There are no greater-than comparison instructions for unsigned T. - HH_INLINE V128 operator==(const V128& other) const { - return V128(_mm_cmpeq_epi32(v_, other.v_)); - } - - HH_INLINE V128& operator+=(const V128& other) { - v_ = _mm_add_epi32(v_, other.v_); - return *this; - } - HH_INLINE V128& operator-=(const V128& other) { - v_ = _mm_sub_epi32(v_, other.v_); - return *this; - } - - HH_INLINE V128& operator&=(const V128& other) { - v_ = _mm_and_si128(v_, other.v_); - return *this; - } - HH_INLINE V128& operator|=(const V128& other) { - v_ = _mm_or_si128(v_, other.v_); - return *this; - } - HH_INLINE V128& operator^=(const V128& other) { - v_ = _mm_xor_si128(v_, other.v_); - return *this; - } - - HH_INLINE V128& operator<<=(const int count) { - v_ = _mm_slli_epi32(v_, count); - return *this; - } - HH_INLINE V128& operator<<=(const Intrinsic& count) { - v_ = _mm_sll_epi32(v_, count); - return *this; - } - - HH_INLINE V128& operator>>=(const int count) { - v_ = _mm_srli_epi32(v_, count); - return *this; - } - HH_INLINE V128& operator>>=(const Intrinsic& count) { - v_ = _mm_srl_epi32(v_, count); - return *this; - } - - private: - Intrinsic v_; -}; - -template <> -class V128<uint64_t> { - public: - using Intrinsic = __m128i; - using T = uint64_t; - static constexpr size_t N = 2; - - // Leaves v_ uninitialized - typically used for output parameters. - HH_INLINE V128() {} - - // Lane 0 (p_0) is the lowest. - HH_INLINE V128(T p_1, T p_0) : v_(_mm_set_epi64x(p_1, p_0)) {} - - // Broadcasts i to all lanes (usually by loading from memory). - HH_INLINE explicit V128(T i) : v_(_mm_set_epi64x(i, i)) {} - - // Copy from other vector. - HH_INLINE explicit V128(const V128& other) : v_(other.v_) {} - template <typename U> - HH_INLINE explicit V128(const V128<U>& other) : v_(other) {} - HH_INLINE V128& operator=(const V128& other) { - v_ = other.v_; - return *this; - } - - // Convert from/to intrinsics. - HH_INLINE V128(const Intrinsic& v) : v_(v) {} - HH_INLINE V128& operator=(const Intrinsic& v) { - v_ = v; - return *this; - } - HH_INLINE operator Intrinsic() const { return v_; } - - // There are no greater-than comparison instructions for unsigned T. 
- HH_INLINE V128 operator==(const V128& other) const { - return V128(_mm_cmpeq_epi64(v_, other.v_)); - } - - HH_INLINE V128& operator+=(const V128& other) { - v_ = _mm_add_epi64(v_, other.v_); - return *this; - } - HH_INLINE V128& operator-=(const V128& other) { - v_ = _mm_sub_epi64(v_, other.v_); - return *this; - } - - HH_INLINE V128& operator&=(const V128& other) { - v_ = _mm_and_si128(v_, other.v_); - return *this; - } - HH_INLINE V128& operator|=(const V128& other) { - v_ = _mm_or_si128(v_, other.v_); - return *this; - } - HH_INLINE V128& operator^=(const V128& other) { - v_ = _mm_xor_si128(v_, other.v_); - return *this; - } - - HH_INLINE V128& operator<<=(const int count) { - v_ = _mm_slli_epi64(v_, count); - return *this; - } - HH_INLINE V128& operator<<=(const Intrinsic& count) { - v_ = _mm_sll_epi64(v_, count); - return *this; - } - - HH_INLINE V128& operator>>=(const int count) { - v_ = _mm_srli_epi64(v_, count); - return *this; - } - HH_INLINE V128& operator>>=(const Intrinsic& count) { - v_ = _mm_srl_epi64(v_, count); - return *this; - } - - private: - Intrinsic v_; -}; - -template <> -class V128<float> { - public: - using Intrinsic = __m128; - using T = float; - static constexpr size_t N = 4; - - // Leaves v_ uninitialized - typically used for output parameters. - HH_INLINE V128() {} - - // Lane 0 (p_0) is the lowest. - HH_INLINE V128(T p_3, T p_2, T p_1, T p_0) - : v_(_mm_set_ps(p_3, p_2, p_1, p_0)) {} - - // Broadcasts to all lanes. - HH_INLINE explicit V128(T f) : v_(_mm_set1_ps(f)) {} - - // Copy from other vector. - HH_INLINE explicit V128(const V128& other) : v_(other.v_) {} - template <typename U> - HH_INLINE explicit V128(const V128<U>& other) : v_(other) {} - HH_INLINE V128& operator=(const V128& other) { - v_ = other.v_; - return *this; - } - - // Convert from/to intrinsics. - HH_INLINE V128(const Intrinsic& v) : v_(v) {} - HH_INLINE V128& operator=(const Intrinsic& v) { - v_ = v; - return *this; - } - HH_INLINE operator Intrinsic() const { return v_; } - - HH_INLINE V128 operator==(const V128& other) const { - return V128(_mm_cmpeq_ps(v_, other.v_)); - } - HH_INLINE V128 operator<(const V128& other) const { - return V128(_mm_cmplt_ps(v_, other.v_)); - } - HH_INLINE V128 operator>(const V128& other) const { - return V128(_mm_cmplt_ps(other.v_, v_)); - } - - HH_INLINE V128& operator*=(const V128& other) { - v_ = _mm_mul_ps(v_, other.v_); - return *this; - } - HH_INLINE V128& operator/=(const V128& other) { - v_ = _mm_div_ps(v_, other.v_); - return *this; - } - HH_INLINE V128& operator+=(const V128& other) { - v_ = _mm_add_ps(v_, other.v_); - return *this; - } - HH_INLINE V128& operator-=(const V128& other) { - v_ = _mm_sub_ps(v_, other.v_); - return *this; - } - - HH_INLINE V128& operator&=(const V128& other) { - v_ = _mm_and_ps(v_, other.v_); - return *this; - } - HH_INLINE V128& operator|=(const V128& other) { - v_ = _mm_or_ps(v_, other.v_); - return *this; - } - HH_INLINE V128& operator^=(const V128& other) { - v_ = _mm_xor_ps(v_, other.v_); - return *this; - } - - private: - Intrinsic v_; -}; - -template <> -class V128<double> { - public: - using Intrinsic = __m128d; - using T = double; - static constexpr size_t N = 2; - - // Leaves v_ uninitialized - typically used for output parameters. - HH_INLINE V128() {} - - // Lane 0 (p_0) is the lowest. - HH_INLINE V128(T p_1, T p_0) : v_(_mm_set_pd(p_1, p_0)) {} - - // Broadcasts to all lanes. - HH_INLINE explicit V128(T f) : v_(_mm_set1_pd(f)) {} - - // Copy from other vector. 
- HH_INLINE explicit V128(const V128& other) : v_(other.v_) {} - template <typename U> - HH_INLINE explicit V128(const V128<U>& other) : v_(other) {} - HH_INLINE V128& operator=(const V128& other) { - v_ = other.v_; - return *this; - } - - // Convert from/to intrinsics. - HH_INLINE V128(const Intrinsic& v) : v_(v) {} - HH_INLINE V128& operator=(const Intrinsic& v) { - v_ = v; - return *this; - } - HH_INLINE operator Intrinsic() const { return v_; } - - HH_INLINE V128 operator==(const V128& other) const { - return V128(_mm_cmpeq_pd(v_, other.v_)); - } - HH_INLINE V128 operator<(const V128& other) const { - return V128(_mm_cmplt_pd(v_, other.v_)); - } - HH_INLINE V128 operator>(const V128& other) const { - return V128(_mm_cmplt_pd(other.v_, v_)); - } - - HH_INLINE V128& operator*=(const V128& other) { - v_ = _mm_mul_pd(v_, other.v_); - return *this; - } - HH_INLINE V128& operator/=(const V128& other) { - v_ = _mm_div_pd(v_, other.v_); - return *this; - } - HH_INLINE V128& operator+=(const V128& other) { - v_ = _mm_add_pd(v_, other.v_); - return *this; - } - HH_INLINE V128& operator-=(const V128& other) { - v_ = _mm_sub_pd(v_, other.v_); - return *this; - } - - HH_INLINE V128& operator&=(const V128& other) { - v_ = _mm_and_pd(v_, other.v_); - return *this; - } - HH_INLINE V128& operator|=(const V128& other) { - v_ = _mm_or_pd(v_, other.v_); - return *this; - } - HH_INLINE V128& operator^=(const V128& other) { - v_ = _mm_xor_pd(v_, other.v_); - return *this; - } - - private: - Intrinsic v_; -}; - -// Nonmember functions for any V128 via member functions. - -template <typename T> -HH_INLINE V128<T> operator*(const V128<T>& left, const V128<T>& right) { - V128<T> t(left); - return t *= right; -} - -template <typename T> -HH_INLINE V128<T> operator/(const V128<T>& left, const V128<T>& right) { - V128<T> t(left); - return t /= right; -} - -template <typename T> -HH_INLINE V128<T> operator+(const V128<T>& left, const V128<T>& right) { - V128<T> t(left); - return t += right; -} - -template <typename T> -HH_INLINE V128<T> operator-(const V128<T>& left, const V128<T>& right) { - V128<T> t(left); - return t -= right; -} - -template <typename T> -HH_INLINE V128<T> operator&(const V128<T>& left, const V128<T>& right) { - V128<T> t(left); - return t &= right; -} - -template <typename T> -HH_INLINE V128<T> operator|(const V128<T>& left, const V128<T>& right) { - V128<T> t(left); - return t |= right; -} - -template <typename T> -HH_INLINE V128<T> operator^(const V128<T>& left, const V128<T>& right) { - V128<T> t(left); - return t ^= right; -} - -template <typename T> -HH_INLINE V128<T> operator<<(const V128<T>& v, const int count) { - V128<T> t(v); - return t <<= count; -} - -template <typename T> -HH_INLINE V128<T> operator>>(const V128<T>& v, const int count) { - V128<T> t(v); - return t >>= count; -} - -template <typename T> -HH_INLINE V128<T> operator<<(const V128<T>& v, const __m128i& count) { - V128<T> t(v); - return t <<= count; -} - -template <typename T> -HH_INLINE V128<T> operator>>(const V128<T>& v, const __m128i& count) { - V128<T> t(v); - return t >>= count; -} - -using V16x8U = V128<uint8_t>; -using V8x16U = V128<uint16_t>; -using V4x32U = V128<uint32_t>; -using V2x64U = V128<uint64_t>; -using V4x32F = V128<float>; -using V2x64F = V128<double>; - -// Load/Store for any V128. - -// We differentiate between targets' vector types via template specialization. -// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may -// generate better code in unoptimized builds. 
Only declare the primary -// templates to avoid needing mutual exclusion with vector256. - -template <class V> -HH_INLINE V Load(const typename V::T* const HH_RESTRICT from); - -template <class V> -HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from); - -// "from" must be vector-aligned. -template <> -HH_INLINE V16x8U Load<V16x8U>(const V16x8U::T* const HH_RESTRICT from) { - const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); - return V16x8U(_mm_load_si128(p)); -} -template <> -HH_INLINE V8x16U Load<V8x16U>(const V8x16U::T* const HH_RESTRICT from) { - const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); - return V8x16U(_mm_load_si128(p)); -} -template <> -HH_INLINE V4x32U Load<V4x32U>(const V4x32U::T* const HH_RESTRICT from) { - const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); - return V4x32U(_mm_load_si128(p)); -} -template <> -HH_INLINE V2x64U Load<V2x64U>(const V2x64U::T* const HH_RESTRICT from) { - const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); - return V2x64U(_mm_load_si128(p)); -} -template <> -HH_INLINE V4x32F Load<V4x32F>(const V4x32F::T* const HH_RESTRICT from) { - return V4x32F(_mm_load_ps(from)); -} -template <> -HH_INLINE V2x64F Load<V2x64F>(const V2x64F::T* const HH_RESTRICT from) { - return V2x64F(_mm_load_pd(from)); -} - -template <> -HH_INLINE V16x8U -LoadUnaligned<V16x8U>(const V16x8U::T* const HH_RESTRICT from) { - const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); - return V16x8U(_mm_loadu_si128(p)); -} -template <> -HH_INLINE V8x16U -LoadUnaligned<V8x16U>(const V8x16U::T* const HH_RESTRICT from) { - const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); - return V8x16U(_mm_loadu_si128(p)); -} -template <> -HH_INLINE V4x32U -LoadUnaligned<V4x32U>(const V4x32U::T* const HH_RESTRICT from) { - const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); - return V4x32U(_mm_loadu_si128(p)); -} -template <> -HH_INLINE V2x64U -LoadUnaligned<V2x64U>(const V2x64U::T* const HH_RESTRICT from) { - const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); - return V2x64U(_mm_loadu_si128(p)); -} -template <> -HH_INLINE V4x32F -LoadUnaligned<V4x32F>(const V4x32F::T* const HH_RESTRICT from) { - return V4x32F(_mm_loadu_ps(from)); -} -template <> -HH_INLINE V2x64F -LoadUnaligned<V2x64F>(const V2x64F::T* const HH_RESTRICT from) { - return V2x64F(_mm_loadu_pd(from)); -} - -// "to" must be vector-aligned. -template <typename T> -HH_INLINE void Store(const V128<T>& v, T* const HH_RESTRICT to) { - _mm_store_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v); -} -HH_INLINE void Store(const V128<float>& v, float* const HH_RESTRICT to) { - _mm_store_ps(to, v); -} -HH_INLINE void Store(const V128<double>& v, double* const HH_RESTRICT to) { - _mm_store_pd(to, v); -} - -template <typename T> -HH_INLINE void StoreUnaligned(const V128<T>& v, T* const HH_RESTRICT to) { - _mm_storeu_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v); -} -HH_INLINE void StoreUnaligned(const V128<float>& v, - float* const HH_RESTRICT to) { - _mm_storeu_ps(to, v); -} -HH_INLINE void StoreUnaligned(const V128<double>& v, - double* const HH_RESTRICT to) { - _mm_storeu_pd(to, v); -} - -// Writes directly to (aligned) memory, bypassing the cache. This is useful for -// data that will not be read again in the near future. 
-template <typename T> -HH_INLINE void Stream(const V128<T>& v, T* const HH_RESTRICT to) { - _mm_stream_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v); -} -HH_INLINE void Stream(const V128<float>& v, float* const HH_RESTRICT to) { - _mm_stream_ps(to, v); -} -HH_INLINE void Stream(const V128<double>& v, double* const HH_RESTRICT to) { - _mm_stream_pd(to, v); -} - -// Miscellaneous functions. - -template <typename T> -HH_INLINE V128<T> RotateLeft(const V128<T>& v, const int count) { - constexpr size_t num_bits = sizeof(T) * 8; - return (v << count) | (v >> (num_bits - count)); -} - -template <typename T> -HH_INLINE V128<T> AndNot(const V128<T>& neg_mask, const V128<T>& values) { - return V128<T>(_mm_andnot_si128(neg_mask, values)); -} -template <> -HH_INLINE V128<float> AndNot(const V128<float>& neg_mask, - const V128<float>& values) { - return V128<float>(_mm_andnot_ps(neg_mask, values)); -} -template <> -HH_INLINE V128<double> AndNot(const V128<double>& neg_mask, - const V128<double>& values) { - return V128<double>(_mm_andnot_pd(neg_mask, values)); -} - -HH_INLINE V4x32F Select(const V4x32F& a, const V4x32F& b, const V4x32F& mask) { - return V4x32F(_mm_blendv_ps(a, b, mask)); -} - -HH_INLINE V2x64F Select(const V2x64F& a, const V2x64F& b, const V2x64F& mask) { - return V2x64F(_mm_blendv_pd(a, b, mask)); -} - -// Min/Max - -HH_INLINE V16x8U Min(const V16x8U& v0, const V16x8U& v1) { - return V16x8U(_mm_min_epu8(v0, v1)); -} - -HH_INLINE V16x8U Max(const V16x8U& v0, const V16x8U& v1) { - return V16x8U(_mm_max_epu8(v0, v1)); -} - -HH_INLINE V8x16U Min(const V8x16U& v0, const V8x16U& v1) { - return V8x16U(_mm_min_epu16(v0, v1)); -} - -HH_INLINE V8x16U Max(const V8x16U& v0, const V8x16U& v1) { - return V8x16U(_mm_max_epu16(v0, v1)); -} - -HH_INLINE V4x32U Min(const V4x32U& v0, const V4x32U& v1) { - return V4x32U(_mm_min_epu32(v0, v1)); -} - -HH_INLINE V4x32U Max(const V4x32U& v0, const V4x32U& v1) { - return V4x32U(_mm_max_epu32(v0, v1)); -} - -HH_INLINE V4x32F Min(const V4x32F& v0, const V4x32F& v1) { - return V4x32F(_mm_min_ps(v0, v1)); -} - -HH_INLINE V4x32F Max(const V4x32F& v0, const V4x32F& v1) { - return V4x32F(_mm_max_ps(v0, v1)); -} - -HH_INLINE V2x64F Min(const V2x64F& v0, const V2x64F& v1) { - return V2x64F(_mm_min_pd(v0, v1)); -} - -HH_INLINE V2x64F Max(const V2x64F& v0, const V2x64F& v1) { - return V2x64F(_mm_max_pd(v0, v1)); -} - -} // namespace HH_TARGET_NAME -} // namespace highwayhash - -#endif // HH_DISABLE_TARGET_SPECIFIC -#endif // HIGHWAYHASH_VECTOR128_H_ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_VECTOR128_H_ +#define HIGHWAYHASH_VECTOR128_H_ + +// Defines SIMD vector classes ("V2x64U") with overloaded arithmetic operators: +// const V2x64U masked_sum = (a + b) & m; +// This is shorter and more readable than compiler intrinsics: +// const __m128i masked_sum = _mm_and_si128(_mm_add_epi64(a, b), m); +// There is typically no runtime cost for these abstractions. 
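+//
+// Example (illustrative):
+//   const V2x64U a(2, 1), b(4, 3);  // lane 0 is the last constructor argument
+//   HH_ALIGNAS(16) uint64_t lanes[2];
+//   Store(a + b, lanes);            // lanes[0] = 4, lanes[1] = 6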
+// +// The naming convention is VNxBBT where N is the number of lanes, BB the +// number of bits per lane and T is the lane type: unsigned integer (U), +// signed integer (I), or floating-point (F). + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include <stddef.h> +#include <stdint.h> + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" + +// For auto-dependency generation, we need to include all headers but not their +// contents (otherwise compilation fails because -msse4.1 is not specified). +#ifndef HH_DISABLE_TARGET_SPECIFIC + +// WARNING: smmintrin.h will also be included through immintrin.h in the AVX2 +// translation unit, which is compiled with different flags. This risks ODR +// violations, and can cause crashes when functions are not inlined and the +// linker selects the AVX2 version. Unfortunately this include cannot reside +// within a namespace due to conflicts with other system headers. We need to +// assume all the intrinsic functions (defined as static inline by Clang's +// library and as extern inline by GCC) are in fact inlined. targets.bzl +// generates a test that verifies this by detecting duplicate symbols. +#include <smmintrin.h> // SSE4.1 + +namespace highwayhash { +// To prevent ODR violations when including this from multiple translation +// units (TU) that are compiled with different flags, the contents must reside +// in a namespace whose name is unique to the TU. NOTE: this behavior is +// incompatible with precompiled modules and requires textual inclusion instead. +namespace HH_TARGET_NAME { + +// Primary template for 128-bit SSE4.1 vectors; only specializations are used. +template <typename T> +class V128 {}; + +template <> +class V128<uint8_t> { + public: + using Intrinsic = __m128i; + using T = uint8_t; + static constexpr size_t N = 16; + + // Leaves v_ uninitialized - typically used for output parameters. + HH_INLINE V128() {} + + // Broadcasts i to all lanes (usually by loading from memory). + HH_INLINE explicit V128(T i) : v_(_mm_set1_epi8(i)) {} + + // Copy from other vector. + HH_INLINE explicit V128(const V128& other) : v_(other.v_) {} + template <typename U> + HH_INLINE explicit V128(const V128<U>& other) : v_(other) {} + HH_INLINE V128& operator=(const V128& other) { + v_ = other.v_; + return *this; + } + + // Convert from/to intrinsics. + HH_INLINE V128(const Intrinsic& v) : v_(v) {} + HH_INLINE V128& operator=(const Intrinsic& v) { + v_ = v; + return *this; + } + HH_INLINE operator Intrinsic() const { return v_; } + + // There are no greater-than comparison instructions for unsigned T. 
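+  // (Note: operator== returns a lane-wise mask, all-ones in equal byte
+  // lanes and zero elsewhere, rather than a bool.)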
+ HH_INLINE V128 operator==(const V128& other) const { + return V128(_mm_cmpeq_epi8(v_, other.v_)); + } + + HH_INLINE V128& operator+=(const V128& other) { + v_ = _mm_add_epi8(v_, other.v_); + return *this; + } + HH_INLINE V128& operator-=(const V128& other) { + v_ = _mm_sub_epi8(v_, other.v_); + return *this; + } + + HH_INLINE V128& operator&=(const V128& other) { + v_ = _mm_and_si128(v_, other.v_); + return *this; + } + HH_INLINE V128& operator|=(const V128& other) { + v_ = _mm_or_si128(v_, other.v_); + return *this; + } + HH_INLINE V128& operator^=(const V128& other) { + v_ = _mm_xor_si128(v_, other.v_); + return *this; + } + + private: + Intrinsic v_; +}; + +template <> +class V128<uint16_t> { + public: + using Intrinsic = __m128i; + using T = uint16_t; + static constexpr size_t N = 8; + + // Leaves v_ uninitialized - typically used for output parameters. + HH_INLINE V128() {} + + // Lane 0 (p_0) is the lowest. + HH_INLINE V128(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0) + : v_(_mm_set_epi16(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {} + + // Broadcasts i to all lanes (usually by loading from memory). + HH_INLINE explicit V128(T i) : v_(_mm_set1_epi16(i)) {} + + // Copy from other vector. + HH_INLINE explicit V128(const V128& other) : v_(other.v_) {} + template <typename U> + HH_INLINE explicit V128(const V128<U>& other) : v_(other) {} + HH_INLINE V128& operator=(const V128& other) { + v_ = other.v_; + return *this; + } + + // Convert from/to intrinsics. + HH_INLINE V128(const Intrinsic& v) : v_(v) {} + HH_INLINE V128& operator=(const Intrinsic& v) { + v_ = v; + return *this; + } + HH_INLINE operator Intrinsic() const { return v_; } + + // There are no greater-than comparison instructions for unsigned T. + HH_INLINE V128 operator==(const V128& other) const { + return V128(_mm_cmpeq_epi16(v_, other.v_)); + } + + HH_INLINE V128& operator+=(const V128& other) { + v_ = _mm_add_epi16(v_, other.v_); + return *this; + } + HH_INLINE V128& operator-=(const V128& other) { + v_ = _mm_sub_epi16(v_, other.v_); + return *this; + } + + HH_INLINE V128& operator&=(const V128& other) { + v_ = _mm_and_si128(v_, other.v_); + return *this; + } + HH_INLINE V128& operator|=(const V128& other) { + v_ = _mm_or_si128(v_, other.v_); + return *this; + } + HH_INLINE V128& operator^=(const V128& other) { + v_ = _mm_xor_si128(v_, other.v_); + return *this; + } + + HH_INLINE V128& operator<<=(const int count) { + v_ = _mm_slli_epi16(v_, count); + return *this; + } + HH_INLINE V128& operator<<=(const Intrinsic& count) { + v_ = _mm_sll_epi16(v_, count); + return *this; + } + + HH_INLINE V128& operator>>=(const int count) { + v_ = _mm_srli_epi16(v_, count); + return *this; + } + HH_INLINE V128& operator>>=(const Intrinsic& count) { + v_ = _mm_srl_epi16(v_, count); + return *this; + } + + private: + Intrinsic v_; +}; + +template <> +class V128<uint32_t> { + public: + using Intrinsic = __m128i; + using T = uint32_t; + static constexpr size_t N = 4; + + // Leaves v_ uninitialized - typically used for output parameters. + HH_INLINE V128() {} + + // Lane 0 (p_0) is the lowest. + HH_INLINE V128(T p_3, T p_2, T p_1, T p_0) + : v_(_mm_set_epi32(p_3, p_2, p_1, p_0)) {} + + // Broadcasts i to all lanes (usually by loading from memory). + HH_INLINE explicit V128(T i) : v_(_mm_set1_epi32(i)) {} + + // Copy from other vector. 
+ HH_INLINE explicit V128(const V128& other) : v_(other.v_) {} + template <typename U> + HH_INLINE explicit V128(const V128<U>& other) : v_(other) {} + HH_INLINE V128& operator=(const V128& other) { + v_ = other.v_; + return *this; + } + + // Convert from/to intrinsics. + HH_INLINE V128(const Intrinsic& v) : v_(v) {} + HH_INLINE V128& operator=(const Intrinsic& v) { + v_ = v; + return *this; + } + HH_INLINE operator Intrinsic() const { return v_; } + + // There are no greater-than comparison instructions for unsigned T. + HH_INLINE V128 operator==(const V128& other) const { + return V128(_mm_cmpeq_epi32(v_, other.v_)); + } + + HH_INLINE V128& operator+=(const V128& other) { + v_ = _mm_add_epi32(v_, other.v_); + return *this; + } + HH_INLINE V128& operator-=(const V128& other) { + v_ = _mm_sub_epi32(v_, other.v_); + return *this; + } + + HH_INLINE V128& operator&=(const V128& other) { + v_ = _mm_and_si128(v_, other.v_); + return *this; + } + HH_INLINE V128& operator|=(const V128& other) { + v_ = _mm_or_si128(v_, other.v_); + return *this; + } + HH_INLINE V128& operator^=(const V128& other) { + v_ = _mm_xor_si128(v_, other.v_); + return *this; + } + + HH_INLINE V128& operator<<=(const int count) { + v_ = _mm_slli_epi32(v_, count); + return *this; + } + HH_INLINE V128& operator<<=(const Intrinsic& count) { + v_ = _mm_sll_epi32(v_, count); + return *this; + } + + HH_INLINE V128& operator>>=(const int count) { + v_ = _mm_srli_epi32(v_, count); + return *this; + } + HH_INLINE V128& operator>>=(const Intrinsic& count) { + v_ = _mm_srl_epi32(v_, count); + return *this; + } + + private: + Intrinsic v_; +}; + +template <> +class V128<uint64_t> { + public: + using Intrinsic = __m128i; + using T = uint64_t; + static constexpr size_t N = 2; + + // Leaves v_ uninitialized - typically used for output parameters. + HH_INLINE V128() {} + + // Lane 0 (p_0) is the lowest. + HH_INLINE V128(T p_1, T p_0) : v_(_mm_set_epi64x(p_1, p_0)) {} + + // Broadcasts i to all lanes (usually by loading from memory). + HH_INLINE explicit V128(T i) : v_(_mm_set_epi64x(i, i)) {} + + // Copy from other vector. + HH_INLINE explicit V128(const V128& other) : v_(other.v_) {} + template <typename U> + HH_INLINE explicit V128(const V128<U>& other) : v_(other) {} + HH_INLINE V128& operator=(const V128& other) { + v_ = other.v_; + return *this; + } + + // Convert from/to intrinsics. + HH_INLINE V128(const Intrinsic& v) : v_(v) {} + HH_INLINE V128& operator=(const Intrinsic& v) { + v_ = v; + return *this; + } + HH_INLINE operator Intrinsic() const { return v_; } + + // There are no greater-than comparison instructions for unsigned T. 
+ HH_INLINE V128 operator==(const V128& other) const { + return V128(_mm_cmpeq_epi64(v_, other.v_)); + } + + HH_INLINE V128& operator+=(const V128& other) { + v_ = _mm_add_epi64(v_, other.v_); + return *this; + } + HH_INLINE V128& operator-=(const V128& other) { + v_ = _mm_sub_epi64(v_, other.v_); + return *this; + } + + HH_INLINE V128& operator&=(const V128& other) { + v_ = _mm_and_si128(v_, other.v_); + return *this; + } + HH_INLINE V128& operator|=(const V128& other) { + v_ = _mm_or_si128(v_, other.v_); + return *this; + } + HH_INLINE V128& operator^=(const V128& other) { + v_ = _mm_xor_si128(v_, other.v_); + return *this; + } + + HH_INLINE V128& operator<<=(const int count) { + v_ = _mm_slli_epi64(v_, count); + return *this; + } + HH_INLINE V128& operator<<=(const Intrinsic& count) { + v_ = _mm_sll_epi64(v_, count); + return *this; + } + + HH_INLINE V128& operator>>=(const int count) { + v_ = _mm_srli_epi64(v_, count); + return *this; + } + HH_INLINE V128& operator>>=(const Intrinsic& count) { + v_ = _mm_srl_epi64(v_, count); + return *this; + } + + private: + Intrinsic v_; +}; + +template <> +class V128<float> { + public: + using Intrinsic = __m128; + using T = float; + static constexpr size_t N = 4; + + // Leaves v_ uninitialized - typically used for output parameters. + HH_INLINE V128() {} + + // Lane 0 (p_0) is the lowest. + HH_INLINE V128(T p_3, T p_2, T p_1, T p_0) + : v_(_mm_set_ps(p_3, p_2, p_1, p_0)) {} + + // Broadcasts to all lanes. + HH_INLINE explicit V128(T f) : v_(_mm_set1_ps(f)) {} + + // Copy from other vector. + HH_INLINE explicit V128(const V128& other) : v_(other.v_) {} + template <typename U> + HH_INLINE explicit V128(const V128<U>& other) : v_(other) {} + HH_INLINE V128& operator=(const V128& other) { + v_ = other.v_; + return *this; + } + + // Convert from/to intrinsics. + HH_INLINE V128(const Intrinsic& v) : v_(v) {} + HH_INLINE V128& operator=(const Intrinsic& v) { + v_ = v; + return *this; + } + HH_INLINE operator Intrinsic() const { return v_; } + + HH_INLINE V128 operator==(const V128& other) const { + return V128(_mm_cmpeq_ps(v_, other.v_)); + } + HH_INLINE V128 operator<(const V128& other) const { + return V128(_mm_cmplt_ps(v_, other.v_)); + } + HH_INLINE V128 operator>(const V128& other) const { + return V128(_mm_cmplt_ps(other.v_, v_)); + } + + HH_INLINE V128& operator*=(const V128& other) { + v_ = _mm_mul_ps(v_, other.v_); + return *this; + } + HH_INLINE V128& operator/=(const V128& other) { + v_ = _mm_div_ps(v_, other.v_); + return *this; + } + HH_INLINE V128& operator+=(const V128& other) { + v_ = _mm_add_ps(v_, other.v_); + return *this; + } + HH_INLINE V128& operator-=(const V128& other) { + v_ = _mm_sub_ps(v_, other.v_); + return *this; + } + + HH_INLINE V128& operator&=(const V128& other) { + v_ = _mm_and_ps(v_, other.v_); + return *this; + } + HH_INLINE V128& operator|=(const V128& other) { + v_ = _mm_or_ps(v_, other.v_); + return *this; + } + HH_INLINE V128& operator^=(const V128& other) { + v_ = _mm_xor_ps(v_, other.v_); + return *this; + } + + private: + Intrinsic v_; +}; + +template <> +class V128<double> { + public: + using Intrinsic = __m128d; + using T = double; + static constexpr size_t N = 2; + + // Leaves v_ uninitialized - typically used for output parameters. + HH_INLINE V128() {} + + // Lane 0 (p_0) is the lowest. + HH_INLINE V128(T p_1, T p_0) : v_(_mm_set_pd(p_1, p_0)) {} + + // Broadcasts to all lanes. + HH_INLINE explicit V128(T f) : v_(_mm_set1_pd(f)) {} + + // Copy from other vector. 
+ HH_INLINE explicit V128(const V128& other) : v_(other.v_) {} + template <typename U> + HH_INLINE explicit V128(const V128<U>& other) : v_(other) {} + HH_INLINE V128& operator=(const V128& other) { + v_ = other.v_; + return *this; + } + + // Convert from/to intrinsics. + HH_INLINE V128(const Intrinsic& v) : v_(v) {} + HH_INLINE V128& operator=(const Intrinsic& v) { + v_ = v; + return *this; + } + HH_INLINE operator Intrinsic() const { return v_; } + + HH_INLINE V128 operator==(const V128& other) const { + return V128(_mm_cmpeq_pd(v_, other.v_)); + } + HH_INLINE V128 operator<(const V128& other) const { + return V128(_mm_cmplt_pd(v_, other.v_)); + } + HH_INLINE V128 operator>(const V128& other) const { + return V128(_mm_cmplt_pd(other.v_, v_)); + } + + HH_INLINE V128& operator*=(const V128& other) { + v_ = _mm_mul_pd(v_, other.v_); + return *this; + } + HH_INLINE V128& operator/=(const V128& other) { + v_ = _mm_div_pd(v_, other.v_); + return *this; + } + HH_INLINE V128& operator+=(const V128& other) { + v_ = _mm_add_pd(v_, other.v_); + return *this; + } + HH_INLINE V128& operator-=(const V128& other) { + v_ = _mm_sub_pd(v_, other.v_); + return *this; + } + + HH_INLINE V128& operator&=(const V128& other) { + v_ = _mm_and_pd(v_, other.v_); + return *this; + } + HH_INLINE V128& operator|=(const V128& other) { + v_ = _mm_or_pd(v_, other.v_); + return *this; + } + HH_INLINE V128& operator^=(const V128& other) { + v_ = _mm_xor_pd(v_, other.v_); + return *this; + } + + private: + Intrinsic v_; +}; + +// Nonmember functions for any V128 via member functions. + +template <typename T> +HH_INLINE V128<T> operator*(const V128<T>& left, const V128<T>& right) { + V128<T> t(left); + return t *= right; +} + +template <typename T> +HH_INLINE V128<T> operator/(const V128<T>& left, const V128<T>& right) { + V128<T> t(left); + return t /= right; +} + +template <typename T> +HH_INLINE V128<T> operator+(const V128<T>& left, const V128<T>& right) { + V128<T> t(left); + return t += right; +} + +template <typename T> +HH_INLINE V128<T> operator-(const V128<T>& left, const V128<T>& right) { + V128<T> t(left); + return t -= right; +} + +template <typename T> +HH_INLINE V128<T> operator&(const V128<T>& left, const V128<T>& right) { + V128<T> t(left); + return t &= right; +} + +template <typename T> +HH_INLINE V128<T> operator|(const V128<T>& left, const V128<T>& right) { + V128<T> t(left); + return t |= right; +} + +template <typename T> +HH_INLINE V128<T> operator^(const V128<T>& left, const V128<T>& right) { + V128<T> t(left); + return t ^= right; +} + +template <typename T> +HH_INLINE V128<T> operator<<(const V128<T>& v, const int count) { + V128<T> t(v); + return t <<= count; +} + +template <typename T> +HH_INLINE V128<T> operator>>(const V128<T>& v, const int count) { + V128<T> t(v); + return t >>= count; +} + +template <typename T> +HH_INLINE V128<T> operator<<(const V128<T>& v, const __m128i& count) { + V128<T> t(v); + return t <<= count; +} + +template <typename T> +HH_INLINE V128<T> operator>>(const V128<T>& v, const __m128i& count) { + V128<T> t(v); + return t >>= count; +} + +using V16x8U = V128<uint8_t>; +using V8x16U = V128<uint16_t>; +using V4x32U = V128<uint32_t>; +using V2x64U = V128<uint64_t>; +using V4x32F = V128<float>; +using V2x64F = V128<double>; + +// Load/Store for any V128. + +// We differentiate between targets' vector types via template specialization. +// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may +// generate better code in unoptimized builds. 
Only declare the primary +// templates to avoid needing mutual exclusion with vector256. + +template <class V> +HH_INLINE V Load(const typename V::T* const HH_RESTRICT from); + +template <class V> +HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from); + +// "from" must be vector-aligned. +template <> +HH_INLINE V16x8U Load<V16x8U>(const V16x8U::T* const HH_RESTRICT from) { + const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); + return V16x8U(_mm_load_si128(p)); +} +template <> +HH_INLINE V8x16U Load<V8x16U>(const V8x16U::T* const HH_RESTRICT from) { + const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); + return V8x16U(_mm_load_si128(p)); +} +template <> +HH_INLINE V4x32U Load<V4x32U>(const V4x32U::T* const HH_RESTRICT from) { + const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); + return V4x32U(_mm_load_si128(p)); +} +template <> +HH_INLINE V2x64U Load<V2x64U>(const V2x64U::T* const HH_RESTRICT from) { + const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); + return V2x64U(_mm_load_si128(p)); +} +template <> +HH_INLINE V4x32F Load<V4x32F>(const V4x32F::T* const HH_RESTRICT from) { + return V4x32F(_mm_load_ps(from)); +} +template <> +HH_INLINE V2x64F Load<V2x64F>(const V2x64F::T* const HH_RESTRICT from) { + return V2x64F(_mm_load_pd(from)); +} + +template <> +HH_INLINE V16x8U +LoadUnaligned<V16x8U>(const V16x8U::T* const HH_RESTRICT from) { + const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); + return V16x8U(_mm_loadu_si128(p)); +} +template <> +HH_INLINE V8x16U +LoadUnaligned<V8x16U>(const V8x16U::T* const HH_RESTRICT from) { + const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); + return V8x16U(_mm_loadu_si128(p)); +} +template <> +HH_INLINE V4x32U +LoadUnaligned<V4x32U>(const V4x32U::T* const HH_RESTRICT from) { + const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); + return V4x32U(_mm_loadu_si128(p)); +} +template <> +HH_INLINE V2x64U +LoadUnaligned<V2x64U>(const V2x64U::T* const HH_RESTRICT from) { + const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from); + return V2x64U(_mm_loadu_si128(p)); +} +template <> +HH_INLINE V4x32F +LoadUnaligned<V4x32F>(const V4x32F::T* const HH_RESTRICT from) { + return V4x32F(_mm_loadu_ps(from)); +} +template <> +HH_INLINE V2x64F +LoadUnaligned<V2x64F>(const V2x64F::T* const HH_RESTRICT from) { + return V2x64F(_mm_loadu_pd(from)); +} + +// "to" must be vector-aligned. +template <typename T> +HH_INLINE void Store(const V128<T>& v, T* const HH_RESTRICT to) { + _mm_store_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v); +} +HH_INLINE void Store(const V128<float>& v, float* const HH_RESTRICT to) { + _mm_store_ps(to, v); +} +HH_INLINE void Store(const V128<double>& v, double* const HH_RESTRICT to) { + _mm_store_pd(to, v); +} + +template <typename T> +HH_INLINE void StoreUnaligned(const V128<T>& v, T* const HH_RESTRICT to) { + _mm_storeu_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v); +} +HH_INLINE void StoreUnaligned(const V128<float>& v, + float* const HH_RESTRICT to) { + _mm_storeu_ps(to, v); +} +HH_INLINE void StoreUnaligned(const V128<double>& v, + double* const HH_RESTRICT to) { + _mm_storeu_pd(to, v); +} + +// Writes directly to (aligned) memory, bypassing the cache. This is useful for +// data that will not be read again in the near future. 
+template <typename T> +HH_INLINE void Stream(const V128<T>& v, T* const HH_RESTRICT to) { + _mm_stream_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v); +} +HH_INLINE void Stream(const V128<float>& v, float* const HH_RESTRICT to) { + _mm_stream_ps(to, v); +} +HH_INLINE void Stream(const V128<double>& v, double* const HH_RESTRICT to) { + _mm_stream_pd(to, v); +} + +// Miscellaneous functions. + +template <typename T> +HH_INLINE V128<T> RotateLeft(const V128<T>& v, const int count) { + constexpr size_t num_bits = sizeof(T) * 8; + return (v << count) | (v >> (num_bits - count)); +} + +template <typename T> +HH_INLINE V128<T> AndNot(const V128<T>& neg_mask, const V128<T>& values) { + return V128<T>(_mm_andnot_si128(neg_mask, values)); +} +template <> +HH_INLINE V128<float> AndNot(const V128<float>& neg_mask, + const V128<float>& values) { + return V128<float>(_mm_andnot_ps(neg_mask, values)); +} +template <> +HH_INLINE V128<double> AndNot(const V128<double>& neg_mask, + const V128<double>& values) { + return V128<double>(_mm_andnot_pd(neg_mask, values)); +} + +HH_INLINE V4x32F Select(const V4x32F& a, const V4x32F& b, const V4x32F& mask) { + return V4x32F(_mm_blendv_ps(a, b, mask)); +} + +HH_INLINE V2x64F Select(const V2x64F& a, const V2x64F& b, const V2x64F& mask) { + return V2x64F(_mm_blendv_pd(a, b, mask)); +} + +// Min/Max + +HH_INLINE V16x8U Min(const V16x8U& v0, const V16x8U& v1) { + return V16x8U(_mm_min_epu8(v0, v1)); +} + +HH_INLINE V16x8U Max(const V16x8U& v0, const V16x8U& v1) { + return V16x8U(_mm_max_epu8(v0, v1)); +} + +HH_INLINE V8x16U Min(const V8x16U& v0, const V8x16U& v1) { + return V8x16U(_mm_min_epu16(v0, v1)); +} + +HH_INLINE V8x16U Max(const V8x16U& v0, const V8x16U& v1) { + return V8x16U(_mm_max_epu16(v0, v1)); +} + +HH_INLINE V4x32U Min(const V4x32U& v0, const V4x32U& v1) { + return V4x32U(_mm_min_epu32(v0, v1)); +} + +HH_INLINE V4x32U Max(const V4x32U& v0, const V4x32U& v1) { + return V4x32U(_mm_max_epu32(v0, v1)); +} + +HH_INLINE V4x32F Min(const V4x32F& v0, const V4x32F& v1) { + return V4x32F(_mm_min_ps(v0, v1)); +} + +HH_INLINE V4x32F Max(const V4x32F& v0, const V4x32F& v1) { + return V4x32F(_mm_max_ps(v0, v1)); +} + +HH_INLINE V2x64F Min(const V2x64F& v0, const V2x64F& v1) { + return V2x64F(_mm_min_pd(v0, v1)); +} + +HH_INLINE V2x64F Max(const V2x64F& v0, const V2x64F& v1) { + return V2x64F(_mm_max_pd(v0, v1)); +} + +} // namespace HH_TARGET_NAME +} // namespace highwayhash + +#endif // HH_DISABLE_TARGET_SPECIFIC +#endif // HIGHWAYHASH_VECTOR128_H_ diff --git a/contrib/libs/highwayhash/highwayhash/vector256.h b/contrib/libs/highwayhash/highwayhash/vector256.h index d1ccec49ef..29199ddf00 100644 --- a/contrib/libs/highwayhash/highwayhash/vector256.h +++ b/contrib/libs/highwayhash/highwayhash/vector256.h @@ -1,758 +1,758 @@ -// Copyright 2016 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
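A minimal usage sketch of the 128-bit wrappers re-added above, before the vector256.h body. This is an illustration rather than part of the imported sources; it assumes a translation unit compiled with SSE4.1 flags per the restricted-header rules in arch_specific.h, and MixLanes is a hypothetical name.

// Hypothetical illustration; not part of the imported highwayhash sources.
#include "highwayhash/vector128.h"

namespace highwayhash {
namespace HH_TARGET_NAME {

// Sums two aligned pairs of 64-bit lanes, masks, rotates, and stores.
static HH_INLINE void MixLanes(const uint64_t* const HH_RESTRICT in,
                               uint64_t* const HH_RESTRICT out) {
  const V2x64U a = Load<V2x64U>(in);       // "in" must be 16-byte aligned.
  const V2x64U b = Load<V2x64U>(in + 2);
  const V2x64U mask(0xFF00FF00FF00FF00ull);
  // The overloaded operators replace _mm_and_si128(_mm_add_epi64(a, b), mask):
  const V2x64U masked_sum = (a + b) & mask;
  Store(RotateLeft(masked_sum, 17), out);  // "out" must be 16-byte aligned.
  // For results not read again soon, Stream(masked_sum, out) issues a
  // non-temporal store instead; follow a batch of Stream calls with
  // _mm_sfence() before another core consumes the data (our reading of
  // SSE non-temporal semantics, not a guarantee from this header).
}

}  // namespace HH_TARGET_NAME
}  // namespace highwayhash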
- -#ifndef HIGHWAYHASH_VECTOR256_H_ -#define HIGHWAYHASH_VECTOR256_H_ - -// Defines SIMD vector classes ("V4x64U") with overloaded arithmetic operators: -// const V4x64U masked_sum = (a + b) & m; -// This is shorter and more readable than compiler intrinsics: -// const __m256i masked_sum = _mm256_and_si256(_mm256_add_epi64(a, b), m); -// There is typically no runtime cost for these abstractions. -// -// The naming convention is VNxBBT where N is the number of lanes, BB the -// number of bits per lane and T is the lane type: unsigned integer (U), -// signed integer (I), or floating-point (F). - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include <stddef.h> -#include <stdint.h> - -#include "highwayhash/arch_specific.h" -#include "highwayhash/compiler_specific.h" - -// For auto-dependency generation, we need to include all headers but not their -// contents (otherwise compilation fails because -mavx2 is not specified). -#ifndef HH_DISABLE_TARGET_SPECIFIC - -// (This include cannot be moved within a namespace due to conflicts with -// other system headers; see the comment in hh_sse41.h.) -#include <immintrin.h> - -namespace highwayhash { -// To prevent ODR violations when including this from multiple translation -// units (TU) that are compiled with different flags, the contents must reside -// in a namespace whose name is unique to the TU. NOTE: this behavior is -// incompatible with precompiled modules and requires textual inclusion instead. -namespace HH_TARGET_NAME { - -// Primary template for 256-bit AVX2 vectors; only specializations are used. -template <typename T> -class V256 {}; - -template <> -class V256<uint8_t> { - public: - using Intrinsic = __m256i; - using T = uint8_t; - static constexpr size_t N = 32; - - // Leaves v_ uninitialized - typically used for output parameters. - HH_INLINE V256() {} - - // Broadcasts i to all lanes. - HH_INLINE explicit V256(T i) - : v_(_mm256_broadcastb_epi8(_mm_cvtsi32_si128(i))) {} - - // Copy from other vector. - HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} - template <typename U> - HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} - HH_INLINE V256& operator=(const V256& other) { - v_ = other.v_; - return *this; - } - - // Convert from/to intrinsics. - HH_INLINE V256(const Intrinsic& v) : v_(v) {} - HH_INLINE V256& operator=(const Intrinsic& v) { - v_ = v; - return *this; - } - HH_INLINE operator Intrinsic() const { return v_; } - - // There are no greater-than comparison instructions for unsigned T. 
- HH_INLINE V256 operator==(const V256& other) const { - return V256(_mm256_cmpeq_epi8(v_, other.v_)); - } - - HH_INLINE V256& operator+=(const V256& other) { - v_ = _mm256_add_epi8(v_, other.v_); - return *this; - } - HH_INLINE V256& operator-=(const V256& other) { - v_ = _mm256_sub_epi8(v_, other.v_); - return *this; - } - - HH_INLINE V256& operator&=(const V256& other) { - v_ = _mm256_and_si256(v_, other.v_); - return *this; - } - HH_INLINE V256& operator|=(const V256& other) { - v_ = _mm256_or_si256(v_, other.v_); - return *this; - } - HH_INLINE V256& operator^=(const V256& other) { - v_ = _mm256_xor_si256(v_, other.v_); - return *this; - } - - private: - Intrinsic v_; -}; - -template <> -class V256<uint16_t> { - public: - using Intrinsic = __m256i; - using T = uint16_t; - static constexpr size_t N = 16; - - // Leaves v_ uninitialized - typically used for output parameters. - HH_INLINE V256() {} - - // Lane 0 (p_0) is the lowest. - HH_INLINE V256(T p_F, T p_E, T p_D, T p_C, T p_B, T p_A, T p_9, T p_8, T p_7, - T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0) - : v_(_mm256_set_epi16(p_F, p_E, p_D, p_C, p_B, p_A, p_9, p_8, p_7, p_6, - p_5, p_4, p_3, p_2, p_1, p_0)) {} - - // Broadcasts i to all lanes. - HH_INLINE explicit V256(T i) - : v_(_mm256_broadcastw_epi16(_mm_cvtsi32_si128(i))) {} - - // Copy from other vector. - HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} - template <typename U> - HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} - HH_INLINE V256& operator=(const V256& other) { - v_ = other.v_; - return *this; - } - - // Convert from/to intrinsics. - HH_INLINE V256(const Intrinsic& v) : v_(v) {} - HH_INLINE V256& operator=(const Intrinsic& v) { - v_ = v; - return *this; - } - HH_INLINE operator Intrinsic() const { return v_; } - - // There are no greater-than comparison instructions for unsigned T. - HH_INLINE V256 operator==(const V256& other) const { - return V256(_mm256_cmpeq_epi16(v_, other.v_)); - } - - HH_INLINE V256& operator+=(const V256& other) { - v_ = _mm256_add_epi16(v_, other.v_); - return *this; - } - HH_INLINE V256& operator-=(const V256& other) { - v_ = _mm256_sub_epi16(v_, other.v_); - return *this; - } - - HH_INLINE V256& operator&=(const V256& other) { - v_ = _mm256_and_si256(v_, other.v_); - return *this; - } - HH_INLINE V256& operator|=(const V256& other) { - v_ = _mm256_or_si256(v_, other.v_); - return *this; - } - HH_INLINE V256& operator^=(const V256& other) { - v_ = _mm256_xor_si256(v_, other.v_); - return *this; - } - - HH_INLINE V256& operator<<=(const int count) { - v_ = _mm256_slli_epi16(v_, count); - return *this; - } - - HH_INLINE V256& operator>>=(const int count) { - v_ = _mm256_srli_epi16(v_, count); - return *this; - } - - private: - Intrinsic v_; -}; - -template <> -class V256<uint32_t> { - public: - using Intrinsic = __m256i; - using T = uint32_t; - static constexpr size_t N = 8; - - // Leaves v_ uninitialized - typically used for output parameters. - HH_INLINE V256() {} - - // Lane 0 (p_0) is the lowest. - HH_INLINE V256(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0) - : v_(_mm256_set_epi32(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {} - - // Broadcasts i to all lanes. - HH_INLINE explicit V256(T i) - : v_(_mm256_broadcastd_epi32(_mm_cvtsi32_si128(i))) {} - - // Copy from other vector. 
- HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} - template <typename U> - HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} - HH_INLINE V256& operator=(const V256& other) { - v_ = other.v_; - return *this; - } - - // Convert from/to intrinsics. - HH_INLINE V256(const Intrinsic& v) : v_(v) {} - HH_INLINE V256& operator=(const Intrinsic& v) { - v_ = v; - return *this; - } - HH_INLINE operator Intrinsic() const { return v_; } - - // There are no greater-than comparison instructions for unsigned T. - HH_INLINE V256 operator==(const V256& other) const { - return V256(_mm256_cmpeq_epi32(v_, other.v_)); - } - - HH_INLINE V256& operator+=(const V256& other) { - v_ = _mm256_add_epi32(v_, other.v_); - return *this; - } - HH_INLINE V256& operator-=(const V256& other) { - v_ = _mm256_sub_epi32(v_, other.v_); - return *this; - } - - HH_INLINE V256& operator&=(const V256& other) { - v_ = _mm256_and_si256(v_, other.v_); - return *this; - } - HH_INLINE V256& operator|=(const V256& other) { - v_ = _mm256_or_si256(v_, other.v_); - return *this; - } - HH_INLINE V256& operator^=(const V256& other) { - v_ = _mm256_xor_si256(v_, other.v_); - return *this; - } - - HH_INLINE V256& operator<<=(const int count) { - v_ = _mm256_slli_epi32(v_, count); - return *this; - } - - HH_INLINE V256& operator>>=(const int count) { - v_ = _mm256_srli_epi32(v_, count); - return *this; - } - - private: - Intrinsic v_; -}; - -template <> -class V256<uint64_t> { - public: - using Intrinsic = __m256i; - using T = uint64_t; - static constexpr size_t N = 4; - - // Leaves v_ uninitialized - typically used for output parameters. - HH_INLINE V256() {} - - // Lane 0 (p_0) is the lowest. - HH_INLINE V256(T p_3, T p_2, T p_1, T p_0) - : v_(_mm256_set_epi64x(p_3, p_2, p_1, p_0)) {} - - // Broadcasts i to all lanes. - HH_INLINE explicit V256(T i) - : v_(_mm256_broadcastq_epi64(_mm_cvtsi64_si128(i))) {} - - // Copy from other vector. - HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} - template <typename U> - HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} - HH_INLINE V256& operator=(const V256& other) { - v_ = other.v_; - return *this; - } - - // Convert from/to intrinsics. - HH_INLINE V256(const Intrinsic& v) : v_(v) {} - HH_INLINE V256& operator=(const Intrinsic& v) { - v_ = v; - return *this; - } - HH_INLINE operator Intrinsic() const { return v_; } - - // There are no greater-than comparison instructions for unsigned T. 
- HH_INLINE V256 operator==(const V256& other) const { - return V256(_mm256_cmpeq_epi64(v_, other.v_)); - } - - HH_INLINE V256& operator+=(const V256& other) { - v_ = _mm256_add_epi64(v_, other.v_); - return *this; - } - HH_INLINE V256& operator-=(const V256& other) { - v_ = _mm256_sub_epi64(v_, other.v_); - return *this; - } - - HH_INLINE V256& operator&=(const V256& other) { - v_ = _mm256_and_si256(v_, other.v_); - return *this; - } - HH_INLINE V256& operator|=(const V256& other) { - v_ = _mm256_or_si256(v_, other.v_); - return *this; - } - HH_INLINE V256& operator^=(const V256& other) { - v_ = _mm256_xor_si256(v_, other.v_); - return *this; - } - - HH_INLINE V256& operator<<=(const int count) { - v_ = _mm256_slli_epi64(v_, count); - return *this; - } - - HH_INLINE V256& operator>>=(const int count) { - v_ = _mm256_srli_epi64(v_, count); - return *this; - } - - private: - Intrinsic v_; -}; - -template <> -class V256<float> { - public: - using Intrinsic = __m256; - using T = float; - static constexpr size_t N = 8; - - // Leaves v_ uninitialized - typically used for output parameters. - HH_INLINE V256() {} - - // Lane 0 (p_0) is the lowest. - HH_INLINE V256(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0) - : v_(_mm256_set_ps(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {} - - // Broadcasts to all lanes. - HH_INLINE explicit V256(T f) : v_(_mm256_set1_ps(f)) {} - - // Copy from other vector. - HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} - template <typename U> - HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} - HH_INLINE V256& operator=(const V256& other) { - v_ = other.v_; - return *this; - } - - // Convert from/to intrinsics. - HH_INLINE V256(const Intrinsic& v) : v_(v) {} - HH_INLINE V256& operator=(const Intrinsic& v) { - v_ = v; - return *this; - } - HH_INLINE operator Intrinsic() const { return v_; } - - HH_INLINE V256 operator==(const V256& other) const { - return V256(_mm256_cmp_ps(v_, other.v_, 0)); - } - HH_INLINE V256 operator<(const V256& other) const { - return V256(_mm256_cmp_ps(v_, other.v_, 1)); - } - HH_INLINE V256 operator>(const V256& other) const { - return V256(_mm256_cmp_ps(other.v_, v_, 1)); - } - - HH_INLINE V256& operator*=(const V256& other) { - v_ = _mm256_mul_ps(v_, other.v_); - return *this; - } - HH_INLINE V256& operator/=(const V256& other) { - v_ = _mm256_div_ps(v_, other.v_); - return *this; - } - HH_INLINE V256& operator+=(const V256& other) { - v_ = _mm256_add_ps(v_, other.v_); - return *this; - } - HH_INLINE V256& operator-=(const V256& other) { - v_ = _mm256_sub_ps(v_, other.v_); - return *this; - } - - HH_INLINE V256& operator&=(const V256& other) { - v_ = _mm256_and_ps(v_, other.v_); - return *this; - } - HH_INLINE V256& operator|=(const V256& other) { - v_ = _mm256_or_ps(v_, other.v_); - return *this; - } - HH_INLINE V256& operator^=(const V256& other) { - v_ = _mm256_xor_ps(v_, other.v_); - return *this; - } - - private: - Intrinsic v_; -}; - -template <> -class V256<double> { - public: - using Intrinsic = __m256d; - using T = double; - static constexpr size_t N = 4; - - // Leaves v_ uninitialized - typically used for output parameters. - HH_INLINE V256() {} - - // Lane 0 (p_0) is the lowest. - HH_INLINE V256(T p_3, T p_2, T p_1, T p_0) - : v_(_mm256_set_pd(p_3, p_2, p_1, p_0)) {} - - // Broadcasts to all lanes. - HH_INLINE explicit V256(T f) : v_(_mm256_set1_pd(f)) {} - - // Copy from other vector. 
- HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} - template <typename U> - HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} - HH_INLINE V256& operator=(const V256& other) { - v_ = other.v_; - return *this; - } - - // Convert from/to intrinsics. - HH_INLINE V256(const Intrinsic& v) : v_(v) {} - HH_INLINE V256& operator=(const Intrinsic& v) { - v_ = v; - return *this; - } - HH_INLINE operator Intrinsic() const { return v_; } - - HH_INLINE V256 operator==(const V256& other) const { - return V256(_mm256_cmp_pd(v_, other.v_, 0)); - } - HH_INLINE V256 operator<(const V256& other) const { - return V256(_mm256_cmp_pd(v_, other.v_, 1)); - } - HH_INLINE V256 operator>(const V256& other) const { - return V256(_mm256_cmp_pd(other.v_, v_, 1)); - } - - HH_INLINE V256& operator*=(const V256& other) { - v_ = _mm256_mul_pd(v_, other.v_); - return *this; - } - HH_INLINE V256& operator/=(const V256& other) { - v_ = _mm256_div_pd(v_, other.v_); - return *this; - } - HH_INLINE V256& operator+=(const V256& other) { - v_ = _mm256_add_pd(v_, other.v_); - return *this; - } - HH_INLINE V256& operator-=(const V256& other) { - v_ = _mm256_sub_pd(v_, other.v_); - return *this; - } - - HH_INLINE V256& operator&=(const V256& other) { - v_ = _mm256_and_pd(v_, other.v_); - return *this; - } - HH_INLINE V256& operator|=(const V256& other) { - v_ = _mm256_or_pd(v_, other.v_); - return *this; - } - HH_INLINE V256& operator^=(const V256& other) { - v_ = _mm256_xor_pd(v_, other.v_); - return *this; - } - - private: - Intrinsic v_; -}; - -// Nonmember functions for any V256 via member functions. - -template <typename T> -HH_INLINE V256<T> operator*(const V256<T>& left, const V256<T>& right) { - V256<T> t(left); - return t *= right; -} - -template <typename T> -HH_INLINE V256<T> operator/(const V256<T>& left, const V256<T>& right) { - V256<T> t(left); - return t /= right; -} - -template <typename T> -HH_INLINE V256<T> operator+(const V256<T>& left, const V256<T>& right) { - V256<T> t(left); - return t += right; -} - -template <typename T> -HH_INLINE V256<T> operator-(const V256<T>& left, const V256<T>& right) { - V256<T> t(left); - return t -= right; -} - -template <typename T> -HH_INLINE V256<T> operator&(const V256<T>& left, const V256<T>& right) { - V256<T> t(left); - return t &= right; -} - -template <typename T> -HH_INLINE V256<T> operator|(const V256<T> left, const V256<T>& right) { - V256<T> t(left); - return t |= right; -} - -template <typename T> -HH_INLINE V256<T> operator^(const V256<T>& left, const V256<T>& right) { - V256<T> t(left); - return t ^= right; -} - -template <typename T> -HH_INLINE V256<T> operator<<(const V256<T>& v, const int count) { - V256<T> t(v); - return t <<= count; -} - -template <typename T> -HH_INLINE V256<T> operator>>(const V256<T>& v, const int count) { - V256<T> t(v); - return t >>= count; -} - -// We do not provide operator<<(V, __m128i) because it has 4 cycle latency -// (to broadcast the shift count). It is faster to use sllv_epi64 etc. instead. - -using V32x8U = V256<uint8_t>; -using V16x16U = V256<uint16_t>; -using V8x32U = V256<uint32_t>; -using V4x64U = V256<uint64_t>; -using V8x32F = V256<float>; -using V4x64F = V256<double>; - -// Load/Store for any V256. - -// We differentiate between targets' vector types via template specialization. -// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may -// generate better code in unoptimized builds. Only declare the primary -// templates to avoid needing mutual exclusion with vector128. 
- -template <class V> -HH_INLINE V Load(const typename V::T* const HH_RESTRICT from); - -template <class V> -HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from); - -template <> -HH_INLINE V32x8U Load(const V32x8U::T* const HH_RESTRICT from) { - const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); - return V32x8U(_mm256_load_si256(p)); -} -template <> -HH_INLINE V16x16U Load(const V16x16U::T* const HH_RESTRICT from) { - const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); - return V16x16U(_mm256_load_si256(p)); -} -template <> -HH_INLINE V8x32U Load(const V8x32U::T* const HH_RESTRICT from) { - const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); - return V8x32U(_mm256_load_si256(p)); -} -template <> -HH_INLINE V4x64U Load(const V4x64U::T* const HH_RESTRICT from) { - const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); - return V4x64U(_mm256_load_si256(p)); -} -template <> -HH_INLINE V8x32F Load(const V8x32F::T* const HH_RESTRICT from) { - return V8x32F(_mm256_load_ps(from)); -} -template <> -HH_INLINE V4x64F Load(const V4x64F::T* const HH_RESTRICT from) { - return V4x64F(_mm256_load_pd(from)); -} - -template <> -HH_INLINE V32x8U LoadUnaligned(const V32x8U::T* const HH_RESTRICT from) { - const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); - return V32x8U(_mm256_loadu_si256(p)); -} -template <> -HH_INLINE V16x16U LoadUnaligned(const V16x16U::T* const HH_RESTRICT from) { - const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); - return V16x16U(_mm256_loadu_si256(p)); -} -template <> -HH_INLINE V8x32U LoadUnaligned(const V8x32U::T* const HH_RESTRICT from) { - const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); - return V8x32U(_mm256_loadu_si256(p)); -} -template <> -HH_INLINE V4x64U LoadUnaligned(const V4x64U::T* const HH_RESTRICT from) { - const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); - return V4x64U(_mm256_loadu_si256(p)); -} -template <> -HH_INLINE V8x32F LoadUnaligned(const V8x32F::T* const HH_RESTRICT from) { - return V8x32F(_mm256_loadu_ps(from)); -} -template <> -HH_INLINE V4x64F LoadUnaligned(const V4x64F::T* const HH_RESTRICT from) { - return V4x64F(_mm256_loadu_pd(from)); -} - -// "to" must be vector-aligned. -template <typename T> -HH_INLINE void Store(const V256<T>& v, T* const HH_RESTRICT to) { - _mm256_store_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v); -} -HH_INLINE void Store(const V256<float>& v, float* const HH_RESTRICT to) { - _mm256_store_ps(to, v); -} -HH_INLINE void Store(const V256<double>& v, double* const HH_RESTRICT to) { - _mm256_store_pd(to, v); -} - -template <typename T> -HH_INLINE void StoreUnaligned(const V256<T>& v, T* const HH_RESTRICT to) { - _mm256_storeu_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v); -} -HH_INLINE void StoreUnaligned(const V256<float>& v, - float* const HH_RESTRICT to) { - _mm256_storeu_ps(to, v); -} -HH_INLINE void StoreUnaligned(const V256<double>& v, - double* const HH_RESTRICT to) { - _mm256_storeu_pd(to, v); -} - -// Writes directly to (aligned) memory, bypassing the cache. This is useful for -// data that will not be read again in the near future. 
-template <typename T> -HH_INLINE void Stream(const V256<T>& v, T* const HH_RESTRICT to) { - _mm256_stream_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v); -} -HH_INLINE void Stream(const V256<float>& v, float* const HH_RESTRICT to) { - _mm256_stream_ps(to, v); -} -HH_INLINE void Stream(const V256<double>& v, double* const HH_RESTRICT to) { - _mm256_stream_pd(to, v); -} - -// Miscellaneous functions. - -template <typename T> -HH_INLINE V256<T> RotateLeft(const V256<T>& v, const int count) { - constexpr size_t num_bits = sizeof(T) * 8; - return (v << count) | (v >> (num_bits - count)); -} - -template <typename T> -HH_INLINE V256<T> AndNot(const V256<T>& neg_mask, const V256<T>& values) { - return V256<T>(_mm256_andnot_si256(neg_mask, values)); -} -template <> -HH_INLINE V256<float> AndNot(const V256<float>& neg_mask, - const V256<float>& values) { - return V256<float>(_mm256_andnot_ps(neg_mask, values)); -} -template <> -HH_INLINE V256<double> AndNot(const V256<double>& neg_mask, - const V256<double>& values) { - return V256<double>(_mm256_andnot_pd(neg_mask, values)); -} - -HH_INLINE V8x32F Select(const V8x32F& a, const V8x32F& b, const V8x32F& mask) { - return V8x32F(_mm256_blendv_ps(a, b, mask)); -} - -HH_INLINE V4x64F Select(const V4x64F& a, const V4x64F& b, const V4x64F& mask) { - return V4x64F(_mm256_blendv_pd(a, b, mask)); -} - -// Min/Max - -HH_INLINE V32x8U Min(const V32x8U& v0, const V32x8U& v1) { - return V32x8U(_mm256_min_epu8(v0, v1)); -} - -HH_INLINE V32x8U Max(const V32x8U& v0, const V32x8U& v1) { - return V32x8U(_mm256_max_epu8(v0, v1)); -} - -HH_INLINE V16x16U Min(const V16x16U& v0, const V16x16U& v1) { - return V16x16U(_mm256_min_epu16(v0, v1)); -} - -HH_INLINE V16x16U Max(const V16x16U& v0, const V16x16U& v1) { - return V16x16U(_mm256_max_epu16(v0, v1)); -} - -HH_INLINE V8x32U Min(const V8x32U& v0, const V8x32U& v1) { - return V8x32U(_mm256_min_epu32(v0, v1)); -} - -HH_INLINE V8x32U Max(const V8x32U& v0, const V8x32U& v1) { - return V8x32U(_mm256_max_epu32(v0, v1)); -} - -HH_INLINE V8x32F Min(const V8x32F& v0, const V8x32F& v1) { - return V8x32F(_mm256_min_ps(v0, v1)); -} - -HH_INLINE V8x32F Max(const V8x32F& v0, const V8x32F& v1) { - return V8x32F(_mm256_max_ps(v0, v1)); -} - -HH_INLINE V4x64F Min(const V4x64F& v0, const V4x64F& v1) { - return V4x64F(_mm256_min_pd(v0, v1)); -} - -HH_INLINE V4x64F Max(const V4x64F& v0, const V4x64F& v1) { - return V4x64F(_mm256_max_pd(v0, v1)); -} - -} // namespace HH_TARGET_NAME -} // namespace highwayhash - -#endif // HH_DISABLE_TARGET_SPECIFIC -#endif // HIGHWAYHASH_VECTOR256_H_ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef HIGHWAYHASH_VECTOR256_H_ +#define HIGHWAYHASH_VECTOR256_H_ + +// Defines SIMD vector classes ("V4x64U") with overloaded arithmetic operators: +// const V4x64U masked_sum = (a + b) & m; +// This is shorter and more readable than compiler intrinsics: +// const __m256i masked_sum = _mm256_and_si256(_mm256_add_epi64(a, b), m); +// There is typically no runtime cost for these abstractions. +// +// The naming convention is VNxBBT where N is the number of lanes, BB the +// number of bits per lane and T is the lane type: unsigned integer (U), +// signed integer (I), or floating-point (F). + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include <stddef.h> +#include <stdint.h> + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" + +// For auto-dependency generation, we need to include all headers but not their +// contents (otherwise compilation fails because -mavx2 is not specified). +#ifndef HH_DISABLE_TARGET_SPECIFIC + +// (This include cannot be moved within a namespace due to conflicts with +// other system headers; see the comment in hh_sse41.h.) +#include <immintrin.h> + +namespace highwayhash { +// To prevent ODR violations when including this from multiple translation +// units (TU) that are compiled with different flags, the contents must reside +// in a namespace whose name is unique to the TU. NOTE: this behavior is +// incompatible with precompiled modules and requires textual inclusion instead. +namespace HH_TARGET_NAME { + +// Primary template for 256-bit AVX2 vectors; only specializations are used. +template <typename T> +class V256 {}; + +template <> +class V256<uint8_t> { + public: + using Intrinsic = __m256i; + using T = uint8_t; + static constexpr size_t N = 32; + + // Leaves v_ uninitialized - typically used for output parameters. + HH_INLINE V256() {} + + // Broadcasts i to all lanes. + HH_INLINE explicit V256(T i) + : v_(_mm256_broadcastb_epi8(_mm_cvtsi32_si128(i))) {} + + // Copy from other vector. + HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} + template <typename U> + HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} + HH_INLINE V256& operator=(const V256& other) { + v_ = other.v_; + return *this; + } + + // Convert from/to intrinsics. + HH_INLINE V256(const Intrinsic& v) : v_(v) {} + HH_INLINE V256& operator=(const Intrinsic& v) { + v_ = v; + return *this; + } + HH_INLINE operator Intrinsic() const { return v_; } + + // There are no greater-than comparison instructions for unsigned T. 
+ HH_INLINE V256 operator==(const V256& other) const { + return V256(_mm256_cmpeq_epi8(v_, other.v_)); + } + + HH_INLINE V256& operator+=(const V256& other) { + v_ = _mm256_add_epi8(v_, other.v_); + return *this; + } + HH_INLINE V256& operator-=(const V256& other) { + v_ = _mm256_sub_epi8(v_, other.v_); + return *this; + } + + HH_INLINE V256& operator&=(const V256& other) { + v_ = _mm256_and_si256(v_, other.v_); + return *this; + } + HH_INLINE V256& operator|=(const V256& other) { + v_ = _mm256_or_si256(v_, other.v_); + return *this; + } + HH_INLINE V256& operator^=(const V256& other) { + v_ = _mm256_xor_si256(v_, other.v_); + return *this; + } + + private: + Intrinsic v_; +}; + +template <> +class V256<uint16_t> { + public: + using Intrinsic = __m256i; + using T = uint16_t; + static constexpr size_t N = 16; + + // Leaves v_ uninitialized - typically used for output parameters. + HH_INLINE V256() {} + + // Lane 0 (p_0) is the lowest. + HH_INLINE V256(T p_F, T p_E, T p_D, T p_C, T p_B, T p_A, T p_9, T p_8, T p_7, + T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0) + : v_(_mm256_set_epi16(p_F, p_E, p_D, p_C, p_B, p_A, p_9, p_8, p_7, p_6, + p_5, p_4, p_3, p_2, p_1, p_0)) {} + + // Broadcasts i to all lanes. + HH_INLINE explicit V256(T i) + : v_(_mm256_broadcastw_epi16(_mm_cvtsi32_si128(i))) {} + + // Copy from other vector. + HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} + template <typename U> + HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} + HH_INLINE V256& operator=(const V256& other) { + v_ = other.v_; + return *this; + } + + // Convert from/to intrinsics. + HH_INLINE V256(const Intrinsic& v) : v_(v) {} + HH_INLINE V256& operator=(const Intrinsic& v) { + v_ = v; + return *this; + } + HH_INLINE operator Intrinsic() const { return v_; } + + // There are no greater-than comparison instructions for unsigned T. + HH_INLINE V256 operator==(const V256& other) const { + return V256(_mm256_cmpeq_epi16(v_, other.v_)); + } + + HH_INLINE V256& operator+=(const V256& other) { + v_ = _mm256_add_epi16(v_, other.v_); + return *this; + } + HH_INLINE V256& operator-=(const V256& other) { + v_ = _mm256_sub_epi16(v_, other.v_); + return *this; + } + + HH_INLINE V256& operator&=(const V256& other) { + v_ = _mm256_and_si256(v_, other.v_); + return *this; + } + HH_INLINE V256& operator|=(const V256& other) { + v_ = _mm256_or_si256(v_, other.v_); + return *this; + } + HH_INLINE V256& operator^=(const V256& other) { + v_ = _mm256_xor_si256(v_, other.v_); + return *this; + } + + HH_INLINE V256& operator<<=(const int count) { + v_ = _mm256_slli_epi16(v_, count); + return *this; + } + + HH_INLINE V256& operator>>=(const int count) { + v_ = _mm256_srli_epi16(v_, count); + return *this; + } + + private: + Intrinsic v_; +}; + +template <> +class V256<uint32_t> { + public: + using Intrinsic = __m256i; + using T = uint32_t; + static constexpr size_t N = 8; + + // Leaves v_ uninitialized - typically used for output parameters. + HH_INLINE V256() {} + + // Lane 0 (p_0) is the lowest. + HH_INLINE V256(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0) + : v_(_mm256_set_epi32(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {} + + // Broadcasts i to all lanes. + HH_INLINE explicit V256(T i) + : v_(_mm256_broadcastd_epi32(_mm_cvtsi32_si128(i))) {} + + // Copy from other vector. 
+ HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} + template <typename U> + HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} + HH_INLINE V256& operator=(const V256& other) { + v_ = other.v_; + return *this; + } + + // Convert from/to intrinsics. + HH_INLINE V256(const Intrinsic& v) : v_(v) {} + HH_INLINE V256& operator=(const Intrinsic& v) { + v_ = v; + return *this; + } + HH_INLINE operator Intrinsic() const { return v_; } + + // There are no greater-than comparison instructions for unsigned T. + HH_INLINE V256 operator==(const V256& other) const { + return V256(_mm256_cmpeq_epi32(v_, other.v_)); + } + + HH_INLINE V256& operator+=(const V256& other) { + v_ = _mm256_add_epi32(v_, other.v_); + return *this; + } + HH_INLINE V256& operator-=(const V256& other) { + v_ = _mm256_sub_epi32(v_, other.v_); + return *this; + } + + HH_INLINE V256& operator&=(const V256& other) { + v_ = _mm256_and_si256(v_, other.v_); + return *this; + } + HH_INLINE V256& operator|=(const V256& other) { + v_ = _mm256_or_si256(v_, other.v_); + return *this; + } + HH_INLINE V256& operator^=(const V256& other) { + v_ = _mm256_xor_si256(v_, other.v_); + return *this; + } + + HH_INLINE V256& operator<<=(const int count) { + v_ = _mm256_slli_epi32(v_, count); + return *this; + } + + HH_INLINE V256& operator>>=(const int count) { + v_ = _mm256_srli_epi32(v_, count); + return *this; + } + + private: + Intrinsic v_; +}; + +template <> +class V256<uint64_t> { + public: + using Intrinsic = __m256i; + using T = uint64_t; + static constexpr size_t N = 4; + + // Leaves v_ uninitialized - typically used for output parameters. + HH_INLINE V256() {} + + // Lane 0 (p_0) is the lowest. + HH_INLINE V256(T p_3, T p_2, T p_1, T p_0) + : v_(_mm256_set_epi64x(p_3, p_2, p_1, p_0)) {} + + // Broadcasts i to all lanes. + HH_INLINE explicit V256(T i) + : v_(_mm256_broadcastq_epi64(_mm_cvtsi64_si128(i))) {} + + // Copy from other vector. + HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} + template <typename U> + HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} + HH_INLINE V256& operator=(const V256& other) { + v_ = other.v_; + return *this; + } + + // Convert from/to intrinsics. + HH_INLINE V256(const Intrinsic& v) : v_(v) {} + HH_INLINE V256& operator=(const Intrinsic& v) { + v_ = v; + return *this; + } + HH_INLINE operator Intrinsic() const { return v_; } + + // There are no greater-than comparison instructions for unsigned T. 
+ HH_INLINE V256 operator==(const V256& other) const { + return V256(_mm256_cmpeq_epi64(v_, other.v_)); + } + + HH_INLINE V256& operator+=(const V256& other) { + v_ = _mm256_add_epi64(v_, other.v_); + return *this; + } + HH_INLINE V256& operator-=(const V256& other) { + v_ = _mm256_sub_epi64(v_, other.v_); + return *this; + } + + HH_INLINE V256& operator&=(const V256& other) { + v_ = _mm256_and_si256(v_, other.v_); + return *this; + } + HH_INLINE V256& operator|=(const V256& other) { + v_ = _mm256_or_si256(v_, other.v_); + return *this; + } + HH_INLINE V256& operator^=(const V256& other) { + v_ = _mm256_xor_si256(v_, other.v_); + return *this; + } + + HH_INLINE V256& operator<<=(const int count) { + v_ = _mm256_slli_epi64(v_, count); + return *this; + } + + HH_INLINE V256& operator>>=(const int count) { + v_ = _mm256_srli_epi64(v_, count); + return *this; + } + + private: + Intrinsic v_; +}; + +template <> +class V256<float> { + public: + using Intrinsic = __m256; + using T = float; + static constexpr size_t N = 8; + + // Leaves v_ uninitialized - typically used for output parameters. + HH_INLINE V256() {} + + // Lane 0 (p_0) is the lowest. + HH_INLINE V256(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0) + : v_(_mm256_set_ps(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {} + + // Broadcasts to all lanes. + HH_INLINE explicit V256(T f) : v_(_mm256_set1_ps(f)) {} + + // Copy from other vector. + HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} + template <typename U> + HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} + HH_INLINE V256& operator=(const V256& other) { + v_ = other.v_; + return *this; + } + + // Convert from/to intrinsics. + HH_INLINE V256(const Intrinsic& v) : v_(v) {} + HH_INLINE V256& operator=(const Intrinsic& v) { + v_ = v; + return *this; + } + HH_INLINE operator Intrinsic() const { return v_; } + + HH_INLINE V256 operator==(const V256& other) const { + return V256(_mm256_cmp_ps(v_, other.v_, 0)); + } + HH_INLINE V256 operator<(const V256& other) const { + return V256(_mm256_cmp_ps(v_, other.v_, 1)); + } + HH_INLINE V256 operator>(const V256& other) const { + return V256(_mm256_cmp_ps(other.v_, v_, 1)); + } + + HH_INLINE V256& operator*=(const V256& other) { + v_ = _mm256_mul_ps(v_, other.v_); + return *this; + } + HH_INLINE V256& operator/=(const V256& other) { + v_ = _mm256_div_ps(v_, other.v_); + return *this; + } + HH_INLINE V256& operator+=(const V256& other) { + v_ = _mm256_add_ps(v_, other.v_); + return *this; + } + HH_INLINE V256& operator-=(const V256& other) { + v_ = _mm256_sub_ps(v_, other.v_); + return *this; + } + + HH_INLINE V256& operator&=(const V256& other) { + v_ = _mm256_and_ps(v_, other.v_); + return *this; + } + HH_INLINE V256& operator|=(const V256& other) { + v_ = _mm256_or_ps(v_, other.v_); + return *this; + } + HH_INLINE V256& operator^=(const V256& other) { + v_ = _mm256_xor_ps(v_, other.v_); + return *this; + } + + private: + Intrinsic v_; +}; + +template <> +class V256<double> { + public: + using Intrinsic = __m256d; + using T = double; + static constexpr size_t N = 4; + + // Leaves v_ uninitialized - typically used for output parameters. + HH_INLINE V256() {} + + // Lane 0 (p_0) is the lowest. + HH_INLINE V256(T p_3, T p_2, T p_1, T p_0) + : v_(_mm256_set_pd(p_3, p_2, p_1, p_0)) {} + + // Broadcasts to all lanes. + HH_INLINE explicit V256(T f) : v_(_mm256_set1_pd(f)) {} + + // Copy from other vector. 
+ HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} + template <typename U> + HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} + HH_INLINE V256& operator=(const V256& other) { + v_ = other.v_; + return *this; + } + + // Convert from/to intrinsics. + HH_INLINE V256(const Intrinsic& v) : v_(v) {} + HH_INLINE V256& operator=(const Intrinsic& v) { + v_ = v; + return *this; + } + HH_INLINE operator Intrinsic() const { return v_; } + + HH_INLINE V256 operator==(const V256& other) const { + return V256(_mm256_cmp_pd(v_, other.v_, 0)); + } + HH_INLINE V256 operator<(const V256& other) const { + return V256(_mm256_cmp_pd(v_, other.v_, 1)); + } + HH_INLINE V256 operator>(const V256& other) const { + return V256(_mm256_cmp_pd(other.v_, v_, 1)); + } + + HH_INLINE V256& operator*=(const V256& other) { + v_ = _mm256_mul_pd(v_, other.v_); + return *this; + } + HH_INLINE V256& operator/=(const V256& other) { + v_ = _mm256_div_pd(v_, other.v_); + return *this; + } + HH_INLINE V256& operator+=(const V256& other) { + v_ = _mm256_add_pd(v_, other.v_); + return *this; + } + HH_INLINE V256& operator-=(const V256& other) { + v_ = _mm256_sub_pd(v_, other.v_); + return *this; + } + + HH_INLINE V256& operator&=(const V256& other) { + v_ = _mm256_and_pd(v_, other.v_); + return *this; + } + HH_INLINE V256& operator|=(const V256& other) { + v_ = _mm256_or_pd(v_, other.v_); + return *this; + } + HH_INLINE V256& operator^=(const V256& other) { + v_ = _mm256_xor_pd(v_, other.v_); + return *this; + } + + private: + Intrinsic v_; +}; + +// Nonmember functions for any V256 via member functions. + +template <typename T> +HH_INLINE V256<T> operator*(const V256<T>& left, const V256<T>& right) { + V256<T> t(left); + return t *= right; +} + +template <typename T> +HH_INLINE V256<T> operator/(const V256<T>& left, const V256<T>& right) { + V256<T> t(left); + return t /= right; +} + +template <typename T> +HH_INLINE V256<T> operator+(const V256<T>& left, const V256<T>& right) { + V256<T> t(left); + return t += right; +} + +template <typename T> +HH_INLINE V256<T> operator-(const V256<T>& left, const V256<T>& right) { + V256<T> t(left); + return t -= right; +} + +template <typename T> +HH_INLINE V256<T> operator&(const V256<T>& left, const V256<T>& right) { + V256<T> t(left); + return t &= right; +} + +template <typename T> +HH_INLINE V256<T> operator|(const V256<T> left, const V256<T>& right) { + V256<T> t(left); + return t |= right; +} + +template <typename T> +HH_INLINE V256<T> operator^(const V256<T>& left, const V256<T>& right) { + V256<T> t(left); + return t ^= right; +} + +template <typename T> +HH_INLINE V256<T> operator<<(const V256<T>& v, const int count) { + V256<T> t(v); + return t <<= count; +} + +template <typename T> +HH_INLINE V256<T> operator>>(const V256<T>& v, const int count) { + V256<T> t(v); + return t >>= count; +} + +// We do not provide operator<<(V, __m128i) because it has 4 cycle latency +// (to broadcast the shift count). It is faster to use sllv_epi64 etc. instead. + +using V32x8U = V256<uint8_t>; +using V16x16U = V256<uint16_t>; +using V8x32U = V256<uint32_t>; +using V4x64U = V256<uint64_t>; +using V8x32F = V256<float>; +using V4x64F = V256<double>; + +// Load/Store for any V256. + +// We differentiate between targets' vector types via template specialization. +// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may +// generate better code in unoptimized builds. Only declare the primary +// templates to avoid needing mutual exclusion with vector128. 
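To make the specialization scheme just described concrete, a short sketch (ours, not the library's) of how the lane type selects the matching load intrinsic at the call site; Accumulate is a hypothetical helper and AVX2 compile flags are assumed.

// Hypothetical sketch; assumes this TU is compiled with AVX2 flags.
// The explicit template argument picks the specialization declared below.
static HH_INLINE V4x64U Accumulate(const uint64_t* const HH_RESTRICT aligned,
                                   const uint64_t* const HH_RESTRICT other) {
  const V4x64U a = Load<V4x64U>(aligned);         // _mm256_load_si256; 32-byte aligned.
  const V4x64U b = LoadUnaligned<V4x64U>(other);  // _mm256_loadu_si256; any alignment.
  return a + b;
}

Callers would typically declare the aligned buffer with alignas(32) so that the aligned Load/Store variants are legal.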
+ +template <class V> +HH_INLINE V Load(const typename V::T* const HH_RESTRICT from); + +template <class V> +HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from); + +template <> +HH_INLINE V32x8U Load(const V32x8U::T* const HH_RESTRICT from) { + const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); + return V32x8U(_mm256_load_si256(p)); +} +template <> +HH_INLINE V16x16U Load(const V16x16U::T* const HH_RESTRICT from) { + const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); + return V16x16U(_mm256_load_si256(p)); +} +template <> +HH_INLINE V8x32U Load(const V8x32U::T* const HH_RESTRICT from) { + const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); + return V8x32U(_mm256_load_si256(p)); +} +template <> +HH_INLINE V4x64U Load(const V4x64U::T* const HH_RESTRICT from) { + const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); + return V4x64U(_mm256_load_si256(p)); +} +template <> +HH_INLINE V8x32F Load(const V8x32F::T* const HH_RESTRICT from) { + return V8x32F(_mm256_load_ps(from)); +} +template <> +HH_INLINE V4x64F Load(const V4x64F::T* const HH_RESTRICT from) { + return V4x64F(_mm256_load_pd(from)); +} + +template <> +HH_INLINE V32x8U LoadUnaligned(const V32x8U::T* const HH_RESTRICT from) { + const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); + return V32x8U(_mm256_loadu_si256(p)); +} +template <> +HH_INLINE V16x16U LoadUnaligned(const V16x16U::T* const HH_RESTRICT from) { + const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); + return V16x16U(_mm256_loadu_si256(p)); +} +template <> +HH_INLINE V8x32U LoadUnaligned(const V8x32U::T* const HH_RESTRICT from) { + const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); + return V8x32U(_mm256_loadu_si256(p)); +} +template <> +HH_INLINE V4x64U LoadUnaligned(const V4x64U::T* const HH_RESTRICT from) { + const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); + return V4x64U(_mm256_loadu_si256(p)); +} +template <> +HH_INLINE V8x32F LoadUnaligned(const V8x32F::T* const HH_RESTRICT from) { + return V8x32F(_mm256_loadu_ps(from)); +} +template <> +HH_INLINE V4x64F LoadUnaligned(const V4x64F::T* const HH_RESTRICT from) { + return V4x64F(_mm256_loadu_pd(from)); +} + +// "to" must be vector-aligned. +template <typename T> +HH_INLINE void Store(const V256<T>& v, T* const HH_RESTRICT to) { + _mm256_store_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v); +} +HH_INLINE void Store(const V256<float>& v, float* const HH_RESTRICT to) { + _mm256_store_ps(to, v); +} +HH_INLINE void Store(const V256<double>& v, double* const HH_RESTRICT to) { + _mm256_store_pd(to, v); +} + +template <typename T> +HH_INLINE void StoreUnaligned(const V256<T>& v, T* const HH_RESTRICT to) { + _mm256_storeu_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v); +} +HH_INLINE void StoreUnaligned(const V256<float>& v, + float* const HH_RESTRICT to) { + _mm256_storeu_ps(to, v); +} +HH_INLINE void StoreUnaligned(const V256<double>& v, + double* const HH_RESTRICT to) { + _mm256_storeu_pd(to, v); +} + +// Writes directly to (aligned) memory, bypassing the cache. This is useful for +// data that will not be read again in the near future. 
+template <typename T> +HH_INLINE void Stream(const V256<T>& v, T* const HH_RESTRICT to) { + _mm256_stream_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v); +} +HH_INLINE void Stream(const V256<float>& v, float* const HH_RESTRICT to) { + _mm256_stream_ps(to, v); +} +HH_INLINE void Stream(const V256<double>& v, double* const HH_RESTRICT to) { + _mm256_stream_pd(to, v); +} + +// Miscellaneous functions. + +template <typename T> +HH_INLINE V256<T> RotateLeft(const V256<T>& v, const int count) { + constexpr size_t num_bits = sizeof(T) * 8; + return (v << count) | (v >> (num_bits - count)); +} + +template <typename T> +HH_INLINE V256<T> AndNot(const V256<T>& neg_mask, const V256<T>& values) { + return V256<T>(_mm256_andnot_si256(neg_mask, values)); +} +template <> +HH_INLINE V256<float> AndNot(const V256<float>& neg_mask, + const V256<float>& values) { + return V256<float>(_mm256_andnot_ps(neg_mask, values)); +} +template <> +HH_INLINE V256<double> AndNot(const V256<double>& neg_mask, + const V256<double>& values) { + return V256<double>(_mm256_andnot_pd(neg_mask, values)); +} + +HH_INLINE V8x32F Select(const V8x32F& a, const V8x32F& b, const V8x32F& mask) { + return V8x32F(_mm256_blendv_ps(a, b, mask)); +} + +HH_INLINE V4x64F Select(const V4x64F& a, const V4x64F& b, const V4x64F& mask) { + return V4x64F(_mm256_blendv_pd(a, b, mask)); +} + +// Min/Max + +HH_INLINE V32x8U Min(const V32x8U& v0, const V32x8U& v1) { + return V32x8U(_mm256_min_epu8(v0, v1)); +} + +HH_INLINE V32x8U Max(const V32x8U& v0, const V32x8U& v1) { + return V32x8U(_mm256_max_epu8(v0, v1)); +} + +HH_INLINE V16x16U Min(const V16x16U& v0, const V16x16U& v1) { + return V16x16U(_mm256_min_epu16(v0, v1)); +} + +HH_INLINE V16x16U Max(const V16x16U& v0, const V16x16U& v1) { + return V16x16U(_mm256_max_epu16(v0, v1)); +} + +HH_INLINE V8x32U Min(const V8x32U& v0, const V8x32U& v1) { + return V8x32U(_mm256_min_epu32(v0, v1)); +} + +HH_INLINE V8x32U Max(const V8x32U& v0, const V8x32U& v1) { + return V8x32U(_mm256_max_epu32(v0, v1)); +} + +HH_INLINE V8x32F Min(const V8x32F& v0, const V8x32F& v1) { + return V8x32F(_mm256_min_ps(v0, v1)); +} + +HH_INLINE V8x32F Max(const V8x32F& v0, const V8x32F& v1) { + return V8x32F(_mm256_max_ps(v0, v1)); +} + +HH_INLINE V4x64F Min(const V4x64F& v0, const V4x64F& v1) { + return V4x64F(_mm256_min_pd(v0, v1)); +} + +HH_INLINE V4x64F Max(const V4x64F& v0, const V4x64F& v1) { + return V4x64F(_mm256_max_pd(v0, v1)); +} + +} // namespace HH_TARGET_NAME +} // namespace highwayhash + +#endif // HH_DISABLE_TARGET_SPECIFIC +#endif // HIGHWAYHASH_VECTOR256_H_ diff --git a/contrib/libs/highwayhash/highwayhash/vector_test.cc b/contrib/libs/highwayhash/highwayhash/vector_test.cc index d9f02567be..a8bdfacac2 100644 --- a/contrib/libs/highwayhash/highwayhash/vector_test.cc +++ b/contrib/libs/highwayhash/highwayhash/vector_test.cc @@ -1,59 +1,59 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
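One more illustration before the test files, prompted by the note in vector256.h that broadcasting a shift count costs about 4 cycles and that sllv_epi64 and friends are preferable: a hypothetical per-lane rotate built on the AVX2 variable-shift intrinsics, again not part of the imported sources.

// Hypothetical helper, not part of the imported sources; assumes AVX2.
// Each 64-bit lane is rotated left by its own count. SIMD shift counts
// >= 64 yield zero lanes, so a count of 0 degenerates to (v << 0) | 0 == v.
static HH_INLINE V4x64U RotateLeftVariable(const V4x64U& v,
                                           const V4x64U& counts) {
  const V4x64U left(_mm256_sllv_epi64(v, counts));
  const V4x64U right(_mm256_srlv_epi64(v, V4x64U(64) - counts));
  return left | right;
}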
- -#include <stdio.h> - -#ifdef HH_GOOGLETEST -#include "testing/base/public/gmock.h" -#include "testing/base/public/gunit.h" -#endif - -#include "highwayhash/instruction_sets.h" -#include "highwayhash/vector_test_target.h" - -namespace highwayhash { -namespace { - -void NotifyFailure(const char* target, const size_t size) { - const size_t lane_bits = (size & 0xFF) * 8; - const size_t lane_index = size >> 8; -#ifdef HH_GOOGLETEST - EXPECT_TRUE(false) << "VectorTest failed for " << target << " T=" << lane_bits - << ", lane " << lane_index; -#else - printf("VectorTest failed for %10s T=%zu, lane=%zu\n", target, lane_bits, - lane_index); -#endif -} - -void RunTests() { - const TargetBits tested = InstructionSets::RunAll<VectorTest>(&NotifyFailure); - HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) { - printf("%10s: done\n", TargetName(target)); - }); -} - -#ifdef HH_GOOGLETEST -TEST(VectorTest, Run) { RunTests(); } -#endif - -} // namespace -} // namespace highwayhash - -#ifndef HH_GOOGLETEST -int main(int argc, char* argv[]) { - highwayhash::RunTests(); - return 0; -} -#endif +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stdio.h> + +#ifdef HH_GOOGLETEST +#include "testing/base/public/gmock.h" +#include "testing/base/public/gunit.h" +#endif + +#include "highwayhash/instruction_sets.h" +#include "highwayhash/vector_test_target.h" + +namespace highwayhash { +namespace { + +void NotifyFailure(const char* target, const size_t size) { + const size_t lane_bits = (size & 0xFF) * 8; + const size_t lane_index = size >> 8; +#ifdef HH_GOOGLETEST + EXPECT_TRUE(false) << "VectorTest failed for " << target << " T=" << lane_bits + << ", lane " << lane_index; +#else + printf("VectorTest failed for %10s T=%zu, lane=%zu\n", target, lane_bits, + lane_index); +#endif +} + +void RunTests() { + const TargetBits tested = InstructionSets::RunAll<VectorTest>(&NotifyFailure); + HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) { + printf("%10s: done\n", TargetName(target)); + }); +} + +#ifdef HH_GOOGLETEST +TEST(VectorTest, Run) { RunTests(); } +#endif + +} // namespace +} // namespace highwayhash + +#ifndef HH_GOOGLETEST +int main(int argc, char* argv[]) { + highwayhash::RunTests(); + return 0; +} +#endif diff --git a/contrib/libs/highwayhash/highwayhash/vector_test_avx2.cc b/contrib/libs/highwayhash/highwayhash/vector_test_avx2.cc index 30ce2c992c..86a017f7f5 100644 --- a/contrib/libs/highwayhash/highwayhash/vector_test_avx2.cc +++ b/contrib/libs/highwayhash/highwayhash/vector_test_avx2.cc @@ -1,19 +1,19 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
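NotifyFailure above decodes the packed code that vector_test_target.cc (further below) builds as (i << 8) | sizeof(T): the low byte carries the lane width in bytes, the upper bits the lane index. A minimal decode with illustrative values:

#include <stdio.h>

int main() {
  const size_t packed = (3 << 8) | 8;  // e.g. lane 3 of a uint64_t vector
  printf("T=%zu bits, lane=%zu\n", (packed & 0xFF) * 8, packed >> 8);
  return 0;  // prints "T=64 bits, lane=3"
}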
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// WARNING: this is a "restricted" source file; avoid including any headers -// unless they are also restricted. See arch_specific.h for details. - -#define HH_TARGET_NAME AVX2 -#include "highwayhash/vector_test_target.cc" +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#define HH_TARGET_NAME AVX2 +#include "highwayhash/vector_test_target.cc" diff --git a/contrib/libs/highwayhash/highwayhash/vector_test_portable.cc b/contrib/libs/highwayhash/highwayhash/vector_test_portable.cc index a742b4be80..df23c28070 100644 --- a/contrib/libs/highwayhash/highwayhash/vector_test_portable.cc +++ b/contrib/libs/highwayhash/highwayhash/vector_test_portable.cc @@ -1,19 +1,19 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// WARNING: this is a "restricted" source file; avoid including any headers -// unless they are also restricted. See arch_specific.h for details. - -#define HH_TARGET_NAME Portable -#include "highwayhash/vector_test_target.cc" +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. 
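Each of these stubs follows the same two-line pattern, sketched below with a hypothetical target (MyTarget is illustrative only; this import provides AVX2, SSE4.1, and Portable): the stub is compiled with that target's instruction-set flags, defines HH_TARGET_NAME, and re-includes the shared vector_test_target.cc, so a single source file serves every target.

// vector_test_mytarget.cc -- hypothetical stub, not part of this library.
#define HH_TARGET_NAME MyTarget
#include "highwayhash/vector_test_target.cc"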
+ +#define HH_TARGET_NAME Portable +#include "highwayhash/vector_test_target.cc" diff --git a/contrib/libs/highwayhash/highwayhash/vector_test_sse41.cc b/contrib/libs/highwayhash/highwayhash/vector_test_sse41.cc index 80e11b5d9c..4d6fbee2b4 100644 --- a/contrib/libs/highwayhash/highwayhash/vector_test_sse41.cc +++ b/contrib/libs/highwayhash/highwayhash/vector_test_sse41.cc @@ -1,19 +1,19 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// WARNING: this is a "restricted" source file; avoid including any headers -// unless they are also restricted. See arch_specific.h for details. - -#define HH_TARGET_NAME SSE41 -#include "highwayhash/vector_test_target.cc" +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#define HH_TARGET_NAME SSE41 +#include "highwayhash/vector_test_target.cc" diff --git a/contrib/libs/highwayhash/highwayhash/vector_test_target.cc b/contrib/libs/highwayhash/highwayhash/vector_test_target.cc index f9eed7f59a..16d6ef1825 100644 --- a/contrib/libs/highwayhash/highwayhash/vector_test_target.cc +++ b/contrib/libs/highwayhash/highwayhash/vector_test_target.cc @@ -1,220 +1,220 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// WARNING: this is a "restricted" source file; avoid including any headers -// unless they are also restricted. See arch_specific.h for details. - -#include "highwayhash/vector_test_target.h" - -#include "highwayhash/arch_specific.h" - -#if HH_TARGET == HH_TARGET_AVX2 -#include "highwayhash/vector256.h" -#elif HH_TARGET == HH_TARGET_SSE41 -#include "highwayhash/vector128.h" -#elif HH_TARGET == HH_TARGET_Portable -#include "highwayhash/scalar.h" -#else -#error "Unknown target, add its include here." 
-#endif - -#ifndef HH_DISABLE_TARGET_SPECIFIC -namespace highwayhash { -namespace HH_TARGET_NAME { -namespace { - -#if HH_TARGET == HH_TARGET_AVX2 -template <typename T> -using V = V256<T>; -#elif HH_TARGET == HH_TARGET_SSE41 -template <typename T> -using V = V128<T>; -#elif HH_TARGET == HH_TARGET_Portable -template <typename T> -using V = Scalar<T>; -#else -#error "Unknown target, add its vector typedef here." -#endif - -template <class T> -void NotifyIfUnequal(const V<T>& v, const T expected, const HHNotify notify) { - T lanes[V<T>::N] HH_ALIGNAS(32); - Store(v, lanes); - for (size_t i = 0; i < V<T>::N; ++i) { - if (lanes[i] != expected) { - notify(TargetName(HH_TARGET), (i << 8) | sizeof(T)); - } - } -} - -template <class T> -void NotifyIfUnequal(const T& t, const T expected, const HHNotify notify) { - if (t != expected) { - notify(TargetName(HH_TARGET), sizeof(T)); - } -} - -// MaxValue<T>()() replaces std::numeric_limits<T>::max(). -template <typename T> -struct MaxValue; -template <> -struct MaxValue<uint8_t> { - constexpr uint8_t operator()() const { return 0xFFu; } -}; -template <> -struct MaxValue<uint16_t> { - constexpr uint16_t operator()() const { return 0xFFFFu; } -}; -template <> -struct MaxValue<uint32_t> { - constexpr uint32_t operator()() const { return 0xFFFFFFFFu; } -}; -template <> -struct MaxValue<uint64_t> { - constexpr uint64_t operator()() const { return 0xFFFFFFFFFFFFFFFFull; } -}; - -template <typename T> -void TestMembersAndBinaryOperatorsExceptShifts(const HHNotify notify) { - // uninitialized - V<T> v; - - // broadcast - const V<T> v2(2); - NotifyIfUnequal(v2, T(2), notify); - - // assign from V - const V<T> v3(3); - V<T> v3b; - v3b = v3; - NotifyIfUnequal(v3b, T(3), notify); - - // equal - const V<T> veq(v3 == v3b); - NotifyIfUnequal(veq, MaxValue<T>()(), notify); - - // Copying to, and constructing from intrinsic yields same result. - typename V<T>::Intrinsic nv2 = v2; - V<T> v2b(nv2); - NotifyIfUnequal(v2b, T(2), notify); - - // .. assignment also works. - V<T> v2c; - v2c = nv2; - NotifyIfUnequal(v2c, T(2), notify); - - const V<T> add = v2 + v3; - NotifyIfUnequal(add, T(5), notify); - - const V<T> sub = v3 - v2; - NotifyIfUnequal(sub, T(1), notify); - - const V<T> vand = v3 & v2; - NotifyIfUnequal(vand, T(2), notify); - - const V<T> vor = add | v2; - NotifyIfUnequal(vor, T(7), notify); - - const V<T> vxor = v3 ^ v2; - NotifyIfUnequal(vxor, T(1), notify); -} - -// SSE does not allow shifting uint8_t, so instantiate for all other types. 
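MaxValue<T>()() stands in for std::numeric_limits<T>::max(), presumably so this restricted file need not include <limits> (an inference; the code states only the replacement). Outside the restricted context the two agree, as this compile-time spot check sketches; the shift tests that the comment above introduces follow:

#include <cstdint>
#include <limits>

static_assert(std::numeric_limits<uint8_t>::max() == 0xFFu, "");
static_assert(std::numeric_limits<uint16_t>::max() == 0xFFFFu, "");
static_assert(std::numeric_limits<uint32_t>::max() == 0xFFFFFFFFu, "");
static_assert(std::numeric_limits<uint64_t>::max() == 0xFFFFFFFFFFFFFFFFull, "");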
-template <class T> -void TestShifts(const HHNotify notify) { - const V<T> v1(1); - // Shifting out of right side => zero - NotifyIfUnequal(v1 >> 1, T(0), notify); - - // Simple left shift - NotifyIfUnequal(v1 << 1, T(2), notify); - - // Sign bit - constexpr int kSign = (sizeof(T) * 8) - 1; - constexpr T max = MaxValue<T>()(); - constexpr T sign = ~(max >> 1); - NotifyIfUnequal(v1 << kSign, sign, notify); - - // Shifting out of left side => zero - NotifyIfUnequal(v1 << (kSign + 1), T(0), notify); -} - -template <class T> -void TestLoadStore(const HHNotify notify) { - const size_t n = V<T>::N; - T lanes[2 * n] HH_ALIGNAS(32); - for (size_t i = 0; i < n; ++i) { - lanes[i] = 4; - } - for (size_t i = n; i < 2 * n; ++i) { - lanes[i] = 5; - } - // Aligned load - const V<T> v4 = Load<V<T>>(lanes); - NotifyIfUnequal(v4, T(4), notify); - - // Aligned store - T lanes4[n] HH_ALIGNAS(32); - Store(v4, lanes4); - NotifyIfUnequal(Load<V<T>>(lanes4), T(4), notify); - - // Unaligned load - const V<T> vu = LoadUnaligned<V<T>>(lanes + 1); - Store(vu, lanes4); - NotifyIfUnequal(lanes4[n - 1], T(5), notify); - for (size_t i = 1; i < n - 1; ++i) { - NotifyIfUnequal(lanes4[i], T(4), notify); - } - - // Unaligned store - StoreUnaligned(v4, lanes + n / 2); - size_t i; - for (i = 0; i < 3 * n / 2; ++i) { - NotifyIfUnequal(lanes[i], T(4), notify); - } - // Subsequent values remain unchanged. - for (; i < 2 * n; ++i) { - NotifyIfUnequal(lanes[i], T(5), notify); - } -} - -void TestAll(const HHNotify notify) { - TestMembersAndBinaryOperatorsExceptShifts<uint8_t>(notify); - TestMembersAndBinaryOperatorsExceptShifts<uint16_t>(notify); - TestMembersAndBinaryOperatorsExceptShifts<uint32_t>(notify); - TestMembersAndBinaryOperatorsExceptShifts<uint64_t>(notify); - - TestShifts<uint16_t>(notify); - TestShifts<uint32_t>(notify); - TestShifts<uint64_t>(notify); - - TestLoadStore<uint8_t>(notify); - TestLoadStore<uint16_t>(notify); - TestLoadStore<uint32_t>(notify); - TestLoadStore<uint64_t>(notify); -} - -} // namespace -} // namespace HH_TARGET_NAME - -template <TargetBits Target> -void VectorTest<Target>::operator()(const HHNotify notify) const { - HH_TARGET_NAME::TestAll(notify); -} - -// Instantiate for the current target. -template struct VectorTest<HH_TARGET>; - -} // namespace highwayhash -#endif // HH_DISABLE_TARGET_SPECIFIC +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#include "highwayhash/vector_test_target.h" + +#include "highwayhash/arch_specific.h" + +#if HH_TARGET == HH_TARGET_AVX2 +#include "highwayhash/vector256.h" +#elif HH_TARGET == HH_TARGET_SSE41 +#include "highwayhash/vector128.h" +#elif HH_TARGET == HH_TARGET_Portable +#include "highwayhash/scalar.h" +#else +#error "Unknown target, add its include here." 
+#endif + +#ifndef HH_DISABLE_TARGET_SPECIFIC +namespace highwayhash { +namespace HH_TARGET_NAME { +namespace { + +#if HH_TARGET == HH_TARGET_AVX2 +template <typename T> +using V = V256<T>; +#elif HH_TARGET == HH_TARGET_SSE41 +template <typename T> +using V = V128<T>; +#elif HH_TARGET == HH_TARGET_Portable +template <typename T> +using V = Scalar<T>; +#else +#error "Unknown target, add its vector typedef here." +#endif + +template <class T> +void NotifyIfUnequal(const V<T>& v, const T expected, const HHNotify notify) { + T lanes[V<T>::N] HH_ALIGNAS(32); + Store(v, lanes); + for (size_t i = 0; i < V<T>::N; ++i) { + if (lanes[i] != expected) { + notify(TargetName(HH_TARGET), (i << 8) | sizeof(T)); + } + } +} + +template <class T> +void NotifyIfUnequal(const T& t, const T expected, const HHNotify notify) { + if (t != expected) { + notify(TargetName(HH_TARGET), sizeof(T)); + } +} + +// MaxValue<T>()() replaces std::numeric_limits<T>::max(). +template <typename T> +struct MaxValue; +template <> +struct MaxValue<uint8_t> { + constexpr uint8_t operator()() const { return 0xFFu; } +}; +template <> +struct MaxValue<uint16_t> { + constexpr uint16_t operator()() const { return 0xFFFFu; } +}; +template <> +struct MaxValue<uint32_t> { + constexpr uint32_t operator()() const { return 0xFFFFFFFFu; } +}; +template <> +struct MaxValue<uint64_t> { + constexpr uint64_t operator()() const { return 0xFFFFFFFFFFFFFFFFull; } +}; + +template <typename T> +void TestMembersAndBinaryOperatorsExceptShifts(const HHNotify notify) { + // uninitialized + V<T> v; + + // broadcast + const V<T> v2(2); + NotifyIfUnequal(v2, T(2), notify); + + // assign from V + const V<T> v3(3); + V<T> v3b; + v3b = v3; + NotifyIfUnequal(v3b, T(3), notify); + + // equal + const V<T> veq(v3 == v3b); + NotifyIfUnequal(veq, MaxValue<T>()(), notify); + + // Copying to, and constructing from intrinsic yields same result. + typename V<T>::Intrinsic nv2 = v2; + V<T> v2b(nv2); + NotifyIfUnequal(v2b, T(2), notify); + + // .. assignment also works. + V<T> v2c; + v2c = nv2; + NotifyIfUnequal(v2c, T(2), notify); + + const V<T> add = v2 + v3; + NotifyIfUnequal(add, T(5), notify); + + const V<T> sub = v3 - v2; + NotifyIfUnequal(sub, T(1), notify); + + const V<T> vand = v3 & v2; + NotifyIfUnequal(vand, T(2), notify); + + const V<T> vor = add | v2; + NotifyIfUnequal(vor, T(7), notify); + + const V<T> vxor = v3 ^ v2; + NotifyIfUnequal(vxor, T(1), notify); +} + +// SSE does not allow shifting uint8_t, so instantiate for all other types. 
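TestShifts, next, computes the sign-bit pattern as ~(max >> 1). Worked through for uint8_t: max = 0xFF, max >> 1 = 0x7F, and ~0x7F = 0x80, which is 1 << 7, the value the test expects from v1 << kSign. A compile-time check of that identity for all four lane types (a sketch; the casts undo integer promotion):

#include <cstdint>

template <typename T>
constexpr bool SignBitMatches() {
  return static_cast<T>(~(static_cast<T>(~T(0)) >> 1)) ==
         static_cast<T>(T(1) << (sizeof(T) * 8 - 1));
}

static_assert(SignBitMatches<uint8_t>(), "");
static_assert(SignBitMatches<uint16_t>(), "");
static_assert(SignBitMatches<uint32_t>(), "");
static_assert(SignBitMatches<uint64_t>(), "");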
+template <class T> +void TestShifts(const HHNotify notify) { + const V<T> v1(1); + // Shifting out of right side => zero + NotifyIfUnequal(v1 >> 1, T(0), notify); + + // Simple left shift + NotifyIfUnequal(v1 << 1, T(2), notify); + + // Sign bit + constexpr int kSign = (sizeof(T) * 8) - 1; + constexpr T max = MaxValue<T>()(); + constexpr T sign = ~(max >> 1); + NotifyIfUnequal(v1 << kSign, sign, notify); + + // Shifting out of left side => zero + NotifyIfUnequal(v1 << (kSign + 1), T(0), notify); +} + +template <class T> +void TestLoadStore(const HHNotify notify) { + const size_t n = V<T>::N; + T lanes[2 * n] HH_ALIGNAS(32); + for (size_t i = 0; i < n; ++i) { + lanes[i] = 4; + } + for (size_t i = n; i < 2 * n; ++i) { + lanes[i] = 5; + } + // Aligned load + const V<T> v4 = Load<V<T>>(lanes); + NotifyIfUnequal(v4, T(4), notify); + + // Aligned store + T lanes4[n] HH_ALIGNAS(32); + Store(v4, lanes4); + NotifyIfUnequal(Load<V<T>>(lanes4), T(4), notify); + + // Unaligned load + const V<T> vu = LoadUnaligned<V<T>>(lanes + 1); + Store(vu, lanes4); + NotifyIfUnequal(lanes4[n - 1], T(5), notify); + for (size_t i = 1; i < n - 1; ++i) { + NotifyIfUnequal(lanes4[i], T(4), notify); + } + + // Unaligned store + StoreUnaligned(v4, lanes + n / 2); + size_t i; + for (i = 0; i < 3 * n / 2; ++i) { + NotifyIfUnequal(lanes[i], T(4), notify); + } + // Subsequent values remain unchanged. + for (; i < 2 * n; ++i) { + NotifyIfUnequal(lanes[i], T(5), notify); + } +} + +void TestAll(const HHNotify notify) { + TestMembersAndBinaryOperatorsExceptShifts<uint8_t>(notify); + TestMembersAndBinaryOperatorsExceptShifts<uint16_t>(notify); + TestMembersAndBinaryOperatorsExceptShifts<uint32_t>(notify); + TestMembersAndBinaryOperatorsExceptShifts<uint64_t>(notify); + + TestShifts<uint16_t>(notify); + TestShifts<uint32_t>(notify); + TestShifts<uint64_t>(notify); + + TestLoadStore<uint8_t>(notify); + TestLoadStore<uint16_t>(notify); + TestLoadStore<uint32_t>(notify); + TestLoadStore<uint64_t>(notify); +} + +} // namespace +} // namespace HH_TARGET_NAME + +template <TargetBits Target> +void VectorTest<Target>::operator()(const HHNotify notify) const { + HH_TARGET_NAME::TestAll(notify); +} + +// Instantiate for the current target. +template struct VectorTest<HH_TARGET>; + +} // namespace highwayhash +#endif // HH_DISABLE_TARGET_SPECIFIC diff --git a/contrib/libs/highwayhash/highwayhash/vector_test_target.h b/contrib/libs/highwayhash/highwayhash/vector_test_target.h index f1ff6382dc..c26f876912 100644 --- a/contrib/libs/highwayhash/highwayhash/vector_test_target.h +++ b/contrib/libs/highwayhash/highwayhash/vector_test_target.h @@ -1,37 +1,37 @@ -// Copyright 2017 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAYHASH_VECTOR_TEST_TARGET_H_ -#define HIGHWAYHASH_VECTOR_TEST_TARGET_H_ - -// WARNING: this is a "restricted" header because it is included from -// translation units compiled with different flags. 
This header and its -// dependencies must not define any function unless it is static inline and/or -// within namespace HH_TARGET_NAME. See arch_specific.h for details. - -#include "highwayhash/arch_specific.h" -#include "highwayhash/hh_types.h" - -namespace highwayhash { - -// Usage: InstructionSets::RunAll<VectorTest>(). Calls "notify" for each test -// failure. -template <TargetBits Target> -struct VectorTest { - void operator()(const HHNotify notify) const; -}; - -} // namespace highwayhash - -#endif // HIGHWAYHASH_VECTOR_TEST_TARGET_H_ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_VECTOR_TEST_TARGET_H_ +#define HIGHWAYHASH_VECTOR_TEST_TARGET_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include "highwayhash/arch_specific.h" +#include "highwayhash/hh_types.h" + +namespace highwayhash { + +// Usage: InstructionSets::RunAll<VectorTest>(). Calls "notify" for each test +// failure. +template <TargetBits Target> +struct VectorTest { + void operator()(const HHNotify notify) const; +}; + +} // namespace highwayhash + +#endif // HIGHWAYHASH_VECTOR_TEST_TARGET_H_ diff --git a/contrib/libs/highwayhash/ya.make b/contrib/libs/highwayhash/ya.make index 4f6dad6193..aec086bdcb 100644 --- a/contrib/libs/highwayhash/ya.make +++ b/contrib/libs/highwayhash/ya.make @@ -1,46 +1,46 @@ -LIBRARY() - -LICENSE(Apache-2.0) - +LIBRARY() + +LICENSE(Apache-2.0) + LICENSE_TEXTS(.yandex_meta/licenses.list.txt) VERSION(2017-05-08-2b666ae078292b01024453d01480f3b362a2a012) OWNER(somov) - -NO_COMPILER_WARNINGS() - + +NO_COMPILER_WARNINGS() + ADDINCL(GLOBAL contrib/libs/highwayhash) - -SRCDIR(contrib/libs/highwayhash/highwayhash) - -SRCS( - # Dispatcher - arch_specific.cc - instruction_sets.cc - nanobenchmark.cc - os_specific.cc - # SipHash - sip_hash.cc - scalar_sip_tree_hash.cc - # sip_tree_hash.cc with AVX2 if available - # HighwayHash - hh_portable.cc - # hh_avx2.cc with AVX2 - # hh_sse41.cc with SSE4.1 - # Library - c_bindings.cc -) - -IF (ARCH_X86_64) - PEERDIR( - contrib/libs/highwayhash/arch/avx2 - contrib/libs/highwayhash/arch/sse41 - ) -ELSE() - SRCS( - sip_tree_hash.cc - ) -ENDIF() - -END() + +SRCDIR(contrib/libs/highwayhash/highwayhash) + +SRCS( + # Dispatcher + arch_specific.cc + instruction_sets.cc + nanobenchmark.cc + os_specific.cc + # SipHash + sip_hash.cc + scalar_sip_tree_hash.cc + # sip_tree_hash.cc with AVX2 if available + # HighwayHash + hh_portable.cc + # hh_avx2.cc with AVX2 + # hh_sse41.cc with SSE4.1 + # Library + c_bindings.cc +) + +IF (ARCH_X86_64) + PEERDIR( + contrib/libs/highwayhash/arch/avx2 + contrib/libs/highwayhash/arch/sse41 + ) +ELSE() + SRCS( + sip_tree_hash.cc + ) +ENDIF() + +END() |
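Finally, a sketch of the "Usage" noted in vector_test_target.h: it mirrors the RunTests driver in vector_test.cc above (OnFailure and the output format are illustrative; TargetBits is assumed to fit in unsigned for printing):

#include <stddef.h>
#include <stdio.h>

#include "highwayhash/instruction_sets.h"
#include "highwayhash/vector_test_target.h"

// Called once per failing lane; "size" packs sizeof(T) in the low byte and
// the lane index above it, as in NotifyFailure above.
void OnFailure(const char* target, const size_t size) {
  printf("failed on %s: T=%zu bits, lane=%zu\n", target, (size & 0xFF) * 8,
         size >> 8);
}

int main() {
  // Runs VectorTest<Target> for every instruction set the CPU supports and
  // returns a bitmask of the targets that were exercised.
  const highwayhash::TargetBits tested =
      highwayhash::InstructionSets::RunAll<highwayhash::VectorTest>(&OnFailure);
  printf("targets run: 0x%x\n", static_cast<unsigned>(tested));
  return 0;
}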