path: root/contrib/libs/highwayhash
author     somov <somov@yandex-team.ru>                  2022-02-10 16:45:47 +0300
committer  Daniil Cherednik <dcherednik@yandex-team.ru>  2022-02-10 16:45:47 +0300
commit     a5950576e397b1909261050b8c7da16db58f10b1 (patch)
tree       7ba7677f6a4c3e19e2cefab34d16df2c8963b4d4 /contrib/libs/highwayhash
parent     81eddc8c0b55990194e112b02d127b87d54164a9 (diff)
download   ydb-a5950576e397b1909261050b8c7da16db58f10b1.tar.gz

Restoring authorship annotation for <somov@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/highwayhash')
-rw-r--r--  contrib/libs/highwayhash/IMPORT | 2
-rw-r--r--  contrib/libs/highwayhash/LICENSE | 406
-rw-r--r--  contrib/libs/highwayhash/README.md | 700
-rw-r--r--  contrib/libs/highwayhash/arch/avx2/ya.make | 34
-rw-r--r--  contrib/libs/highwayhash/arch/sse41/ya.make | 32
-rw-r--r--  contrib/libs/highwayhash/highwayhash/arch_specific.cc | 236
-rw-r--r--  contrib/libs/highwayhash/highwayhash/arch_specific.h | 306
-rw-r--r--  contrib/libs/highwayhash/highwayhash/benchmark.cc | 626
-rw-r--r--  contrib/libs/highwayhash/highwayhash/c_bindings.cc | 70
-rw-r--r--  contrib/libs/highwayhash/highwayhash/c_bindings.h | 110
-rw-r--r--  contrib/libs/highwayhash/highwayhash/compiler_specific.h | 180
-rw-r--r--  contrib/libs/highwayhash/highwayhash/data_parallel.h | 682
-rw-r--r--  contrib/libs/highwayhash/highwayhash/data_parallel_benchmark.cc | 302
-rw-r--r--  contrib/libs/highwayhash/highwayhash/data_parallel_test.cc | 350
-rw-r--r--  contrib/libs/highwayhash/highwayhash/endianess.h | 216
-rw-r--r--  contrib/libs/highwayhash/highwayhash/example.cc | 60
-rw-r--r--  contrib/libs/highwayhash/highwayhash/hh_avx2.cc | 38
-rw-r--r--  contrib/libs/highwayhash/highwayhash/hh_avx2.h | 766
-rw-r--r--  contrib/libs/highwayhash/highwayhash/hh_buffer.h | 206
-rw-r--r--  contrib/libs/highwayhash/highwayhash/hh_portable.cc | 38
-rw-r--r--  contrib/libs/highwayhash/highwayhash/hh_portable.h | 602
-rw-r--r--  contrib/libs/highwayhash/highwayhash/hh_sse41.cc | 38
-rw-r--r--  contrib/libs/highwayhash/highwayhash/hh_sse41.h | 660
-rw-r--r--  contrib/libs/highwayhash/highwayhash/hh_types.h | 100
-rw-r--r--  contrib/libs/highwayhash/highwayhash/highwayhash.h | 404
-rw-r--r--  contrib/libs/highwayhash/highwayhash/highwayhash_target.cc | 208
-rw-r--r--  contrib/libs/highwayhash/highwayhash/highwayhash_target.h | 182
-rw-r--r--  contrib/libs/highwayhash/highwayhash/highwayhash_test.cc | 776
-rw-r--r--  contrib/libs/highwayhash/highwayhash/highwayhash_test_avx2.cc | 38
-rw-r--r--  contrib/libs/highwayhash/highwayhash/highwayhash_test_portable.cc | 38
-rw-r--r--  contrib/libs/highwayhash/highwayhash/highwayhash_test_sse41.cc | 38
-rw-r--r--  contrib/libs/highwayhash/highwayhash/highwayhash_test_target.cc | 422
-rw-r--r--  contrib/libs/highwayhash/highwayhash/highwayhash_test_target.h | 178
-rw-r--r--  contrib/libs/highwayhash/highwayhash/iaca.h | 126
-rw-r--r--  contrib/libs/highwayhash/highwayhash/instruction_sets.cc | 282
-rw-r--r--  contrib/libs/highwayhash/highwayhash/instruction_sets.h | 176
-rw-r--r--  contrib/libs/highwayhash/highwayhash/load3.h | 288
-rw-r--r--  contrib/libs/highwayhash/highwayhash/nanobenchmark.cc | 874
-rw-r--r--  contrib/libs/highwayhash/highwayhash/nanobenchmark.h | 316
-rw-r--r--  contrib/libs/highwayhash/highwayhash/nanobenchmark_example.cc | 96
-rw-r--r--  contrib/libs/highwayhash/highwayhash/os_specific.cc | 478
-rw-r--r--  contrib/libs/highwayhash/highwayhash/os_specific.h | 108
-rw-r--r--  contrib/libs/highwayhash/highwayhash/profiler.h | 1508
-rw-r--r--  contrib/libs/highwayhash/highwayhash/profiler_example.cc | 194
-rw-r--r--  contrib/libs/highwayhash/highwayhash/robust_statistics.h | 270
-rw-r--r--  contrib/libs/highwayhash/highwayhash/scalar.h | 704
-rw-r--r--  contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.cc | 366
-rw-r--r--  contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.h | 74
-rw-r--r--  contrib/libs/highwayhash/highwayhash/sip_hash.cc | 66
-rw-r--r--  contrib/libs/highwayhash/highwayhash/sip_hash.h | 342
-rw-r--r--  contrib/libs/highwayhash/highwayhash/sip_hash_test.cc | 300
-rw-r--r--  contrib/libs/highwayhash/highwayhash/sip_tree_hash.cc | 454
-rw-r--r--  contrib/libs/highwayhash/highwayhash/sip_tree_hash.h | 104
-rw-r--r--  contrib/libs/highwayhash/highwayhash/state_helpers.h | 254
-rw-r--r--  contrib/libs/highwayhash/highwayhash/tsc_timer.h | 408
-rw-r--r--  contrib/libs/highwayhash/highwayhash/vector128.h | 1592
-rw-r--r--  contrib/libs/highwayhash/highwayhash/vector256.h | 1516
-rw-r--r--  contrib/libs/highwayhash/highwayhash/vector_test.cc | 118
-rw-r--r--  contrib/libs/highwayhash/highwayhash/vector_test_avx2.cc | 38
-rw-r--r--  contrib/libs/highwayhash/highwayhash/vector_test_portable.cc | 38
-rw-r--r--  contrib/libs/highwayhash/highwayhash/vector_test_sse41.cc | 38
-rw-r--r--  contrib/libs/highwayhash/highwayhash/vector_test_target.cc | 440
-rw-r--r--  contrib/libs/highwayhash/highwayhash/vector_test_target.h | 74
-rw-r--r--  contrib/libs/highwayhash/ya.make | 80
64 files changed, 10383 insertions(+), 10383 deletions(-)
diff --git a/contrib/libs/highwayhash/IMPORT b/contrib/libs/highwayhash/IMPORT
index c1b4084ec0..7b70d9300b 100644
--- a/contrib/libs/highwayhash/IMPORT
+++ b/contrib/libs/highwayhash/IMPORT
@@ -1 +1 @@
-Imported from https://github.com/google/highwayhash commit 2b666ae078292b01024453d01480f3b362a2a012 (master branch, 2017-05-08)
+Imported from https://github.com/google/highwayhash commit 2b666ae078292b01024453d01480f3b362a2a012 (master branch, 2017-05-08)
diff --git a/contrib/libs/highwayhash/LICENSE b/contrib/libs/highwayhash/LICENSE
index 6b0b1270ff..4d581db0a5 100644
--- a/contrib/libs/highwayhash/LICENSE
+++ b/contrib/libs/highwayhash/LICENSE
@@ -1,203 +1,203 @@
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
diff --git a/contrib/libs/highwayhash/README.md b/contrib/libs/highwayhash/README.md
index 2d311c66c1..4d120be2a2 100644
--- a/contrib/libs/highwayhash/README.md
+++ b/contrib/libs/highwayhash/README.md
@@ -1,350 +1,350 @@
-Strong (well-distributed and unpredictable) hashes:
-
-* Portable implementation of
- [SipHash](https://www.131002.net/siphash/siphash.pdf)
-* HighwayHash, a 5x faster SIMD hash with [security
- claims](https://arxiv.org/abs/1612.06257)
-
-## Quick Start
-
-To build on a Linux or Mac platform, simply run `make`. For Windows, we provide
-a Visual Studio 2015 project in the `msvc` subdirectory.
-
-Run `benchmark` for speed measurements. `sip_hash_test` and `highwayhash_test`
-ensure the implementations return known-good values for a given set of inputs.
-
-64-bit SipHash for any CPU:
-
- #include "highwayhash/sip_hash.h"
- using namespace highwayhash;
- const HH_U64 key2[2] HH_ALIGNAS(16) = {1234, 5678};
- char in[8] = {1};
- return SipHash(key2, in, 8);
-
-64, 128 or 256 bit HighwayHash for the CPU determined by compiler flags:
-
- #include "highwayhash/highwayhash.h"
- using namespace highwayhash;
- const HHKey key HH_ALIGNAS(32) = {1, 2, 3, 4};
- char in[8] = {1};
- HHResult64 result; // or HHResult128 or HHResult256
- HHStateT<HH_TARGET> state(key);
- HighwayHashT(&state, in, 8, &result);
-
-64, 128 or 256 bit HighwayHash for the CPU on which we're currently running:
-
- #include "highwayhash/highwayhash_target.h"
- #include "highwayhash/instruction_sets.h"
- using namespace highwayhash;
- const HHKey key HH_ALIGNAS(32) = {1, 2, 3, 4};
- char in[8] = {1};
- HHResult64 result; // or HHResult128 or HHResult256
- InstructionSets::Run<HighwayHash>(key, in, 8, &result);
-
-C-callable 64-bit HighwayHash for the CPU on which we're currently running:
-
- #include "highwayhash/c_bindings.h"
- const uint64_t key[4] = {1, 2, 3, 4};
- char in[8] = {1};
- return HighwayHash64(key, in, 8);
-
-## Introduction
-
-Hash functions are widely used, so it is desirable to increase their speed and
-security. This package provides two 'strong' (well-distributed and
-unpredictable) hash functions: a faster version of SipHash, and an even faster
-algorithm we call HighwayHash.
-
-SipHash is a fast but 'cryptographically strong' pseudo-random function by
-Aumasson and Bernstein [https://www.131002.net/siphash/siphash.pdf].
-
-HighwayHash is a new way of mixing inputs which may inspire new
-cryptographically strong hashes. Large inputs are processed at a rate of 0.24
-cycles per byte, and latency remains low even for small inputs. HighwayHash is
-faster than SipHash for all input sizes, with 5 times higher throughput at 1
-KiB. We discuss design choices and provide statistical analysis and preliminary
-cryptanalysis in https://arxiv.org/abs/1612.06257.
-
-## Applications
-
-Unlike prior strong hashes, these functions are fast enough to be recommended
-as safer replacements for weak hashes in many applications. The additional CPU
-cost appears affordable, based on profiling data indicating C++ hash functions
-account for less than 0.25% of CPU usage.
-
-Hash-based selection of random subsets is useful for A/B experiments and similar
-applications. Such random generators are idempotent (repeatable and
-deterministic), which is helpful for parallel algorithms and testing. To avoid
-bias, it is important that the hash function be unpredictable and
-indistinguishable from a uniform random generator. We have verified the bit
-distribution and avalanche properties of SipHash and HighwayHash.
-
-64-bit hashes are also useful for authenticating short-lived messages such as
-network/RPC packets. This requires that the hash function withstand
-differential, length extension and other attacks. We have published a formal
-security analysis for HighwayHash. New cryptanalysis tools may still need to be
-developed for further analysis.
-
-Strong hashes are also important parts of methods for protecting hash tables
-against unacceptable worst-case behavior and denial of service attacks
-(see "hash flooding" below).
-
-## SipHash
-
-Our SipHash implementation is a fast and portable drop-in replacement for
-the reference C code. Outputs are identical for the given test cases (messages
-between 0 and 63 bytes).
-
-Interestingly, it is about twice as fast as a SIMD implementation using SSE4.1
-(https://goo.gl/80GBSD). This is presumably due to the lack of SIMD bit rotate
-instructions.
-
-SipHash13 is a faster but weaker variant with one mixing round per update and
-three during finalization.
-
-We also provide a data-parallel 'tree hash' variant that enables efficient SIMD
-while retaining safety guarantees. This is about twice as fast as SipHash, but
-does not return the same results.
-
-## HighwayHash
-
-We have devised a new way of mixing inputs with AVX2 multiply and permute
-instructions. The multiplications are 32x32 -> 64 bits and therefore infeasible
-to reverse. Permuting equalizes the distribution of the resulting bytes.
-
-The internal state occupies four 256-bit AVX2 registers. Due to limitations of
-the instruction set, the registers are partitioned into two 512-bit halves that
-remain independent until the reduce phase. The algorithm outputs 64 bit digests
-or up to 256 bits at no extra cost.
-
-In addition to high throughput, the algorithm is designed for low finalization
-cost. The result is more than twice as fast as SipTreeHash.
-
-For older CPUs, we also provide an SSE4.1 version (80% as fast for large inputs
-and 95% as fast for short inputs) and a portable version (10% as fast).
-
-Statistical analyses and preliminary cryptanalysis are given in
-https://arxiv.org/abs/1612.06257.
-
-## Versioning and stability
-
-SipHash and HighwayHash 1.0 are 'fingerprint functions' whose input -> hash
-mapping will not change. This is important for applications that write hashes to
-persistent storage.
-
-HighwayHash has not yet reached 1.0 and may still change in the near future. We
-will announce when it is frozen.
-
-## Speed measurements
-
-To measure the CPU cost of a hash function, we can either create an artificial
-'microbenchmark' (easier to control, but probably not representative of the
-actual runtime), or insert instrumentation directly into an application (risks
-influencing the results through observer overhead). We provide novel variants of
-both approaches that mitigate their respective disadvantages.
-
-profiler.h uses software write-combining to stream program traces to memory
-with minimal overhead. These can be analyzed offline, or when memory is full,
-to learn how much time was spent in each (possibly nested) zone.
-
-nanobenchmark.h enables cycle-accurate measurements of very short functions.
-It uses CPU fences and robust statistics to minimize variability, and also
-avoids unrealistic branch prediction effects.
-
-We compile the C++ implementations with a patched GCC 4.9 and run on a single
-core of a Xeon E5-2690 v3 clocked at 2.6 GHz. CPU cost is measured as cycles per
-byte for various input sizes:
-
-Algorithm | 8 | 31 | 32 | 63 | 64 | 1024
----------------- | ----- | ---- | ---- | ---- | ---- | ----
-HighwayHashAVX2 | 7.34 | 1.81 | 1.71 | 1.04 | 0.95 | 0.24
-HighwayHashSSE41 | 8.00 | 2.11 | 1.75 | 1.13 | 0.96 | 0.30
-SipTreeHash | 16.51 | 4.57 | 4.09 | 2.22 | 2.29 | 0.57
-SipTreeHash13 | 12.33 | 3.47 | 3.06 | 1.68 | 1.63 | 0.33
-SipHash | 8.13 | 2.58 | 2.73 | 1.87 | 1.93 | 1.26
-SipHash13 | 6.96 | 2.09 | 2.12 | 1.32 | 1.33 | 0.68
-
-SipTreeHash is slower than SipHash for small inputs because it processes blocks
-of 32 bytes. AVX2 and SSE4.1 HighwayHash are faster than SipHash for all input
-sizes due to their highly optimized handling of partial vectors.
-
-Note that previous measurements included the initialization of their input,
-which dramatically increased timings especially for small inputs.
-
-## CPU requirements
-
-SipTreeHash[13] requires an AVX2-capable CPU (e.g. Haswell). HighwayHash
-includes a dispatcher that chooses the best available (AVX2, SSE4.1 or portable)
-implementation at runtime, as well as a directly callable function template that
-can only run on the CPU for which it was built. SipHash[13] and
-ScalarSipTreeHash[13] have no particular CPU requirements.
-
-Our implementations use custom AVX2 vector classes with overloaded operators
-(e.g. `const V4x64U a = b + c`) for type-safety and improved readability vs.
-compiler intrinsics (e.g. `const __m256i a = _mm256_add_epi64(b, c)`).
-
-We intend to port HighwayHash to other SIMD-capable platforms, especially ARM.
-
-Our instruction_sets dispatcher avoids running newer instructions on older CPUs
-that do not support them. However, intrinsics, and therefore also any vector
-classes that use them, require a compiler flag that also enables the compiler to
-generate code for that CPU. This means the intrinsics must be placed in separate
-translation units that are compiled with the required flags. It is important
-that these source files and their headers not define any inline functions,
-because that might break the one definition rule and cause crashes.
-
-To minimize dispatch overhead when hashes are computed often (e.g. in a loop),
-we can inline the hash function into its caller using templates. The dispatch
-overhead will only be paid once (e.g. before the loop). The template mechanism
-also avoids duplicating code in each CPU-specific implementation.
-
-## Defending against hash flooding
-
-To mitigate hash flooding attacks, we need to take both the hash function and
-the data structure into account.
-
-We wish to defend (web) services that utilize hash sets/maps against
-denial-of-service attacks. Such data structures assign attacker-controlled
-input messages `m` to a hash table bin `b` by computing the hash `H(s, m)`
-using a hash function `H` seeded by `s`, and mapping it to a bin with some
-narrowing function `b = R(h)`, discussed below.
-
-Attackers may attempt to trigger 'flooding' (excessive work in insertions or
-lookups) by finding multiple `m` that map to the same bin. If the attacker has
-local access, they can do far worse, so we assume the attacker can only issue
-remote requests. If the attacker is able to send large numbers of requests,
-they can already deny service, so we need only ensure the attacker's cost is
-sufficiently large compared to the service's provisioning.
-
-If the hash function is 'weak', attackers can easily generate 'hash collisions'
-(inputs mapping to the same hash values) that are independent of the seed. In
-other words, certain input messages will cause collisions regardless of the seed
-value. The author of SipHash has published C++ programs to generate such
-'universal (key-independent) multicollisions' for CityHash and Murmur. Similar
-'differential' attacks are likely possible for any hash function consisting only
-of reversible operations (e.g. addition/multiplication/rotation) with a constant
-operand. `n` requests with such inputs cause `n^2` work for an unprotected hash
-table, which is unacceptable.
-
-By contrast, 'strong' hashes such as SipHash or HighwayHash require infeasible
-attacker effort to find a hash collision (an expected 2^32 guesses of `m` per
-the birthday paradox) or recover the seed (2^63 requests). These security claims
-assume the seed is secret. It is reasonable to suppose `s` is initially unknown
-to attackers, e.g. generated on startup or even per-connection. A timing attack
-by Wool/Bar-Yosef recovers 13-bit seeds by testing all 8K possibilities using
-millions of requests, which takes several days (even assuming unrealistic 150 us
-round-trip times). It appears infeasible to recover 64-bit seeds in this way.
-
-However, attackers are only looking for multiple `m` mapping to the same bin
-rather than identical hash values. We assume they know or are able to discover
-the hash table size `p`. It is common to choose `p = 2^i` to enable an efficient
-`R(h) := h & (p - 1)`, which simply retains the lower hash bits. It may be
-easier for attackers to compute partial collisions where only the lower `i` bits
-match. This can be prevented by choosing a prime `p` so that `R(h) := h % p`
-incorporates all hash bits. The costly modulo operation can be avoided by
-multiplying with the inverse (https://goo.gl/l7ASm8). An interesting alternative
-suggested by Kyoung Jae Seo chooses a random subset of the `h` bits. Such an `R`
-function can be computed in just 3 cycles using PEXT from the BMI2 instruction
-set. This is expected to defend against SAT-solver attacks on the hash bits at a
-slightly lower cost than the multiplicative inverse method, and still allows
-power-of-two table sizes.
-
-Summary thus far: given a strong hash function and secret seed, it appears
-infeasible for attackers to generate hash collisions because `s` and/or `R` are
-unknown. However, they can still observe the timings of data structure
-operations for various `m`. With typical table sizes of 2^10 to 2^17 entries,
-attackers can detect some 'bin collisions' (inputs mapping to the same bin).
-Although this will be costly for the attacker, they can then send many instances
-of such inputs, so we need to limit the resulting work for our data structure.
-
-Hash tables with separate chaining typically store bin entries in a linked list,
-so worst-case inputs lead to unacceptable linear-time lookup cost. We instead
-seek optimal asymptotic worst-case complexity for each operation (insertion,
-deletion and lookups), which is a constant factor times the logarithm of the
-data structure size. This naturally leads to a tree-like data structure for each
-bin. The Java8 HashMap only replaces its linked list with trees when needed.
-This leads to additional cost and complexity for deciding whether a bin is a
-list or tree.
-
-Our first proposal (suggested by Github user funny-falcon) avoids this overhead
-by always storing one tree per bin. It may also be worthwhile to store the first
-entry directly in the bin, which avoids allocating any tree nodes in the common
-case where bins are sparsely populated. What kind of tree should be used?
-Scapegoat and splay trees only offer amortized complexity guarantees, whereas
-treaps require an entropy source and have higher constant factors in practice.
-Self-balancing structures such as 2-3 or red-black trees require additional
-bookkeeping information. We can hope to reduce rebalancing cost by realizing
-that the output bits of strong `H` functions are uniformly distributed. When
-using them as keys instead of the original message `m`, recent relaxed balancing
-schemes such as left-leaning red-black or weak AVL trees may require fewer tree
-rotations to maintain their invariants. Note that `H` already determines the
-bin, so we should only use the remaining bits. 64-bit hashes are likely
-sufficient for this purpose, and HighwayHash generates up to 256 bits. It seems
-unlikely that attackers can craft inputs resulting in worst cases for both the
-bin index and tree key without being able to generate hash collisions, which
-would contradict the security claims of strong hashes. Even if they succeed, the
-relaxed tree balancing still guarantees an upper bound on height and therefore
-the worst-case operation cost. For the AVL variant, the constant factors are
-slightly lower than for red-black trees.
-
-The second proposed approach uses augmented/de-amortized cuckoo hash tables
-(https://goo.gl/PFwwkx). These guarantee worst-case `log n` bounds for all
-operations, but only if the hash function is 'indistinguishable from random'
-(uniformly distributed regardless of the input distribution), which is claimed
-for SipHash and HighwayHash but certainly not for weak hashes.
-
-Both alternatives retain good average case performance and defend against
-flooding by limiting the amount of extra work an attacker can cause. The first
-approach guarantees an upper bound of `log n` additional work even if the hash
-function is compromised.
-
-In summary, a strong hash function is not, by itself, sufficient to protect a
-chained hash table from flooding attacks. However, strong hash functions are
-important parts of two schemes for preventing denial of service. Using weak hash
-functions can slightly accelerate the best-case and average-case performance of
-a service, but at the risk of greatly reduced attack costs and worst-case
-performance.
-
-## Third-party implementations / bindings
-
-Thanks to Damian Gryski for making us aware of these third-party
-implementations or bindings. Please feel free to get in touch or
-raise an issue and we'll add yours as well.
-
-By | Language | URL
---- | --- | ---
-Damian Gryski | Go and SSE | https://github.com/dgryski/go-highway/
-Lovell Fuller | node.js bindings | https://github.com/lovell/highwayhash
-Vinzent Steinberg | Rust bindings | https://github.com/vks/highwayhash-rs
-
-## Modules
-
-### Hashes
-
-* c_bindings.h declares C-callable versions of SipHash/HighwayHash.
-* sip_hash.cc is the compatible implementation of SipHash, and also provides
- the final reduction for sip_tree_hash.
-* sip_tree_hash.cc is the faster but incompatible SIMD j-lanes tree hash.
-* scalar_sip_tree_hash.cc is a non-SIMD version.
-* state_helpers.h simplifies the implementation of the SipHash variants.
-* highwayhash.h is our new, fast hash function.
-* hh_avx2.h, hh_sse41.h and hh_portable.h are its various implementations.
-* highwayhash_target.h chooses the best available implementation at runtime.
-
-### Infrastructure
-
-* arch_specific.h offers byte swapping and CPUID detection.
-* compiler_specific.h defines some compiler-dependent language extensions.
-* data_parallel.h provides a C++11 ThreadPool and PerThread (similar to
- OpenMP).
-* instruction_sets.h and targets.h enable efficient CPU-specific dispatching.
-* nanobenchmark.h measures elapsed times with < 1 cycle variability.
-* os_specific.h sets thread affinity and priority for benchmarking.
-* profiler.h is a low-overhead, deterministic hierarchical profiler.
-* tsc_timer.h obtains high-resolution timestamps without CPU reordering.
-* vector256.h and vector128.h contain wrapper classes for AVX2 and SSE4.1.
-
-By Jan Wassenberg <jan.wassenberg@gmail.com> and Jyrki Alakuijala
-<jyrki.alakuijala@gmail.com>, updated 2017-02-07
-
-This is not an official Google product.
+Strong (well-distributed and unpredictable) hashes:
+
+* Portable implementation of
+ [SipHash](https://www.131002.net/siphash/siphash.pdf)
+* HighwayHash, a 5x faster SIMD hash with [security
+ claims](https://arxiv.org/abs/1612.06257)
+
+## Quick Start
+
+To build on a Linux or Mac platform, simply run `make`. For Windows, we provide
+a Visual Studio 2015 project in the `msvc` subdirectory.
+
+Run `benchmark` for speed measurements. `sip_hash_test` and `highwayhash_test`
+ensure the implementations return known-good values for a given set of inputs.
+
+64-bit SipHash for any CPU:
+
+ #include "highwayhash/sip_hash.h"
+ using namespace highwayhash;
+ const HH_U64 key2[2] HH_ALIGNAS(16) = {1234, 5678};
+ char in[8] = {1};
+ return SipHash(key2, in, 8);
+
+64, 128 or 256 bit HighwayHash for the CPU determined by compiler flags:
+
+ #include "highwayhash/highwayhash.h"
+ using namespace highwayhash;
+ const HHKey key HH_ALIGNAS(32) = {1, 2, 3, 4};
+ char in[8] = {1};
+ HHResult64 result; // or HHResult128 or HHResult256
+ HHStateT<HH_TARGET> state(key);
+ HighwayHashT(&state, in, 8, &result);
+
+64, 128 or 256 bit HighwayHash for the CPU on which we're currently running:
+
+ #include "highwayhash/highwayhash_target.h"
+ #include "highwayhash/instruction_sets.h"
+ using namespace highwayhash;
+ const HHKey key HH_ALIGNAS(32) = {1, 2, 3, 4};
+ char in[8] = {1};
+ HHResult64 result; // or HHResult128 or HHResult256
+ InstructionSets::Run<HighwayHash>(key, in, 8, &result);
+
+C-callable 64-bit HighwayHash for the CPU on which we're currently running:
+
+ #include "highwayhash/c_bindings.h"
+ const uint64_t key[4] = {1, 2, 3, 4};
+ char in[8] = {1};
+ return HighwayHash64(key, in, 8);
+
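+The snippets above are function bodies; for reference, a complete program
+built on the C-callable binding might look like this (a sketch, assuming only
+the calls shown above):
+
+    #include <cstdint>
+    #include <cstdio>
+    #include "highwayhash/c_bindings.h"
+
+    int main() {
+      // Demo constants; in production the key should be a secret,
+      // e.g. generated at startup (see "hash flooding" below).
+      const uint64_t key[4] = {1, 2, 3, 4};
+      const char in[8] = {1};  // 8-byte message
+      const uint64_t hash = HighwayHash64(key, in, sizeof(in));
+      std::printf("%016llx\n", static_cast<unsigned long long>(hash));
+      return 0;
+    }
+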
+## Introduction
+
+Hash functions are widely used, so it is desirable to increase their speed and
+security. This package provides two 'strong' (well-distributed and
+unpredictable) hash functions: a faster version of SipHash, and an even faster
+algorithm we call HighwayHash.
+
+SipHash is a fast but 'cryptographically strong' pseudo-random function by
+Aumasson and Bernstein [https://www.131002.net/siphash/siphash.pdf].
+
+HighwayHash is a new way of mixing inputs which may inspire new
+cryptographically strong hashes. Large inputs are processed at a rate of 0.24
+cycles per byte, and latency remains low even for small inputs. HighwayHash is
+faster than SipHash for all input sizes, with 5 times higher throughput at 1
+KiB. We discuss design choices and provide statistical analysis and preliminary
+cryptanalysis in https://arxiv.org/abs/1612.06257.
+
+## Applications
+
+Unlike prior strong hashes, these functions are fast enough to be recommended
+as safer replacements for weak hashes in many applications. The additional CPU
+cost appears affordable, based on profiling data indicating C++ hash functions
+account for less than 0.25% of CPU usage.
+
+Hash-based selection of random subsets is useful for A/B experiments and similar
+applications. Such random generators are idempotent (repeatable and
+deterministic), which is helpful for parallel algorithms and testing. To avoid
+bias, it is important that the hash function be unpredictable and
+indistinguishable from a uniform random generator. We have verified the bit
+distribution and avalanche properties of SipHash and HighwayHash.
+
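+For example, a deterministic 10% experiment arm can be selected by hashing a
+stable user identifier (a sketch using the SipHash call from the Quick Start;
+`InExperimentArm` and the seed constants are illustrative, not part of the
+library):
+
+    #include "highwayhash/sip_hash.h"
+    using namespace highwayhash;
+
+    // Same user id -> same arm, with no per-user state to store.
+    bool InExperimentArm(const char* user_id, uint64_t len) {
+      const HH_U64 seed[2] HH_ALIGNAS(16) = {0x243F6A8885A308D3ull, 42};
+      return SipHash(seed, user_id, len) % 100 < 10;  // ~10% of users
+    }
+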
+64-bit hashes are also useful for authenticating short-lived messages such as
+network/RPC packets. This requires that the hash function withstand
+differential, length extension and other attacks. We have published a formal
+security analysis for HighwayHash. New cryptanalysis tools may still need to be
+developed for further analysis.
+
+Strong hashes are also important parts of methods for protecting hash tables
+against unacceptable worst-case behavior and denial of service attacks
+(see "hash flooding" below).
+
+## SipHash
+
+Our SipHash implementation is a fast and portable drop-in replacement for
+the reference C code. Outputs are identical for the given test cases (messages
+between 0 and 63 bytes).
+
+Interestingly, it is about twice as fast as a SIMD implementation using SSE4.1
+(https://goo.gl/80GBSD). This is presumably due to the lack of SIMD bit rotate
+instructions.
+
+SipHash13 is a faster but weaker variant with one mixing round per update and
+three during finalization.
+
+We also provide a data-parallel 'tree hash' variant that enables efficient SIMD
+while retaining safety guarantees. This is about twice as fast as SipHash, but
+does not return the same results.
+
+## HighwayHash
+
+We have devised a new way of mixing inputs with AVX2 multiply and permute
+instructions. The multiplications are 32x32 -> 64 bits and therefore infeasible
+to reverse. Permuting equalizes the distribution of the resulting bytes.
+
+The internal state occupies four 256-bit AVX2 registers. Due to limitations of
+the instruction set, the registers are partitioned into two 512-bit halves that
+remain independent until the reduce phase. The algorithm outputs 64-bit digests
+or up to 256 bits at no extra cost.
+
+In addition to high throughput, the algorithm is designed for low finalization
+cost. The result is more than twice as fast as SipTreeHash.
+
+For older CPUs, we also provide an SSE4.1 version (80% as fast for large inputs
+and 95% as fast for short inputs) and a portable version (10% as fast).
+
+Statistical analyses and preliminary cryptanalysis are given in
+https://arxiv.org/abs/1612.06257.
+
+## Versioning and stability
+
+SipHash and HighwayHash 1.0 are 'fingerprint functions' whose input -> hash
+mapping will not change. This is important for applications that write hashes to
+persistent storage.
+
+HighwayHash has not yet reached 1.0 and may still change in the near future. We
+will announce when it is frozen.
+
+## Speed measurements
+
+To measure the CPU cost of a hash function, we can either create an artificial
+'microbenchmark' (easier to control, but probably not representative of the
+actual runtime), or insert instrumentation directly into an application (risks
+influencing the results through observer overhead). We provide novel variants of
+both approaches that mitigate their respective disadvantages.
+
+profiler.h uses software write-combining to stream program traces to memory
+with minimal overhead. These can be analyzed offline, or when memory is full,
+to learn how much time was spent in each (possibly nested) zone.
+
+nanobenchmark.h enables cycle-accurate measurements of very short functions.
+It uses CPU fences and robust statistics to minimize variability, and also
+avoids unrealistic branch prediction effects.
+
+We compile the C++ implementations with a patched GCC 4.9 and run on a single
+core of a Xeon E5-2690 v3 clocked at 2.6 GHz. CPU cost is measured as cycles per
+byte for various input sizes:
+
+Algorithm | 8 | 31 | 32 | 63 | 64 | 1024
+---------------- | ----- | ---- | ---- | ---- | ---- | ----
+HighwayHashAVX2 | 7.34 | 1.81 | 1.71 | 1.04 | 0.95 | 0.24
+HighwayHashSSE41 | 8.00 | 2.11 | 1.75 | 1.13 | 0.96 | 0.30
+SipTreeHash | 16.51 | 4.57 | 4.09 | 2.22 | 2.29 | 0.57
+SipTreeHash13 | 12.33 | 3.47 | 3.06 | 1.68 | 1.63 | 0.33
+SipHash | 8.13 | 2.58 | 2.73 | 1.87 | 1.93 | 1.26
+SipHash13 | 6.96 | 2.09 | 2.12 | 1.32 | 1.33 | 0.68
+
+SipTreeHash is slower than SipHash for small inputs because it processes blocks
+of 32 bytes. AVX2 and SSE4.1 HighwayHash are faster than SipHash for all input
+sizes due to their highly optimized handling of partial vectors.
+
+Note that previous measurements included the initialization of their input,
+which dramatically increased timings, especially for small inputs.
+
+## CPU requirements
+
+SipTreeHash[13] requires an AVX2-capable CPU (e.g. Haswell). HighwayHash
+includes a dispatcher that chooses the best available (AVX2, SSE4.1 or portable)
+implementation at runtime, as well as a directly callable function template that
+can only run on the CPU for which it was built. SipHash[13] and
+ScalarSipTreeHash[13] have no particular CPU requirements.
+
+Our implementations use custom AVX2 vector classes with overloaded operators
+(e.g. `const V4x64U a = b + c`) for type-safety and improved readability vs.
+compiler intrinsics (e.g. `const __m256i a = _mm256_add_epi64(b, c)`).
+
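+As a minimal re-sketch of that idea (the library's actual V4x64U supports far
+more operations than shown here):
+
+    #include <immintrin.h>  // AVX2 intrinsics; compile with -mavx2
+
+    class V4x64U {
+     public:
+      explicit V4x64U(__m256i v) : v_(v) {}
+      // Overloaded operator keeps call sites readable and type-safe.
+      friend V4x64U operator+(const V4x64U& a, const V4x64U& b) {
+        return V4x64U(_mm256_add_epi64(a.v_, b.v_));
+      }
+     private:
+      __m256i v_;  // four packed 64-bit lanes
+    };
+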
+We intend to port HighwayHash to other SIMD-capable platforms, especially ARM.
+
+Our instruction_sets dispatcher avoids running newer instructions on older CPUs
+that do not support them. However, intrinsics, and therefore also any vector
+classes that use them, require a compiler flag that also enables the compiler to
+generate code for that CPU. This means the intrinsics must be placed in separate
+translation units that are compiled with the required flags. It is important
+that these source files and their headers not define any inline functions,
+because that might break the one definition rule and cause crashes.
+
+To minimize dispatch overhead when hashes are computed often (e.g. in a loop),
+we can inline the hash function into its caller using templates. The dispatch
+overhead will only be paid once (e.g. before the loop). The template mechanism
+also avoids duplicating code in each CPU-specific implementation.
+
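+A sketch of that pattern, using only the calls from the Quick Start
+(`HashAll` is a hypothetical helper, not part of the library):
+
+    #include <cstdint>
+    #include "highwayhash/highwayhash.h"
+    using namespace highwayhash;
+
+    // Target is fixed at compile time, so HighwayHashT is inlined into the
+    // loop; a runtime dispatcher only has to pick an instantiation once,
+    // before the loop.
+    template <class Target>
+    void HashAll(const HHKey& key, const char* const* in,
+                 const uint64_t* sizes, uint64_t n, HHResult64* out) {
+      for (uint64_t i = 0; i < n; ++i) {
+        HHStateT<Target> state(key);  // re-seed per message
+        HighwayHashT(&state, in[i], sizes[i], &out[i]);
+      }
+    }
+
+    // E.g. HashAll<HH_TARGET>(...) for the compile-time target, or
+    // instantiate per target and select via instruction_sets.h.
+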
+## Defending against hash flooding
+
+To mitigate hash flooding attacks, we need to take both the hash function and
+the data structure into account.
+
+We wish to defend (web) services that utilize hash sets/maps against
+denial-of-service attacks. Such data structures assign attacker-controlled
+input messages `m` to a hash table bin `b` by computing the hash `H(s, m)`
+using a hash function `H` seeded by `s`, and mapping it to a bin with some
+narrowing function `b = R(h)`, discussed below.
+
+Attackers may attempt to trigger 'flooding' (excessive work in insertions or
+lookups) by finding multiple `m` that map to the same bin. If the attacker has
+local access, they can do far worse, so we assume the attacker can only issue
+remote requests. If the attacker is able to send large numbers of requests,
+they can already deny service, so we need only ensure the attacker's cost is
+sufficiently large compared to the service's provisioning.
+
+If the hash function is 'weak', attackers can easily generate 'hash collisions'
+(inputs mapping to the same hash values) that are independent of the seed. In
+other words, certain input messages will cause collisions regardless of the seed
+value. The author of SipHash has published C++ programs to generate such
+'universal (key-independent) multicollisions' for CityHash and Murmur. Similar
+'differential' attacks are likely possible for any hash function consisting only
+of reversible operations (e.g. addition/multiplication/rotation) with a constant
+operand. `n` requests with such inputs cause `n^2` work for an unprotected hash
+table, which is unacceptable.
+
+By contrast, 'strong' hashes such as SipHash or HighwayHash require infeasible
+attacker effort to find a hash collision (an expected 2^32 guesses of `m` per
+the birthday paradox) or recover the seed (2^63 requests). These security claims
+assume the seed is secret. It is reasonable to suppose `s` is initially unknown
+to attackers, e.g. generated on startup or even per-connection. A timing attack
+by Wool/Bar-Yosef recovers 13-bit seeds by testing all 8K possibilities using
+millions of requests, which takes several days (even assuming unrealistic 150 us
+round-trip times). It appears infeasible to recover 64-bit seeds in this way.
+
+However, attackers are only looking for multiple `m` mapping to the same bin
+rather than identical hash values. We assume they know or are able to discover
+the hash table size `p`. It is common to choose `p = 2^i` to enable an efficient
+`R(h) := h & (p - 1)`, which simply retains the lower hash bits. It may be
+easier for attackers to compute partial collisions where only the lower `i` bits
+match. This can be prevented by choosing a prime `p` so that `R(h) := h % p`
+incorporates all hash bits. The costly modulo operation can be avoided by
+multiplying with the inverse (https://goo.gl/l7ASm8). An interesting alternative
+suggested by Kyoung Jae Seo chooses a random subset of the `h` bits. Such an `R`
+function can be computed in just 3 cycles using PEXT from the BMI2 instruction
+set. This is expected to defend against SAT-solver attacks on the hash bits at a
+slightly lower cost than the multiplicative inverse method, and still allows
+power-of-two table sizes.
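+
+A sketch of the three narrowing functions discussed above, for a 64-bit hash
+`h`. Here `mask` is assumed to be a secret constant with `i` bits set at
+random positions, generated at startup; the PEXT variant requires compiling
+with BMI2 enabled (e.g. -mbmi2):
+
+```c++
+#include <immintrin.h>  // _pext_u64 (BMI2)
+#include <stdint.h>
+
+// Power-of-two table size p: keeps only the lower log2(p) hash bits.
+uint64_t BinMask(uint64_t h, uint64_t p_pow2) { return h & (p_pow2 - 1); }
+
+// Prime table size p: every hash bit influences the bin (see the link above
+// for replacing the costly % with a multiplication).
+uint64_t BinMod(uint64_t h, uint64_t p_prime) { return h % p_prime; }
+
+// Random-subset scheme: gathers the bits of h selected by the secret mask
+// into a bin index in [0, 2^i), where i is the number of set bits in mask.
+uint64_t BinPext(uint64_t h, uint64_t mask) { return _pext_u64(h, mask); }
+```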
+
+Summary thus far: given a strong hash function and secret seed, it appears
+infeasible for attackers to generate hash collisions because `s` and/or `R` are
+unknown. However, they can still observe the timings of data structure
+operations for various `m`. With typical table sizes of 2^10 to 2^17 entries,
+attackers can detect some 'bin collisions' (inputs mapping to the same bin).
+Although this will be costly for the attacker, they can then send many instances
+of such inputs, so we need to limit the resulting work for our data structure.
+
+Hash tables with separate chaining typically store bin entries in a linked list,
+so worst-case inputs lead to unacceptable linear-time lookup cost. We instead
+seek optimal asymptotic worst-case complexity for each operation (insertion,
+deletion, and lookup): a constant factor times the logarithm of the data
+structure size. This naturally leads to a tree-like data structure for each
+bin. The Java 8 HashMap replaces a bin's linked list with a tree only when
+needed, which adds cost and complexity for deciding whether a bin is a list or
+a tree.
+
+Our first proposal (suggested by GitHub user funny-falcon) avoids this overhead
+by always storing one tree per bin. It may also be worthwhile to store the first
+entry directly in the bin, which avoids allocating any tree nodes in the common
+case where bins are sparsely populated. What kind of tree should be used?
+Scapegoat and splay trees only offer amortized complexity guarantees, whereas
+treaps require an entropy source and have higher constant factors in practice.
+Self-balancing structures such as 2-3 or red-black trees require additional
+bookkeeping information. We can hope to reduce rebalancing cost by realizing
+that the output bits of strong `H` functions are uniformly distributed. When
+using them as keys instead of the original message `m`, recent relaxed balancing
+schemes such as left-leaning red-black or weak AVL trees may require fewer tree
+rotations to maintain their invariants. Note that `H` already determines the
+bin, so we should only use the remaining bits. 64-bit hashes are likely
+sufficient for this purpose, and HighwayHash generates up to 256 bits. It seems
+unlikely that attackers can craft inputs resulting in worst cases for both the
+bin index and tree key without being able to generate hash collisions, which
+would contradict the security claims of strong hashes. Even if they succeed, the
+relaxed tree balancing still guarantees an upper bound on height and therefore
+the worst-case operation cost. For the AVL variant, the constant factors are
+slightly lower than for red-black trees.
+
+The second proposed approach uses augmented/de-amortized cuckoo hash tables
+(https://goo.gl/PFwwkx). These guarantee worst-case `log n` bounds for all
+operations, but only if the hash function is 'indistinguishable from random'
+(uniformly distributed regardless of the input distribution), which is claimed
+for SipHash and HighwayHash but certainly not for weak hashes.
+
+Both alternatives retain good average case performance and defend against
+flooding by limiting the amount of extra work an attacker can cause. The first
+approach guarantees an upper bound of `log n` additional work even if the hash
+function is compromised.
+
+In summary, a strong hash function is not, by itself, sufficient to protect a
+chained hash table from flooding attacks. However, strong hash functions are
+important parts of two schemes for preventing denial of service. Using weak hash
+functions can slightly accelerate the best-case and average-case performance of
+a service, but at the risk of greatly reduced attack costs and worst-case
+performance.
+
+## Third-party implementations / bindings
+
+Thanks to Damian Gryski for making us aware of these third-party
+implementations or bindings. Please feel free to get in touch or
+raise an issue and we'll add yours as well.
+
+By | Language | URL
+--- | --- | ---
+Damian Gryski | Go and SSE | https://github.com/dgryski/go-highway/
+Lovell Fuller | node.js bindings | https://github.com/lovell/highwayhash
+Vinzent Steinberg | Rust bindings | https://github.com/vks/highwayhash-rs
+
+## Modules
+
+### Hashes
+
+* c_bindings.h declares C-callable versions of SipHash/HighwayHash; a usage
+  sketch follows this list.
+* sip_hash.cc is the compatible implementation of SipHash, and also provides
+ the final reduction for sip_tree_hash.
+* sip_tree_hash.cc is the faster but incompatible SIMD j-lanes tree hash.
+* scalar_sip_tree_hash.cc is a non-SIMD version.
+* state_helpers.h simplifies the implementation of the SipHash variants.
+* highwayhash.h is our new, fast hash function.
+* hh_avx2.h, hh_sse41.h and hh_portable.h are its various implementations.
+* highwayhash_target.h chooses the best available implementation at runtime.
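+
+A minimal usage sketch of the C-callable entry point declared in c_bindings.h
+(the key values are placeholders; a real service would generate the key
+randomly at startup):
+
+```c++
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "highwayhash/c_bindings.h"
+
+int main(void) {
+  const HHKey key = {1, 2, 3, 4};  // 256-bit secret key (hh_types.h)
+  const char* msg = "hello";
+  // Runtime dispatch picks the best implementation for the current CPU.
+  const uint64_t hash = HighwayHash64(key, msg, strlen(msg));
+  printf("%016llx\n", (unsigned long long)hash);
+  return 0;
+}
+```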
+
+### Infrastructure
+
+* arch_specific.h offers byte swapping and CPUID detection.
+* compiler_specific.h defines some compiler-dependent language extensions.
+* data_parallel.h provides a C++11 ThreadPool and PerThread (similar to
+  OpenMP); a usage sketch follows this list.
+* instruction_sets.h and targets.h enable efficient CPU-specific dispatching.
+* nanobenchmark.h measures elapsed times with < 1 cycle variability.
+* os_specific.h sets thread affinity and priority for benchmarking.
+* profiler.h is a low-overhead, deterministic hierarchical profiler.
+* tsc_timer.h obtains high-resolution timestamps without CPU reordering.
+* vector256.h and vector128.h contain wrapper classes for AVX2 and SSE4.1.
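+
+A minimal ThreadPool sketch, adapted from the usage comments in
+data_parallel.h (the atomic sum is an illustrative stand-in for real work):
+
+```c++
+#include <atomic>
+#include <cstdint>
+#include <cstdio>
+
+#include "highwayhash/data_parallel.h"
+
+int main() {
+  std::atomic<uint64_t> sum{0};
+  highwayhash::ThreadPool pool;  // defaults to one worker per hyperthread
+  // Load-balances i = 0..999 across workers via an atomic counter; when Run
+  // returns, all tasks have finished.
+  pool.Run(0, 1000, [&sum](const int i) { sum += i; });
+  printf("%llu\n", (unsigned long long)sum.load());
+  return 0;
+}
+```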
+
+By Jan Wassenberg <jan.wassenberg@gmail.com> and Jyrki Alakuijala
+<jyrki.alakuijala@gmail.com>, updated 2017-02-07
+
+This is not an official Google product.
diff --git a/contrib/libs/highwayhash/arch/avx2/ya.make b/contrib/libs/highwayhash/arch/avx2/ya.make
index 3084a352d8..df09ac249e 100644
--- a/contrib/libs/highwayhash/arch/avx2/ya.make
+++ b/contrib/libs/highwayhash/arch/avx2/ya.make
@@ -1,22 +1,22 @@
-LIBRARY()
-
+LIBRARY()
+
WITHOUT_LICENSE_TEXTS()
LICENSE(Apache-2.0)
OWNER(somov)
-
-ADDINCL(contrib/libs/highwayhash)
-
-SRCDIR(contrib/libs/highwayhash/highwayhash)
-
-CFLAGS(-mavx2)
-
-NO_COMPILER_WARNINGS()
-
-SRCS(
- sip_tree_hash.cc
- hh_avx2.cc
-)
-
-END()
+
+ADDINCL(contrib/libs/highwayhash)
+
+SRCDIR(contrib/libs/highwayhash/highwayhash)
+
+CFLAGS(-mavx2)
+
+NO_COMPILER_WARNINGS()
+
+SRCS(
+ sip_tree_hash.cc
+ hh_avx2.cc
+)
+
+END()
diff --git a/contrib/libs/highwayhash/arch/sse41/ya.make b/contrib/libs/highwayhash/arch/sse41/ya.make
index d94ad97038..e56731ef9a 100644
--- a/contrib/libs/highwayhash/arch/sse41/ya.make
+++ b/contrib/libs/highwayhash/arch/sse41/ya.make
@@ -1,21 +1,21 @@
-LIBRARY()
-
+LIBRARY()
+
WITHOUT_LICENSE_TEXTS()
LICENSE(Apache-2.0)
OWNER(somov)
-
-ADDINCL(contrib/libs/highwayhash)
-
-SRCDIR(contrib/libs/highwayhash/highwayhash)
-
-CFLAGS(-msse4.1)
-
-NO_COMPILER_WARNINGS()
-
-SRCS(
- hh_sse41.cc
-)
-
-END()
+
+ADDINCL(contrib/libs/highwayhash)
+
+SRCDIR(contrib/libs/highwayhash/highwayhash)
+
+CFLAGS(-msse4.1)
+
+NO_COMPILER_WARNINGS()
+
+SRCS(
+ hh_sse41.cc
+)
+
+END()
diff --git a/contrib/libs/highwayhash/highwayhash/arch_specific.cc b/contrib/libs/highwayhash/highwayhash/arch_specific.cc
index 1ab839f58b..b8048e46ee 100644
--- a/contrib/libs/highwayhash/highwayhash/arch_specific.cc
+++ b/contrib/libs/highwayhash/highwayhash/arch_specific.cc
@@ -1,118 +1,118 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "highwayhash/arch_specific.h"
-
-#include <stdint.h>
-
-#if HH_ARCH_X64 && !HH_MSC_VERSION
-# include <cpuid.h>
-#endif
-
-#include <string.h> // memcpy
-#include <string>
-
-namespace highwayhash {
-
-const char* TargetName(const TargetBits target_bit) {
- switch (target_bit) {
- case HH_TARGET_Portable:
- return "Portable";
- case HH_TARGET_SSE41:
- return "SSE41";
- case HH_TARGET_AVX2:
- return "AVX2";
- default:
- return nullptr; // zero, multiple, or unknown bits
- }
-}
-
-#if HH_ARCH_X64
-
-void Cpuid(const uint32_t level, const uint32_t count,
- uint32_t* HH_RESTRICT abcd) {
-#if HH_MSC_VERSION
- int regs[4];
- __cpuidex(regs, level, count);
- for (int i = 0; i < 4; ++i) {
- abcd[i] = regs[i];
- }
-#else
- uint32_t a, b, c, d;
- __cpuid_count(level, count, a, b, c, d);
- abcd[0] = a;
- abcd[1] = b;
- abcd[2] = c;
- abcd[3] = d;
-#endif
-}
-
-uint32_t ApicId() {
- uint32_t abcd[4];
- Cpuid(1, 0, abcd);
- return abcd[1] >> 24; // ebx
-}
-
-namespace {
-
-std::string BrandString() {
- char brand_string[49];
- uint32_t abcd[4];
-
- // Check if brand string is supported (it is on all reasonable Intel/AMD)
- Cpuid(0x80000000U, 0, abcd);
- if (abcd[0] < 0x80000004U) {
- return std::string();
- }
-
- for (int i = 0; i < 3; ++i) {
- Cpuid(0x80000002U + i, 0, abcd);
- memcpy(brand_string + i * 16, &abcd, sizeof(abcd));
- }
- brand_string[48] = 0;
- return brand_string;
-}
-
-double DetectInvariantCyclesPerSecond() {
- const std::string& brand_string = BrandString();
- // Brand strings include the maximum configured frequency. These prefixes are
- // defined by Intel CPUID documentation.
- const char* prefixes[3] = {"MHz", "GHz", "THz"};
- const double multipliers[3] = {1E6, 1E9, 1E12};
- for (size_t i = 0; i < 3; ++i) {
- const size_t pos_prefix = brand_string.find(prefixes[i]);
- if (pos_prefix != std::string::npos) {
- const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
- if (pos_space != std::string::npos) {
- const std::string digits =
- brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
- return std::stod(digits) * multipliers[i];
- }
- }
- }
-
- return 0.0;
-}
-
-} // namespace
-
-double InvariantCyclesPerSecond() {
- // Thread-safe caching - this is called several times.
- static const double cycles_per_second = DetectInvariantCyclesPerSecond();
- return cycles_per_second;
-}
-
-#endif // HH_ARCH_X64
-
-} // namespace highwayhash
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "highwayhash/arch_specific.h"
+
+#include <stdint.h>
+
+#if HH_ARCH_X64 && !HH_MSC_VERSION
+# include <cpuid.h>
+#endif
+
+#include <string.h> // memcpy
+#include <string>
+
+namespace highwayhash {
+
+const char* TargetName(const TargetBits target_bit) {
+ switch (target_bit) {
+ case HH_TARGET_Portable:
+ return "Portable";
+ case HH_TARGET_SSE41:
+ return "SSE41";
+ case HH_TARGET_AVX2:
+ return "AVX2";
+ default:
+ return nullptr; // zero, multiple, or unknown bits
+ }
+}
+
+#if HH_ARCH_X64
+
+void Cpuid(const uint32_t level, const uint32_t count,
+ uint32_t* HH_RESTRICT abcd) {
+#if HH_MSC_VERSION
+ int regs[4];
+ __cpuidex(regs, level, count);
+ for (int i = 0; i < 4; ++i) {
+ abcd[i] = regs[i];
+ }
+#else
+ uint32_t a, b, c, d;
+ __cpuid_count(level, count, a, b, c, d);
+ abcd[0] = a;
+ abcd[1] = b;
+ abcd[2] = c;
+ abcd[3] = d;
+#endif
+}
+
+uint32_t ApicId() {
+ uint32_t abcd[4];
+ Cpuid(1, 0, abcd);
+ return abcd[1] >> 24; // ebx
+}
+
+namespace {
+
+std::string BrandString() {
+ char brand_string[49];
+ uint32_t abcd[4];
+
+ // Check if brand string is supported (it is on all reasonable Intel/AMD)
+ Cpuid(0x80000000U, 0, abcd);
+ if (abcd[0] < 0x80000004U) {
+ return std::string();
+ }
+
+ for (int i = 0; i < 3; ++i) {
+ Cpuid(0x80000002U + i, 0, abcd);
+ memcpy(brand_string + i * 16, &abcd, sizeof(abcd));
+ }
+ brand_string[48] = 0;
+ return brand_string;
+}
+
+double DetectInvariantCyclesPerSecond() {
+ const std::string& brand_string = BrandString();
+ // Brand strings include the maximum configured frequency. These prefixes are
+ // defined by Intel CPUID documentation.
+ const char* prefixes[3] = {"MHz", "GHz", "THz"};
+ const double multipliers[3] = {1E6, 1E9, 1E12};
+ for (size_t i = 0; i < 3; ++i) {
+ const size_t pos_prefix = brand_string.find(prefixes[i]);
+ if (pos_prefix != std::string::npos) {
+ const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
+ if (pos_space != std::string::npos) {
+ const std::string digits =
+ brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
+ return std::stod(digits) * multipliers[i];
+ }
+ }
+ }
+
+ return 0.0;
+}
+
+} // namespace
+
+double InvariantCyclesPerSecond() {
+ // Thread-safe caching - this is called several times.
+ static const double cycles_per_second = DetectInvariantCyclesPerSecond();
+ return cycles_per_second;
+}
+
+#endif // HH_ARCH_X64
+
+} // namespace highwayhash
diff --git a/contrib/libs/highwayhash/highwayhash/arch_specific.h b/contrib/libs/highwayhash/highwayhash/arch_specific.h
index 7419d8ebbc..9fce08bd85 100644
--- a/contrib/libs/highwayhash/highwayhash/arch_specific.h
+++ b/contrib/libs/highwayhash/highwayhash/arch_specific.h
@@ -1,153 +1,153 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_ARCH_SPECIFIC_H_
-#define HIGHWAYHASH_ARCH_SPECIFIC_H_
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME.
-//
-// Background: older GCC/Clang require flags such as -mavx2 before AVX2 SIMD
-// intrinsics can be used. These intrinsics are only used within blocks that
-// first verify CPU capabilities. However, the flag also allows the compiler to
-// generate AVX2 code in other places. This can violate the One Definition Rule,
-// which requires multiple instances of a function with external linkage
-// (e.g. extern inline in a header) to be "equivalent". To prevent the resulting
-// crashes on non-AVX2 CPUs, any header (transitively) included from a
-// translation unit compiled with different flags is "restricted". This means
-// all function definitions must have internal linkage (e.g. static inline), or
-// reside in namespace HH_TARGET_NAME, which expands to a name unique to the
-// current compiler flags.
-//
-// Most C system headers are safe to include, but C++ headers should generally
-// be avoided because they often do not specify static linkage and cannot
-// reliably be wrapped in a namespace.
-
-#include "highwayhash/compiler_specific.h"
-
-#include <stdint.h>
-
-#if HH_MSC_VERSION
-#include <intrin.h> // _byteswap_*
-#endif
-
-namespace highwayhash {
-
-#if defined(__x86_64__) || defined(_M_X64)
-#define HH_ARCH_X64 1
-#else
-#define HH_ARCH_X64 0
-#endif
-
-#ifdef __aarch64__
-#define HH_ARCH_AARCH64 1
-#else
-#define HH_ARCH_AARCH64 0
-#endif
-
-#if defined(__powerpc64__) || defined(_M_PPC)
-#define HH_ARCH_PPC 1
-#else
-#define HH_ARCH_PPC 0
-#endif
-
-// Target := instruction set extension(s) such as SSE41. A translation unit can
-// only provide a single target-specific implementation because they require
-// different compiler flags.
-
-// Either the build system specifies the target by defining HH_TARGET_NAME
-// (which is necessary for Portable on X64, and SSE41 on MSVC), or we'll choose
-// the most efficient one that can be compiled given the current flags:
-#ifndef HH_TARGET_NAME
-
-// To avoid excessive code size and dispatch overhead, we only support a few
-// groups of extensions, e.g. FMA+BMI2+AVX+AVX2 =: "AVX2". These names must
-// match the HH_TARGET_* suffixes below.
-#ifdef __AVX2__
-#define HH_TARGET_NAME AVX2
-#elif defined(__SSE4_1__)
-#define HH_TARGET_NAME SSE41
-#else
-#define HH_TARGET_NAME Portable
-#endif
-
-#endif // HH_TARGET_NAME
-
-#define HH_CONCAT(first, second) first##second
-// Required due to macro expansion rules.
-#define HH_EXPAND_CONCAT(first, second) HH_CONCAT(first, second)
-// Appends HH_TARGET_NAME to "identifier_prefix".
-#define HH_ADD_TARGET_SUFFIX(identifier_prefix) \
- HH_EXPAND_CONCAT(identifier_prefix, HH_TARGET_NAME)
-
-// HH_TARGET expands to an integer constant. Typical usage: HHStateT<HH_TARGET>.
-// This ensures your code will work correctly when compiler flags are changed,
-// and benefit from subsequently added targets/specializations.
-#define HH_TARGET HH_ADD_TARGET_SUFFIX(HH_TARGET_)
-
-// Deprecated former name of HH_TARGET; please use HH_TARGET instead.
-#define HH_TARGET_PREFERRED HH_TARGET
-
-// Associate targets with integer literals so the preprocessor can compare them
-// with HH_TARGET. Do not instantiate templates with these values - use
-// HH_TARGET instead. Must be unique powers of two, see TargetBits. Always
-// defined even if unavailable on this HH_ARCH to allow calling TargetName.
-// The suffixes must match the HH_TARGET_NAME identifiers.
-#define HH_TARGET_Portable 1
-#define HH_TARGET_SSE41 2
-#define HH_TARGET_AVX2 4
-
-// Bit array for one or more HH_TARGET_*. Used to indicate which target(s) are
-// supported or were called by InstructionSets::RunAll.
-using TargetBits = unsigned;
-
-namespace HH_TARGET_NAME {
-
-// Calls func(bit_value) for every nonzero bit in "bits".
-template <class Func>
-void ForeachTarget(TargetBits bits, const Func& func) {
- while (bits != 0) {
- const TargetBits lowest = bits & (~bits + 1);
- func(lowest);
- bits &= ~lowest;
- }
-}
-
-} // namespace HH_TARGET_NAME
-
-// Returns a brief human-readable string literal identifying one of the above
-// bits, or nullptr if zero, multiple, or unknown bits are set.
-const char* TargetName(const TargetBits target_bit);
-
-#if HH_ARCH_X64
-
-// Calls CPUID instruction with eax=level and ecx=count and returns the result
-// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
-void Cpuid(const uint32_t level, const uint32_t count,
- uint32_t* HH_RESTRICT abcd);
-
-// Returns the APIC ID of the CPU on which we're currently running.
-uint32_t ApicId();
-
-// Returns nominal CPU clock frequency for converting tsc_timer cycles to
-// seconds. This is unaffected by CPU throttling ("invariant"). Thread-safe.
-double InvariantCyclesPerSecond();
-
-#endif // HH_ARCH_X64
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_ARCH_SPECIFIC_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_ARCH_SPECIFIC_H_
+#define HIGHWAYHASH_ARCH_SPECIFIC_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME.
+//
+// Background: older GCC/Clang require flags such as -mavx2 before AVX2 SIMD
+// intrinsics can be used. These intrinsics are only used within blocks that
+// first verify CPU capabilities. However, the flag also allows the compiler to
+// generate AVX2 code in other places. This can violate the One Definition Rule,
+// which requires multiple instances of a function with external linkage
+// (e.g. extern inline in a header) to be "equivalent". To prevent the resulting
+// crashes on non-AVX2 CPUs, any header (transitively) included from a
+// translation unit compiled with different flags is "restricted". This means
+// all function definitions must have internal linkage (e.g. static inline), or
+// reside in namespace HH_TARGET_NAME, which expands to a name unique to the
+// current compiler flags.
+//
+// Most C system headers are safe to include, but C++ headers should generally
+// be avoided because they often do not specify static linkage and cannot
+// reliably be wrapped in a namespace.
+
+#include "highwayhash/compiler_specific.h"
+
+#include <stdint.h>
+
+#if HH_MSC_VERSION
+#include <intrin.h> // _byteswap_*
+#endif
+
+namespace highwayhash {
+
+#if defined(__x86_64__) || defined(_M_X64)
+#define HH_ARCH_X64 1
+#else
+#define HH_ARCH_X64 0
+#endif
+
+#ifdef __aarch64__
+#define HH_ARCH_AARCH64 1
+#else
+#define HH_ARCH_AARCH64 0
+#endif
+
+#if defined(__powerpc64__) || defined(_M_PPC)
+#define HH_ARCH_PPC 1
+#else
+#define HH_ARCH_PPC 0
+#endif
+
+// Target := instruction set extension(s) such as SSE41. A translation unit can
+// only provide a single target-specific implementation because they require
+// different compiler flags.
+
+// Either the build system specifies the target by defining HH_TARGET_NAME
+// (which is necessary for Portable on X64, and SSE41 on MSVC), or we'll choose
+// the most efficient one that can be compiled given the current flags:
+#ifndef HH_TARGET_NAME
+
+// To avoid excessive code size and dispatch overhead, we only support a few
+// groups of extensions, e.g. FMA+BMI2+AVX+AVX2 =: "AVX2". These names must
+// match the HH_TARGET_* suffixes below.
+#ifdef __AVX2__
+#define HH_TARGET_NAME AVX2
+#elif defined(__SSE4_1__)
+#define HH_TARGET_NAME SSE41
+#else
+#define HH_TARGET_NAME Portable
+#endif
+
+#endif // HH_TARGET_NAME
+
+#define HH_CONCAT(first, second) first##second
+// Required due to macro expansion rules.
+#define HH_EXPAND_CONCAT(first, second) HH_CONCAT(first, second)
+// Appends HH_TARGET_NAME to "identifier_prefix".
+#define HH_ADD_TARGET_SUFFIX(identifier_prefix) \
+ HH_EXPAND_CONCAT(identifier_prefix, HH_TARGET_NAME)
+
+// HH_TARGET expands to an integer constant. Typical usage: HHStateT<HH_TARGET>.
+// This ensures your code will work correctly when compiler flags are changed,
+// and benefit from subsequently added targets/specializations.
+#define HH_TARGET HH_ADD_TARGET_SUFFIX(HH_TARGET_)
+
+// Deprecated former name of HH_TARGET; please use HH_TARGET instead.
+#define HH_TARGET_PREFERRED HH_TARGET
+
+// Associate targets with integer literals so the preprocessor can compare them
+// with HH_TARGET. Do not instantiate templates with these values - use
+// HH_TARGET instead. Must be unique powers of two, see TargetBits. Always
+// defined even if unavailable on this HH_ARCH to allow calling TargetName.
+// The suffixes must match the HH_TARGET_NAME identifiers.
+#define HH_TARGET_Portable 1
+#define HH_TARGET_SSE41 2
+#define HH_TARGET_AVX2 4
+
+// Bit array for one or more HH_TARGET_*. Used to indicate which target(s) are
+// supported or were called by InstructionSets::RunAll.
+using TargetBits = unsigned;
+
+namespace HH_TARGET_NAME {
+
+// Calls func(bit_value) for every nonzero bit in "bits".
+template <class Func>
+void ForeachTarget(TargetBits bits, const Func& func) {
+ while (bits != 0) {
+ const TargetBits lowest = bits & (~bits + 1);
+ func(lowest);
+ bits &= ~lowest;
+ }
+}
+
+} // namespace HH_TARGET_NAME
+
+// Returns a brief human-readable string literal identifying one of the above
+// bits, or nullptr if zero, multiple, or unknown bits are set.
+const char* TargetName(const TargetBits target_bit);
+
+#if HH_ARCH_X64
+
+// Calls CPUID instruction with eax=level and ecx=count and returns the result
+// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
+void Cpuid(const uint32_t level, const uint32_t count,
+ uint32_t* HH_RESTRICT abcd);
+
+// Returns the APIC ID of the CPU on which we're currently running.
+uint32_t ApicId();
+
+// Returns nominal CPU clock frequency for converting tsc_timer cycles to
+// seconds. This is unaffected by CPU throttling ("invariant"). Thread-safe.
+double InvariantCyclesPerSecond();
+
+#endif // HH_ARCH_X64
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_ARCH_SPECIFIC_H_
diff --git a/contrib/libs/highwayhash/highwayhash/benchmark.cc b/contrib/libs/highwayhash/highwayhash/benchmark.cc
index 0422690872..7279b295b9 100644
--- a/contrib/libs/highwayhash/highwayhash/benchmark.cc
+++ b/contrib/libs/highwayhash/highwayhash/benchmark.cc
@@ -1,313 +1,313 @@
-// Copyright 2016 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Measures hash function throughput for various input sizes.
-
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdio>
-#include <cstdlib>
-#include <map>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "highwayhash/compiler_specific.h"
-#include "highwayhash/instruction_sets.h"
-#include "highwayhash/nanobenchmark.h"
-#include "highwayhash/os_specific.h"
-#include "highwayhash/robust_statistics.h"
-
-// Which functions to enable (includes check for compiler support)
-#define BENCHMARK_SIP 0
-#define BENCHMARK_SIP_TREE 0
-#define BENCHMARK_HIGHWAY 1
-#define BENCHMARK_HIGHWAY_CAT 1
-#define BENCHMARK_FARM 0
-
-#include "highwayhash/highwayhash_test_target.h"
-#if BENCHMARK_SIP
-#include "highwayhash/sip_hash.h"
-#endif
-#if BENCHMARK_SIP_TREE
-#include "highwayhash/scalar_sip_tree_hash.h"
-#include "highwayhash/sip_tree_hash.h"
-#endif
-#if BENCHMARK_FARM
-#include "third_party/farmhash/src/farmhash.h"
-#endif
-
-namespace highwayhash {
-namespace {
-
-// Stores time measurements from benchmarks, with support for printing them
-// as LaTeX figures or tables.
-class Measurements {
- public:
- void Add(const char* caption, const size_t bytes, const double cycles) {
- const float cpb = static_cast<float>(cycles / bytes);
- results_.emplace_back(caption, static_cast<int>(bytes), cpb);
- }
-
- // Prints results as a LaTeX table (only for in_sizes matching the
- // desired values).
- void PrintTable(const std::vector<size_t>& in_sizes) {
- std::vector<size_t> unique = in_sizes;
- std::sort(unique.begin(), unique.end());
- unique.erase(std::unique(unique.begin(), unique.end()), unique.end());
-
- printf("\\begin{tabular}{");
- for (size_t i = 0; i < unique.size() + 1; ++i) {
- printf("%s", i == 0 ? "r" : "|r");
- }
- printf("}\n\\toprule\nAlgorithm");
- for (const size_t in_size : unique) {
- printf(" & %zu", in_size);
- }
- printf("\\\\\n\\midrule\n");
-
- const SpeedsForCaption cpb_for_caption = SortByCaptionFilterBySize(unique);
- for (const auto& item : cpb_for_caption) {
- printf("%22s", item.first.c_str());
- for (const float cpb : item.second) {
- printf(" & %5.2f", cpb);
- }
- printf("\\\\\n");
- }
- }
-
- // Prints results suitable for pgfplots.
- void PrintPlots() {
- const SpeedsForCaption cpb_for_caption = SortByCaption();
- assert(!cpb_for_caption.empty());
- const size_t num_sizes = cpb_for_caption.begin()->second.size();
-
- printf("Size ");
- // Flatten per-caption vectors into one iterator.
- std::vector<std::vector<float>::const_iterator> iterators;
- for (const auto& item : cpb_for_caption) {
- printf("%21s ", item.first.c_str());
- assert(item.second.size() == num_sizes);
- iterators.push_back(item.second.begin());
- }
- printf("\n");
-
- const std::vector<int>& sizes = UniqueSizes();
- assert(num_sizes == sizes.size());
- for (int i = 0; i < static_cast<int>(num_sizes); ++i) {
- printf("%d ", sizes[i]);
- for (auto& it : iterators) {
- printf("%5.2f ", 1.0f / *it); // bytes per cycle
- ++it;
- }
- printf("\n");
- }
- }
-
- private:
- struct Result {
- Result(const char* caption, const int in_size, const float cpb)
- : caption(caption), in_size(in_size), cpb(cpb) {}
-
- // Algorithm name.
- std::string caption;
- // Size of the input data [bytes].
- int in_size;
- // Measured throughput [cycles per byte].
- float cpb;
- };
-
- // Returns set of all input sizes for the first column of a size/speed plot.
- std::vector<int> UniqueSizes() {
- std::vector<int> sizes;
- sizes.reserve(results_.size());
- for (const Result& result : results_) {
- sizes.push_back(result.in_size);
- }
- std::sort(sizes.begin(), sizes.end());
- sizes.erase(std::unique(sizes.begin(), sizes.end()), sizes.end());
- return sizes;
- }
-
- using SpeedsForCaption = std::map<std::string, std::vector<float>>;
-
- SpeedsForCaption SortByCaption() const {
- SpeedsForCaption cpb_for_caption;
- for (const Result& result : results_) {
- cpb_for_caption[result.caption].push_back(result.cpb);
- }
- return cpb_for_caption;
- }
-
- // Only includes measurement results matching one of the given sizes.
- SpeedsForCaption SortByCaptionFilterBySize(
- const std::vector<size_t>& in_sizes) const {
- SpeedsForCaption cpb_for_caption;
- for (const Result& result : results_) {
- for (const size_t in_size : in_sizes) {
- if (result.in_size == static_cast<int>(in_size)) {
- cpb_for_caption[result.caption].push_back(result.cpb);
- }
- }
- }
- return cpb_for_caption;
- }
-
- std::vector<Result> results_;
-};
-
-void AddMeasurements(DurationsForInputs* input_map, const char* caption,
- Measurements* measurements) {
- for (size_t i = 0; i < input_map->num_items; ++i) {
- const DurationsForInputs::Item& item = input_map->items[i];
- std::vector<float> durations(item.durations,
- item.durations + item.num_durations);
- const float median = Median(&durations);
- const float variability = MedianAbsoluteDeviation(durations, median);
- printf("%s %4zu: median=%6.1f cycles; median L1 norm =%4.1f cycles\n",
- caption, item.input, median, variability);
- measurements->Add(caption, item.input, median);
- }
- input_map->num_items = 0;
-}
-
-#if BENCHMARK_SIP || BENCHMARK_FARM || (BENCHMARK_SIP_TREE && defined(__AVX2__))
-
-void MeasureAndAdd(DurationsForInputs* input_map, const char* caption,
- const Func func, Measurements* measurements) {
- MeasureDurations(func, input_map);
- AddMeasurements(input_map, caption, measurements);
-}
-
-#endif
-
-// InstructionSets::RunAll callback.
-void AddMeasurementsWithPrefix(const char* prefix, const char* target_name,
- DurationsForInputs* input_map, void* context) {
- std::string caption(prefix);
- caption += target_name;
- AddMeasurements(input_map, caption.c_str(),
- static_cast<Measurements*>(context));
-}
-
-#if BENCHMARK_SIP
-
-uint64_t RunSip(const size_t size) {
- const HH_U64 key2[2] HH_ALIGNAS(16) = {0, 1};
- char in[kMaxBenchmarkInputSize];
- memcpy(in, &size, sizeof(size));
- return SipHash(key2, in, size);
-}
-
-uint64_t RunSip13(const size_t size) {
- const HH_U64 key2[2] HH_ALIGNAS(16) = {0, 1};
- char in[kMaxBenchmarkInputSize];
- memcpy(in, &size, sizeof(size));
- return SipHash13(key2, in, size);
-}
-
-#endif
-
-#if BENCHMARK_SIP_TREE
-
-uint64_t RunSipTree(const size_t size) {
- const HH_U64 key4[4] HH_ALIGNAS(32) = {0, 1, 2, 3};
- char in[kMaxBenchmarkInputSize];
- memcpy(in, &size, sizeof(size));
- return SipTreeHash(key4, in, size);
-}
-
-uint64_t RunSipTree13(const size_t size) {
- const HH_U64 key4[4] HH_ALIGNAS(32) = {0, 1, 2, 3};
- char in[kMaxBenchmarkInputSize];
- memcpy(in, &size, sizeof(size));
- return SipTreeHash13(key4, in, size);
-}
-
-#endif
-
-#if BENCHMARK_FARM
-
-uint64_t RunFarm(const size_t size) {
- char in[kMaxBenchmarkInputSize];
- memcpy(in, &size, sizeof(size));
- return farmhash::Fingerprint64(reinterpret_cast<const char*>(in), size);
-}
-
-#endif
-
-void AddMeasurements(const std::vector<size_t>& in_sizes,
- Measurements* measurements) {
- DurationsForInputs input_map(in_sizes.data(), in_sizes.size(), 40);
-#if BENCHMARK_SIP
- MeasureAndAdd(&input_map, "SipHash", RunSip, measurements);
- MeasureAndAdd(&input_map, "SipHash13", RunSip13, measurements);
-#endif
-
-#if BENCHMARK_SIP_TREE && defined(__AVX2__)
- MeasureAndAdd(&input_map, "SipTreeHash", RunSipTree, measurements);
- MeasureAndAdd(&input_map, "SipTreeHash13", RunSipTree13, measurements);
-#endif
-
-#if BENCHMARK_FARM
- MeasureAndAdd(&input_map, "Farm", &RunFarm, measurements);
-#endif
-
-#if BENCHMARK_HIGHWAY
- InstructionSets::RunAll<HighwayHashBenchmark>(
- &input_map, &AddMeasurementsWithPrefix, measurements);
-#endif
-
-#if BENCHMARK_HIGHWAY_CAT
- InstructionSets::RunAll<HighwayHashCatBenchmark>(
- &input_map, &AddMeasurementsWithPrefix, measurements);
-#endif
-}
-
-void PrintTable() {
- const std::vector<size_t> in_sizes = {
- 7, 8, 31, 32, 63, 64, kMaxBenchmarkInputSize};
- Measurements measurements;
- AddMeasurements(in_sizes, &measurements);
- measurements.PrintTable(in_sizes);
-}
-
-void PrintPlots() {
- std::vector<size_t> in_sizes;
- for (int num_vectors = 0; num_vectors < 12; ++num_vectors) {
- for (int remainder : {0, 9, 18, 27}) {
- in_sizes.push_back(num_vectors * 32 + remainder);
- assert(in_sizes.back() <= kMaxBenchmarkInputSize);
- }
- }
-
- Measurements measurements;
- AddMeasurements(in_sizes, &measurements);
- measurements.PrintPlots();
-}
-
-} // namespace
-} // namespace highwayhash
-
-int main(int argc, char* argv[]) {
- highwayhash::PinThreadToRandomCPU();
- // No argument or t => table
- if (argc < 2 || argv[1][0] == 't') {
- highwayhash::PrintTable();
- } else if (argv[1][0] == 'p') {
- highwayhash::PrintPlots();
- }
- return 0;
-}
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Measures hash function throughput for various input sizes.
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/instruction_sets.h"
+#include "highwayhash/nanobenchmark.h"
+#include "highwayhash/os_specific.h"
+#include "highwayhash/robust_statistics.h"
+
+// Which functions to enable (includes check for compiler support)
+#define BENCHMARK_SIP 0
+#define BENCHMARK_SIP_TREE 0
+#define BENCHMARK_HIGHWAY 1
+#define BENCHMARK_HIGHWAY_CAT 1
+#define BENCHMARK_FARM 0
+
+#include "highwayhash/highwayhash_test_target.h"
+#if BENCHMARK_SIP
+#include "highwayhash/sip_hash.h"
+#endif
+#if BENCHMARK_SIP_TREE
+#include "highwayhash/scalar_sip_tree_hash.h"
+#include "highwayhash/sip_tree_hash.h"
+#endif
+#if BENCHMARK_FARM
+#include "third_party/farmhash/src/farmhash.h"
+#endif
+
+namespace highwayhash {
+namespace {
+
+// Stores time measurements from benchmarks, with support for printing them
+// as LaTeX figures or tables.
+class Measurements {
+ public:
+ void Add(const char* caption, const size_t bytes, const double cycles) {
+ const float cpb = static_cast<float>(cycles / bytes);
+ results_.emplace_back(caption, static_cast<int>(bytes), cpb);
+ }
+
+ // Prints results as a LaTeX table (only for in_sizes matching the
+ // desired values).
+ void PrintTable(const std::vector<size_t>& in_sizes) {
+ std::vector<size_t> unique = in_sizes;
+ std::sort(unique.begin(), unique.end());
+ unique.erase(std::unique(unique.begin(), unique.end()), unique.end());
+
+ printf("\\begin{tabular}{");
+ for (size_t i = 0; i < unique.size() + 1; ++i) {
+ printf("%s", i == 0 ? "r" : "|r");
+ }
+ printf("}\n\\toprule\nAlgorithm");
+ for (const size_t in_size : unique) {
+ printf(" & %zu", in_size);
+ }
+ printf("\\\\\n\\midrule\n");
+
+ const SpeedsForCaption cpb_for_caption = SortByCaptionFilterBySize(unique);
+ for (const auto& item : cpb_for_caption) {
+ printf("%22s", item.first.c_str());
+ for (const float cpb : item.second) {
+ printf(" & %5.2f", cpb);
+ }
+ printf("\\\\\n");
+ }
+ }
+
+ // Prints results suitable for pgfplots.
+ void PrintPlots() {
+ const SpeedsForCaption cpb_for_caption = SortByCaption();
+ assert(!cpb_for_caption.empty());
+ const size_t num_sizes = cpb_for_caption.begin()->second.size();
+
+ printf("Size ");
+ // Flatten per-caption vectors into one iterator.
+ std::vector<std::vector<float>::const_iterator> iterators;
+ for (const auto& item : cpb_for_caption) {
+ printf("%21s ", item.first.c_str());
+ assert(item.second.size() == num_sizes);
+ iterators.push_back(item.second.begin());
+ }
+ printf("\n");
+
+ const std::vector<int>& sizes = UniqueSizes();
+ assert(num_sizes == sizes.size());
+ for (int i = 0; i < static_cast<int>(num_sizes); ++i) {
+ printf("%d ", sizes[i]);
+ for (auto& it : iterators) {
+ printf("%5.2f ", 1.0f / *it); // bytes per cycle
+ ++it;
+ }
+ printf("\n");
+ }
+ }
+
+ private:
+ struct Result {
+ Result(const char* caption, const int in_size, const float cpb)
+ : caption(caption), in_size(in_size), cpb(cpb) {}
+
+ // Algorithm name.
+ std::string caption;
+ // Size of the input data [bytes].
+ int in_size;
+ // Measured throughput [cycles per byte].
+ float cpb;
+ };
+
+ // Returns set of all input sizes for the first column of a size/speed plot.
+ std::vector<int> UniqueSizes() {
+ std::vector<int> sizes;
+ sizes.reserve(results_.size());
+ for (const Result& result : results_) {
+ sizes.push_back(result.in_size);
+ }
+ std::sort(sizes.begin(), sizes.end());
+ sizes.erase(std::unique(sizes.begin(), sizes.end()), sizes.end());
+ return sizes;
+ }
+
+ using SpeedsForCaption = std::map<std::string, std::vector<float>>;
+
+ SpeedsForCaption SortByCaption() const {
+ SpeedsForCaption cpb_for_caption;
+ for (const Result& result : results_) {
+ cpb_for_caption[result.caption].push_back(result.cpb);
+ }
+ return cpb_for_caption;
+ }
+
+ // Only includes measurement results matching one of the given sizes.
+ SpeedsForCaption SortByCaptionFilterBySize(
+ const std::vector<size_t>& in_sizes) const {
+ SpeedsForCaption cpb_for_caption;
+ for (const Result& result : results_) {
+ for (const size_t in_size : in_sizes) {
+ if (result.in_size == static_cast<int>(in_size)) {
+ cpb_for_caption[result.caption].push_back(result.cpb);
+ }
+ }
+ }
+ return cpb_for_caption;
+ }
+
+ std::vector<Result> results_;
+};
+
+void AddMeasurements(DurationsForInputs* input_map, const char* caption,
+ Measurements* measurements) {
+ for (size_t i = 0; i < input_map->num_items; ++i) {
+ const DurationsForInputs::Item& item = input_map->items[i];
+ std::vector<float> durations(item.durations,
+ item.durations + item.num_durations);
+ const float median = Median(&durations);
+ const float variability = MedianAbsoluteDeviation(durations, median);
+ printf("%s %4zu: median=%6.1f cycles; median L1 norm =%4.1f cycles\n",
+ caption, item.input, median, variability);
+ measurements->Add(caption, item.input, median);
+ }
+ input_map->num_items = 0;
+}
+
+#if BENCHMARK_SIP || BENCHMARK_FARM || (BENCHMARK_SIP_TREE && defined(__AVX2__))
+
+void MeasureAndAdd(DurationsForInputs* input_map, const char* caption,
+ const Func func, Measurements* measurements) {
+ MeasureDurations(func, input_map);
+ AddMeasurements(input_map, caption, measurements);
+}
+
+#endif
+
+// InstructionSets::RunAll callback.
+void AddMeasurementsWithPrefix(const char* prefix, const char* target_name,
+ DurationsForInputs* input_map, void* context) {
+ std::string caption(prefix);
+ caption += target_name;
+ AddMeasurements(input_map, caption.c_str(),
+ static_cast<Measurements*>(context));
+}
+
+#if BENCHMARK_SIP
+
+uint64_t RunSip(const size_t size) {
+ const HH_U64 key2[2] HH_ALIGNAS(16) = {0, 1};
+ char in[kMaxBenchmarkInputSize];
+ memcpy(in, &size, sizeof(size));
+ return SipHash(key2, in, size);
+}
+
+uint64_t RunSip13(const size_t size) {
+ const HH_U64 key2[2] HH_ALIGNAS(16) = {0, 1};
+ char in[kMaxBenchmarkInputSize];
+ memcpy(in, &size, sizeof(size));
+ return SipHash13(key2, in, size);
+}
+
+#endif
+
+#if BENCHMARK_SIP_TREE
+
+uint64_t RunSipTree(const size_t size) {
+ const HH_U64 key4[4] HH_ALIGNAS(32) = {0, 1, 2, 3};
+ char in[kMaxBenchmarkInputSize];
+ memcpy(in, &size, sizeof(size));
+ return SipTreeHash(key4, in, size);
+}
+
+uint64_t RunSipTree13(const size_t size) {
+ const HH_U64 key4[4] HH_ALIGNAS(32) = {0, 1, 2, 3};
+ char in[kMaxBenchmarkInputSize];
+ memcpy(in, &size, sizeof(size));
+ return SipTreeHash13(key4, in, size);
+}
+
+#endif
+
+#if BENCHMARK_FARM
+
+uint64_t RunFarm(const size_t size) {
+ char in[kMaxBenchmarkInputSize];
+ memcpy(in, &size, sizeof(size));
+ return farmhash::Fingerprint64(reinterpret_cast<const char*>(in), size);
+}
+
+#endif
+
+void AddMeasurements(const std::vector<size_t>& in_sizes,
+ Measurements* measurements) {
+ DurationsForInputs input_map(in_sizes.data(), in_sizes.size(), 40);
+#if BENCHMARK_SIP
+ MeasureAndAdd(&input_map, "SipHash", RunSip, measurements);
+ MeasureAndAdd(&input_map, "SipHash13", RunSip13, measurements);
+#endif
+
+#if BENCHMARK_SIP_TREE && defined(__AVX2__)
+ MeasureAndAdd(&input_map, "SipTreeHash", RunSipTree, measurements);
+ MeasureAndAdd(&input_map, "SipTreeHash13", RunSipTree13, measurements);
+#endif
+
+#if BENCHMARK_FARM
+ MeasureAndAdd(&input_map, "Farm", &RunFarm, measurements);
+#endif
+
+#if BENCHMARK_HIGHWAY
+ InstructionSets::RunAll<HighwayHashBenchmark>(
+ &input_map, &AddMeasurementsWithPrefix, measurements);
+#endif
+
+#if BENCHMARK_HIGHWAY_CAT
+ InstructionSets::RunAll<HighwayHashCatBenchmark>(
+ &input_map, &AddMeasurementsWithPrefix, measurements);
+#endif
+}
+
+void PrintTable() {
+ const std::vector<size_t> in_sizes = {
+ 7, 8, 31, 32, 63, 64, kMaxBenchmarkInputSize};
+ Measurements measurements;
+ AddMeasurements(in_sizes, &measurements);
+ measurements.PrintTable(in_sizes);
+}
+
+void PrintPlots() {
+ std::vector<size_t> in_sizes;
+ for (int num_vectors = 0; num_vectors < 12; ++num_vectors) {
+ for (int remainder : {0, 9, 18, 27}) {
+ in_sizes.push_back(num_vectors * 32 + remainder);
+ assert(in_sizes.back() <= kMaxBenchmarkInputSize);
+ }
+ }
+
+ Measurements measurements;
+ AddMeasurements(in_sizes, &measurements);
+ measurements.PrintPlots();
+}
+
+} // namespace
+} // namespace highwayhash
+
+int main(int argc, char* argv[]) {
+ highwayhash::PinThreadToRandomCPU();
+ // No argument or t => table
+ if (argc < 2 || argv[1][0] == 't') {
+ highwayhash::PrintTable();
+ } else if (argv[1][0] == 'p') {
+ highwayhash::PrintPlots();
+ }
+ return 0;
+}
diff --git a/contrib/libs/highwayhash/highwayhash/c_bindings.cc b/contrib/libs/highwayhash/highwayhash/c_bindings.cc
index 7e0488fb46..21d5c3652d 100644
--- a/contrib/libs/highwayhash/highwayhash/c_bindings.cc
+++ b/contrib/libs/highwayhash/highwayhash/c_bindings.cc
@@ -1,35 +1,35 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "highwayhash/c_bindings.h"
-
-#include "highwayhash/highwayhash_target.h"
-#include "highwayhash/instruction_sets.h"
-
-using highwayhash::InstructionSets;
-using highwayhash::HighwayHash;
-
-extern "C" {
-
-// Ideally this would reside in highwayhash_target.cc, but that file is
-// compiled multiple times and we must only define this function once.
-uint64_t HighwayHash64(const HHKey key, const char* bytes,
- const uint64_t size) {
- HHResult64 result;
- InstructionSets::Run<HighwayHash>(*reinterpret_cast<const HHKey*>(key), bytes,
- size, &result);
- return result;
-}
-
-} // extern "C"
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "highwayhash/c_bindings.h"
+
+#include "highwayhash/highwayhash_target.h"
+#include "highwayhash/instruction_sets.h"
+
+using highwayhash::InstructionSets;
+using highwayhash::HighwayHash;
+
+extern "C" {
+
+// Ideally this would reside in highwayhash_target.cc, but that file is
+// compiled multiple times and we must only define this function once.
+uint64_t HighwayHash64(const HHKey key, const char* bytes,
+ const uint64_t size) {
+ HHResult64 result;
+ InstructionSets::Run<HighwayHash>(*reinterpret_cast<const HHKey*>(key), bytes,
+ size, &result);
+ return result;
+}
+
+} // extern "C"
diff --git a/contrib/libs/highwayhash/highwayhash/c_bindings.h b/contrib/libs/highwayhash/highwayhash/c_bindings.h
index 7d52de7d75..dd24019041 100644
--- a/contrib/libs/highwayhash/highwayhash/c_bindings.h
+++ b/contrib/libs/highwayhash/highwayhash/c_bindings.h
@@ -1,55 +1,55 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_HIGHWAYHASH_C_BINDINGS_H_
-#define HIGHWAYHASH_HIGHWAYHASH_C_BINDINGS_H_
-
-// C-callable function prototypes, documented in the other header files.
-
-#include <stdint.h>
-
-#include "hh_types.h"
-
-#ifdef __cplusplus
-extern "C" {
-
-// Bring the symbols out of the namespace.
-using highwayhash::HHKey;
-using highwayhash::HHPacket;
-using highwayhash::HHResult64;
-using highwayhash::HHResult128;
-using highwayhash::HHResult256;
-#endif
-
-uint64_t SipHashC(const uint64_t* key, const char* bytes, const uint64_t size);
-uint64_t SipHash13C(const uint64_t* key, const char* bytes,
- const uint64_t size);
-
-// Uses the best implementation of HighwayHash for the current CPU and
-// calculates 64-bit hash of given data.
-uint64_t HighwayHash64(const HHKey key, const char* bytes, const uint64_t size);
-
-// Defined by highwayhash_target.cc, which requires a _Target* suffix.
-uint64_t HighwayHash64_TargetPortable(const HHKey key, const char* bytes,
- const uint64_t size);
-uint64_t HighwayHash64_TargetSSE41(const HHKey key, const char* bytes,
- const uint64_t size);
-uint64_t HighwayHash64_TargetAVX2(const HHKey key, const char* bytes,
- const uint64_t size);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // HIGHWAYHASH_HIGHWAYHASH_C_BINDINGS_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HIGHWAYHASH_C_BINDINGS_H_
+#define HIGHWAYHASH_HIGHWAYHASH_C_BINDINGS_H_
+
+// C-callable function prototypes, documented in the other header files.
+
+#include <stdint.h>
+
+#include "hh_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+
+// Bring the symbols out of the namespace.
+using highwayhash::HHKey;
+using highwayhash::HHPacket;
+using highwayhash::HHResult64;
+using highwayhash::HHResult128;
+using highwayhash::HHResult256;
+#endif
+
+uint64_t SipHashC(const uint64_t* key, const char* bytes, const uint64_t size);
+uint64_t SipHash13C(const uint64_t* key, const char* bytes,
+ const uint64_t size);
+
+// Uses the best implementation of HighwayHash for the current CPU and
+// calculates 64-bit hash of given data.
+uint64_t HighwayHash64(const HHKey key, const char* bytes, const uint64_t size);
+
+// Defined by highwayhash_target.cc, which requires a _Target* suffix.
+uint64_t HighwayHash64_TargetPortable(const HHKey key, const char* bytes,
+ const uint64_t size);
+uint64_t HighwayHash64_TargetSSE41(const HHKey key, const char* bytes,
+ const uint64_t size);
+uint64_t HighwayHash64_TargetAVX2(const HHKey key, const char* bytes,
+ const uint64_t size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // HIGHWAYHASH_HIGHWAYHASH_C_BINDINGS_H_
diff --git a/contrib/libs/highwayhash/highwayhash/compiler_specific.h b/contrib/libs/highwayhash/highwayhash/compiler_specific.h
index 4789f9a610..958cb6849e 100644
--- a/contrib/libs/highwayhash/highwayhash/compiler_specific.h
+++ b/contrib/libs/highwayhash/highwayhash/compiler_specific.h
@@ -1,90 +1,90 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_COMPILER_SPECIFIC_H_
-#define HIGHWAYHASH_COMPILER_SPECIFIC_H_
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-// Compiler
-
-// #if is shorter and safer than #ifdef. *_VERSION are zero if not detected,
-// otherwise 100 * major + minor version. Note that other packages check for
-// #ifdef COMPILER_MSVC, so we cannot use that same name.
-
-#ifdef _MSC_VER
-#define HH_MSC_VERSION _MSC_VER
-#else
-#define HH_MSC_VERSION 0
-#endif
-
-#ifdef __GNUC__
-#define HH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-#else
-#define HH_GCC_VERSION 0
-#endif
-
-#ifdef __clang__
-#define HH_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
-#else
-#define HH_CLANG_VERSION 0
-#endif
-
-//-----------------------------------------------------------------------------
-
-#if HH_GCC_VERSION && HH_GCC_VERSION < 408
-#define HH_ALIGNAS(multiple) __attribute__((aligned(multiple)))
-#else
-#define HH_ALIGNAS(multiple) alignas(multiple) // C++11
-#endif
-
-#if HH_MSC_VERSION
-#define HH_RESTRICT __restrict
-#elif HH_GCC_VERSION
-#define HH_RESTRICT __restrict__
-#else
-#define HH_RESTRICT
-#endif
-
-#if HH_MSC_VERSION
-#define HH_INLINE __forceinline
-#define HH_NOINLINE __declspec(noinline)
-#else
-#define HH_INLINE inline
-#define HH_NOINLINE __attribute__((noinline))
-#endif
-
-#if HH_MSC_VERSION
-// Unsupported, __assume is not the same.
-#define HH_LIKELY(expr) expr
-#define HH_UNLIKELY(expr) expr
-#else
-#define HH_LIKELY(expr) __builtin_expect(!!(expr), 1)
-#define HH_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
-#endif
-
-#if HH_MSC_VERSION
-#include <intrin.h>
-#pragma intrinsic(_ReadWriteBarrier)
-#define HH_COMPILER_FENCE _ReadWriteBarrier()
-#elif HH_GCC_VERSION
-#define HH_COMPILER_FENCE asm volatile("" : : : "memory")
-#else
-#define HH_COMPILER_FENCE
-#endif
-
-#endif // HIGHWAYHASH_COMPILER_SPECIFIC_H_
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_COMPILER_SPECIFIC_H_
+#define HIGHWAYHASH_COMPILER_SPECIFIC_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+// Compiler
+
+// #if is shorter and safer than #ifdef. *_VERSION are zero if not detected,
+// otherwise 100 * major + minor version. Note that other packages check for
+// #ifdef COMPILER_MSVC, so we cannot use that same name.
+
+#ifdef _MSC_VER
+#define HH_MSC_VERSION _MSC_VER
+#else
+#define HH_MSC_VERSION 0
+#endif
+
+#ifdef __GNUC__
+#define HH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#define HH_GCC_VERSION 0
+#endif
+
+#ifdef __clang__
+#define HH_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
+#else
+#define HH_CLANG_VERSION 0
+#endif
+
+//-----------------------------------------------------------------------------
+
+#if HH_GCC_VERSION && HH_GCC_VERSION < 408
+#define HH_ALIGNAS(multiple) __attribute__((aligned(multiple)))
+#else
+#define HH_ALIGNAS(multiple) alignas(multiple) // C++11
+#endif
+
+#if HH_MSC_VERSION
+#define HH_RESTRICT __restrict
+#elif HH_GCC_VERSION
+#define HH_RESTRICT __restrict__
+#else
+#define HH_RESTRICT
+#endif
+
+#if HH_MSC_VERSION
+#define HH_INLINE __forceinline
+#define HH_NOINLINE __declspec(noinline)
+#else
+#define HH_INLINE inline
+#define HH_NOINLINE __attribute__((noinline))
+#endif
+
+#if HH_MSC_VERSION
+// Unsupported, __assume is not the same.
+#define HH_LIKELY(expr) expr
+#define HH_UNLIKELY(expr) expr
+#else
+#define HH_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define HH_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#endif
+
+#if HH_MSC_VERSION
+#include <intrin.h>
+#pragma intrinsic(_ReadWriteBarrier)
+#define HH_COMPILER_FENCE _ReadWriteBarrier()
+#elif HH_GCC_VERSION
+#define HH_COMPILER_FENCE asm volatile("" : : : "memory")
+#else
+#define HH_COMPILER_FENCE
+#endif
+
+#endif // HIGHWAYHASH_COMPILER_SPECIFIC_H_
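
Taken together, these macros give portable spellings for alignment, aliasing,
inlining, branch hints, and compiler fences. A minimal sketch of how calling
code might combine them; the function and buffer names are illustrative, not
part of the library:

    #include <stddef.h>
    #include <stdint.h>
    #include "highwayhash/compiler_specific.h"

    // HH_ALIGNAS works on both pre-4.8 GCC and any C++11 compiler.
    HH_ALIGNAS(32) static const uint8_t kBuffer[32] = {1, 2, 3};

    // Illustrative only: sums "size" bytes from a non-aliased buffer.
    static HH_INLINE uint64_t SumBytes(const uint8_t* HH_RESTRICT from,
                                       const size_t size) {
      uint64_t sum = 0;
      for (size_t i = 0; i < size; ++i) {
        sum += from[i];
      }
      return sum;
    }

    uint64_t Example() {
      const uint64_t sum = SumBytes(kBuffer, sizeof(kBuffer));
      if (HH_UNLIKELY(sum == 0)) {  // branch hint; expands to a no-op on MSVC
        return 1;
      }
      HH_COMPILER_FENCE;  // compiler-only barrier; emits no CPU fence
      return sum;
    }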
diff --git a/contrib/libs/highwayhash/highwayhash/data_parallel.h b/contrib/libs/highwayhash/highwayhash/data_parallel.h
index d72afc953e..72d6a47e24 100644
--- a/contrib/libs/highwayhash/highwayhash/data_parallel.h
+++ b/contrib/libs/highwayhash/highwayhash/data_parallel.h
@@ -1,341 +1,341 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_DATA_PARALLEL_H_
-#define HIGHWAYHASH_DATA_PARALLEL_H_
-
-// Portable C++11 alternative to OpenMP for data-parallel computations:
-// provides low-overhead ThreadPool, plus PerThread with support for reduction.
-
-#include <stdio.h>
-#include <algorithm> // find_if
-#include <atomic>
-#include <condition_variable> //NOLINT
-#include <cstdint>
-#include <cstdlib>
-#include <functional>
-#include <memory>
-#include <mutex> //NOLINT
-#include <thread> //NOLINT
-#include <utility>
-#include <vector>
-
-#define DATA_PARALLEL_CHECK(condition) \
- while (!(condition)) { \
- printf("data_parallel check failed at line %d\n", __LINE__); \
- abort(); \
- }
-
-namespace highwayhash {
-
-// Highly scalable thread pool, especially suitable for data-parallel
-// computations in the fork-join model, where clients need to know when all
-// tasks have completed.
-//
-// Thread pools usually store small numbers of heterogeneous tasks in a queue.
-// When tasks are identical or differ only by an integer input parameter, it is
-// much faster to store just one function of an integer parameter and call it
-// for each value.
-//
-// This thread pool can efficiently load-balance millions of tasks using an
-// atomic counter, thus avoiding per-task syscalls. With 48 hyperthreads and
-// 1M tasks that add to an atomic counter, overall runtime is 10-20x higher
-// when using std::async, and up to 200x for a queue-based ThreadPool.
-//
-// Usage:
-// ThreadPool pool;
-// pool.Run(0, 1000000, [](const int i) { Func1(i); });
-// // When Run returns, all of its tasks have finished.
-//
-// pool.RunTasks({Func2, Func3, Func4});
-// // The destructor waits until all worker threads have exited cleanly.
-class ThreadPool {
- public:
- // Starts the given number of worker threads and blocks until they are ready.
- // "num_threads" defaults to one per hyperthread.
- explicit ThreadPool(
- const int num_threads = std::thread::hardware_concurrency())
- : num_threads_(num_threads) {
- DATA_PARALLEL_CHECK(num_threads_ > 0);
- threads_.reserve(num_threads_);
- for (int i = 0; i < num_threads_; ++i) {
- threads_.emplace_back(ThreadFunc, this);
- }
-
- padding_[0] = 0; // avoid unused member warning.
-
- WorkersReadyBarrier();
- }
-
- ThreadPool(const ThreadPool&) = delete;
-  ThreadPool& operator=(const ThreadPool&) = delete;
-
- // Waits for all threads to exit.
- ~ThreadPool() {
- StartWorkers(kWorkerExit);
-
- for (std::thread& thread : threads_) {
- thread.join();
- }
- }
-
- // Runs func(i) on worker thread(s) for every i in [begin, end).
- // Not thread-safe - no two calls to Run and RunTasks may overlap.
- // Subsequent calls will reuse the same threads.
- //
- // Precondition: 0 <= begin <= end.
- template <class Func>
- void Run(const int begin, const int end, const Func& func) {
- DATA_PARALLEL_CHECK(0 <= begin && begin <= end);
- if (begin == end) {
- return;
- }
- const WorkerCommand worker_command = (WorkerCommand(end) << 32) + begin;
- // Ensure the inputs do not result in a reserved command.
- DATA_PARALLEL_CHECK(worker_command != kWorkerWait);
- DATA_PARALLEL_CHECK(worker_command != kWorkerExit);
-
- // If Func is large (many captures), this will allocate memory, but it is
- // still slower to use a std::ref wrapper.
- task_ = func;
- num_reserved_.store(0);
-
- StartWorkers(worker_command);
- WorkersReadyBarrier();
- }
-
- // Runs each task (closure, typically a lambda function) on worker thread(s).
- // Not thread-safe - no two calls to Run and RunTasks may overlap.
- // Subsequent calls will reuse the same threads.
- //
- // This is a more conventional interface for heterogeneous tasks that may be
- // independent/unrelated.
- void RunTasks(const std::vector<std::function<void(void)>>& tasks) {
- Run(0, static_cast<int>(tasks.size()),
- [&tasks](const int i) { tasks[i](); });
- }
-
- // Statically (and deterministically) splits [begin, end) into ranges and
- // calls "func" for each of them. Useful when "func" involves some overhead
- // (e.g. for PerThread::Get or random seeding) that should be amortized over
- // a range of values. "func" is void(int chunk, uint32_t begin, uint32_t end).
- template <class Func>
- void RunRanges(const uint32_t begin, const uint32_t end, const Func& func) {
- const uint32_t length = end - begin;
-
- // Use constant rather than num_threads_ for machine-independent splitting.
- const uint32_t chunk = std::max(1U, (length + 127) / 128);
- std::vector<std::pair<uint32_t, uint32_t>> ranges; // begin/end
- ranges.reserve(length / chunk + 1);
- for (uint32_t i = 0; i < length; i += chunk) {
- ranges.emplace_back(begin + i, begin + std::min(i + chunk, length));
- }
-
- Run(0, static_cast<int>(ranges.size()), [&ranges, func](const int i) {
- func(i, ranges[i].first, ranges[i].second);
- });
- }
-
- private:
- // After construction and between calls to Run, workers are "ready", i.e.
- // waiting on worker_start_cv_. They are "started" by sending a "command"
- // and notifying all worker_start_cv_ waiters. (That is why all workers
- // must be ready/waiting - otherwise, the notification will not reach all of
- // them and the main thread waits in vain for them to report readiness.)
- using WorkerCommand = uint64_t;
-
- // Special values; all others encode the begin/end parameters.
- static constexpr WorkerCommand kWorkerWait = 0;
- static constexpr WorkerCommand kWorkerExit = ~0ULL;
-
- void WorkersReadyBarrier() {
- std::unique_lock<std::mutex> lock(mutex_);
- workers_ready_cv_.wait(lock,
- [this]() { return workers_ready_ == num_threads_; });
- workers_ready_ = 0;
- }
-
- // Precondition: all workers are ready.
- void StartWorkers(const WorkerCommand worker_command) {
- std::unique_lock<std::mutex> lock(mutex_);
- worker_start_command_ = worker_command;
- // Workers will need this lock, so release it before they wake up.
- lock.unlock();
- worker_start_cv_.notify_all();
- }
-
- // Attempts to reserve and perform some work from the global range of tasks,
- // which is encoded within "command". Returns after all tasks are reserved.
- static void RunRange(ThreadPool* self, const WorkerCommand command) {
- const int begin = command & 0xFFFFFFFF;
- const int end = command >> 32;
- const int num_tasks = end - begin;
-
- // OpenMP introduced several "schedule" strategies:
- // "single" (static assignment of exactly one chunk per thread): slower.
- // "dynamic" (allocates k tasks at a time): competitive for well-chosen k.
- // "guided" (allocates k tasks, decreases k): computing k = remaining/n
- // is faster than halving k each iteration. We prefer this strategy
- // because it avoids user-specified parameters.
-
- for (;;) {
- const int num_reserved = self->num_reserved_.load();
- const int num_remaining = num_tasks - num_reserved;
- const int my_size = std::max(num_remaining / (self->num_threads_ * 2), 1);
- const int my_begin = begin + self->num_reserved_.fetch_add(my_size);
- const int my_end = std::min(my_begin + my_size, begin + num_tasks);
- // Another thread already reserved the last task.
- if (my_begin >= my_end) {
- break;
- }
- for (int i = my_begin; i < my_end; ++i) {
- self->task_(i);
- }
- }
- }
-
- static void ThreadFunc(ThreadPool* self) {
- // Until kWorkerExit command received:
- for (;;) {
- std::unique_lock<std::mutex> lock(self->mutex_);
- // Notify main thread that this thread is ready.
- if (++self->workers_ready_ == self->num_threads_) {
- self->workers_ready_cv_.notify_one();
- }
- RESUME_WAIT:
- // Wait for a command.
- self->worker_start_cv_.wait(lock);
- const WorkerCommand command = self->worker_start_command_;
- switch (command) {
- case kWorkerWait: // spurious wakeup:
- goto RESUME_WAIT; // lock still held, avoid incrementing ready.
- case kWorkerExit:
- return; // exits thread
- }
-
- lock.unlock();
- RunRange(self, command);
- }
- }
-
- const int num_threads_;
-
- // Unmodified after ctor, but cannot be const because we call thread::join().
- std::vector<std::thread> threads_;
-
- std::mutex mutex_; // guards both cv and their variables.
- std::condition_variable workers_ready_cv_;
- int workers_ready_ = 0;
- std::condition_variable worker_start_cv_;
- WorkerCommand worker_start_command_;
-
- // Written by main thread, read by workers (after mutex lock/unlock).
- std::function<void(int)> task_;
-
- // Updated by workers; alignment/padding avoids false sharing.
- alignas(64) std::atomic<int> num_reserved_{0};
- int padding_[15];
-};
-
-// Thread-local storage with support for reduction (combining into one result).
-// The "T" type must be unique to the call site because the list of threads'
-// copies is a static member. (With knowledge of the underlying threads, we
-// could eliminate this list and T allocations, but that is difficult to
-// arrange and we prefer this to be usable independently of ThreadPool.)
-//
-// Usage:
-// for (int i = 0; i < N; ++i) {
-// // in each thread:
-// T& my_copy = PerThread<T>::Get();
-// my_copy.Modify();
-//
-// // single-threaded:
-// T& combined = PerThread<T>::Reduce();
-// Use(combined);
-// PerThread<T>::Destroy();
-// }
-//
-// T is duck-typed and implements the following interface:
-//
-// // Returns true if T is default-initialized or Destroy was called without
-// // any subsequent re-initialization.
-// bool IsNull() const;
-//
-// // Releases any resources. Postcondition: IsNull() == true.
-// void Destroy();
-//
-// // Merges in data from "victim". Precondition: !IsNull() && !victim.IsNull().
-// void Assimilate(const T& victim);
-template <class T>
-class PerThread {
- public:
- // Returns reference to this thread's T instance (dynamically allocated,
- // so its address is unique). Callers are responsible for any initialization
- // beyond the default ctor.
- static T& Get() {
- static thread_local T* t;
- if (t == nullptr) {
- t = new T;
- static std::mutex mutex;
- std::lock_guard<std::mutex> lock(mutex);
- Threads().push_back(t);
- }
- return *t;
- }
-
- // Returns vector of all per-thread T. Used inside Reduce() or by clients
- // that require direct access to T instead of Assimilating them.
- // Function wrapper avoids separate static member variable definition.
- static std::vector<T*>& Threads() {
- static std::vector<T*> threads;
- return threads;
- }
-
- // Returns the first non-null T after assimilating all other threads' T
- // into it. Precondition: at least one non-null T exists (caller must have
- // called Get() and initialized the result).
- static T& Reduce() {
- std::vector<T*>& threads = Threads();
-
- // Find first non-null T
- const auto it = std::find_if(threads.begin(), threads.end(),
- [](const T* t) { return !t->IsNull(); });
- if (it == threads.end()) {
- abort();
- }
- T* const first = *it;
-
- for (const T* t : threads) {
- if (t != first && !t->IsNull()) {
- first->Assimilate(*t);
- }
- }
- return *first;
- }
-
- // Calls each thread's T::Destroy to release resources and/or prepare for
- // reuse by the same threads/ThreadPool. Note that all T remain allocated
- // (we need thread-independent pointers for iterating over each thread's T,
- // and deleting them would leave dangling pointers in each thread, which is
- // unacceptable because the same thread may call Get() again later.)
- static void Destroy() {
- for (T* t : Threads()) {
- t->Destroy();
- }
- }
-};
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_DATA_PARALLEL_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_DATA_PARALLEL_H_
+#define HIGHWAYHASH_DATA_PARALLEL_H_
+
+// Portable C++11 alternative to OpenMP for data-parallel computations:
+// provides low-overhead ThreadPool, plus PerThread with support for reduction.
+
+#include <stdio.h>
+#include <algorithm> // find_if
+#include <atomic>
+#include <condition_variable> //NOLINT
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <memory>
+#include <mutex> //NOLINT
+#include <thread> //NOLINT
+#include <utility>
+#include <vector>
+
+#define DATA_PARALLEL_CHECK(condition) \
+ while (!(condition)) { \
+ printf("data_parallel check failed at line %d\n", __LINE__); \
+ abort(); \
+ }
+
+namespace highwayhash {
+
+// Highly scalable thread pool, especially suitable for data-parallel
+// computations in the fork-join model, where clients need to know when all
+// tasks have completed.
+//
+// Thread pools usually store small numbers of heterogeneous tasks in a queue.
+// When tasks are identical or differ only by an integer input parameter, it is
+// much faster to store just one function of an integer parameter and call it
+// for each value.
+//
+// This thread pool can efficiently load-balance millions of tasks using an
+// atomic counter, thus avoiding per-task syscalls. With 48 hyperthreads and
+// 1M tasks that add to an atomic counter, overall runtime is 10-20x higher
+// when using std::async, and up to 200x for a queue-based ThreadPool.
+//
+// Usage:
+// ThreadPool pool;
+// pool.Run(0, 1000000, [](const int i) { Func1(i); });
+// // When Run returns, all of its tasks have finished.
+//
+// pool.RunTasks({Func2, Func3, Func4});
+// // The destructor waits until all worker threads have exited cleanly.
+class ThreadPool {
+ public:
+ // Starts the given number of worker threads and blocks until they are ready.
+ // "num_threads" defaults to one per hyperthread.
+ explicit ThreadPool(
+ const int num_threads = std::thread::hardware_concurrency())
+ : num_threads_(num_threads) {
+ DATA_PARALLEL_CHECK(num_threads_ > 0);
+ threads_.reserve(num_threads_);
+ for (int i = 0; i < num_threads_; ++i) {
+ threads_.emplace_back(ThreadFunc, this);
+ }
+
+ padding_[0] = 0; // avoid unused member warning.
+
+ WorkersReadyBarrier();
+ }
+
+ ThreadPool(const ThreadPool&) = delete;
+  ThreadPool& operator=(const ThreadPool&) = delete;
+
+ // Waits for all threads to exit.
+ ~ThreadPool() {
+ StartWorkers(kWorkerExit);
+
+ for (std::thread& thread : threads_) {
+ thread.join();
+ }
+ }
+
+ // Runs func(i) on worker thread(s) for every i in [begin, end).
+ // Not thread-safe - no two calls to Run and RunTasks may overlap.
+ // Subsequent calls will reuse the same threads.
+ //
+ // Precondition: 0 <= begin <= end.
+ template <class Func>
+ void Run(const int begin, const int end, const Func& func) {
+ DATA_PARALLEL_CHECK(0 <= begin && begin <= end);
+ if (begin == end) {
+ return;
+ }
+ const WorkerCommand worker_command = (WorkerCommand(end) << 32) + begin;
+ // Ensure the inputs do not result in a reserved command.
+ DATA_PARALLEL_CHECK(worker_command != kWorkerWait);
+ DATA_PARALLEL_CHECK(worker_command != kWorkerExit);
+
+ // If Func is large (many captures), this will allocate memory, but it is
+ // still slower to use a std::ref wrapper.
+ task_ = func;
+ num_reserved_.store(0);
+
+ StartWorkers(worker_command);
+ WorkersReadyBarrier();
+ }
+
+ // Runs each task (closure, typically a lambda function) on worker thread(s).
+ // Not thread-safe - no two calls to Run and RunTasks may overlap.
+ // Subsequent calls will reuse the same threads.
+ //
+ // This is a more conventional interface for heterogeneous tasks that may be
+ // independent/unrelated.
+ void RunTasks(const std::vector<std::function<void(void)>>& tasks) {
+ Run(0, static_cast<int>(tasks.size()),
+ [&tasks](const int i) { tasks[i](); });
+ }
+
+ // Statically (and deterministically) splits [begin, end) into ranges and
+ // calls "func" for each of them. Useful when "func" involves some overhead
+ // (e.g. for PerThread::Get or random seeding) that should be amortized over
+ // a range of values. "func" is void(int chunk, uint32_t begin, uint32_t end).
+ template <class Func>
+ void RunRanges(const uint32_t begin, const uint32_t end, const Func& func) {
+ const uint32_t length = end - begin;
+
+ // Use constant rather than num_threads_ for machine-independent splitting.
+ const uint32_t chunk = std::max(1U, (length + 127) / 128);
+ std::vector<std::pair<uint32_t, uint32_t>> ranges; // begin/end
+ ranges.reserve(length / chunk + 1);
+ for (uint32_t i = 0; i < length; i += chunk) {
+ ranges.emplace_back(begin + i, begin + std::min(i + chunk, length));
+ }
+
+ Run(0, static_cast<int>(ranges.size()), [&ranges, func](const int i) {
+ func(i, ranges[i].first, ranges[i].second);
+ });
+ }
+
+ private:
+ // After construction and between calls to Run, workers are "ready", i.e.
+ // waiting on worker_start_cv_. They are "started" by sending a "command"
+ // and notifying all worker_start_cv_ waiters. (That is why all workers
+ // must be ready/waiting - otherwise, the notification will not reach all of
+ // them and the main thread waits in vain for them to report readiness.)
+ using WorkerCommand = uint64_t;
+
+ // Special values; all others encode the begin/end parameters.
+ static constexpr WorkerCommand kWorkerWait = 0;
+ static constexpr WorkerCommand kWorkerExit = ~0ULL;
+
+ void WorkersReadyBarrier() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ workers_ready_cv_.wait(lock,
+ [this]() { return workers_ready_ == num_threads_; });
+ workers_ready_ = 0;
+ }
+
+ // Precondition: all workers are ready.
+ void StartWorkers(const WorkerCommand worker_command) {
+ std::unique_lock<std::mutex> lock(mutex_);
+ worker_start_command_ = worker_command;
+ // Workers will need this lock, so release it before they wake up.
+ lock.unlock();
+ worker_start_cv_.notify_all();
+ }
+
+ // Attempts to reserve and perform some work from the global range of tasks,
+ // which is encoded within "command". Returns after all tasks are reserved.
+ static void RunRange(ThreadPool* self, const WorkerCommand command) {
+ const int begin = command & 0xFFFFFFFF;
+ const int end = command >> 32;
+ const int num_tasks = end - begin;
+
+ // OpenMP introduced several "schedule" strategies:
+ // "single" (static assignment of exactly one chunk per thread): slower.
+ // "dynamic" (allocates k tasks at a time): competitive for well-chosen k.
+ // "guided" (allocates k tasks, decreases k): computing k = remaining/n
+ // is faster than halving k each iteration. We prefer this strategy
+ // because it avoids user-specified parameters.
+
+ for (;;) {
+ const int num_reserved = self->num_reserved_.load();
+ const int num_remaining = num_tasks - num_reserved;
+ const int my_size = std::max(num_remaining / (self->num_threads_ * 2), 1);
+ const int my_begin = begin + self->num_reserved_.fetch_add(my_size);
+ const int my_end = std::min(my_begin + my_size, begin + num_tasks);
+ // Another thread already reserved the last task.
+ if (my_begin >= my_end) {
+ break;
+ }
+ for (int i = my_begin; i < my_end; ++i) {
+ self->task_(i);
+ }
+ }
+ }
+
+ static void ThreadFunc(ThreadPool* self) {
+ // Until kWorkerExit command received:
+ for (;;) {
+ std::unique_lock<std::mutex> lock(self->mutex_);
+ // Notify main thread that this thread is ready.
+ if (++self->workers_ready_ == self->num_threads_) {
+ self->workers_ready_cv_.notify_one();
+ }
+ RESUME_WAIT:
+ // Wait for a command.
+ self->worker_start_cv_.wait(lock);
+ const WorkerCommand command = self->worker_start_command_;
+ switch (command) {
+ case kWorkerWait: // spurious wakeup:
+ goto RESUME_WAIT; // lock still held, avoid incrementing ready.
+ case kWorkerExit:
+ return; // exits thread
+ }
+
+ lock.unlock();
+ RunRange(self, command);
+ }
+ }
+
+ const int num_threads_;
+
+ // Unmodified after ctor, but cannot be const because we call thread::join().
+ std::vector<std::thread> threads_;
+
+ std::mutex mutex_; // guards both cv and their variables.
+ std::condition_variable workers_ready_cv_;
+ int workers_ready_ = 0;
+ std::condition_variable worker_start_cv_;
+ WorkerCommand worker_start_command_;
+
+ // Written by main thread, read by workers (after mutex lock/unlock).
+ std::function<void(int)> task_;
+
+ // Updated by workers; alignment/padding avoids false sharing.
+ alignas(64) std::atomic<int> num_reserved_{0};
+ int padding_[15];
+};
+
+// Thread-local storage with support for reduction (combining into one result).
+// The "T" type must be unique to the call site because the list of threads'
+// copies is a static member. (With knowledge of the underlying threads, we
+// could eliminate this list and T allocations, but that is difficult to
+// arrange and we prefer this to be usable independently of ThreadPool.)
+//
+// Usage:
+// for (int i = 0; i < N; ++i) {
+// // in each thread:
+// T& my_copy = PerThread<T>::Get();
+// my_copy.Modify();
+//
+// // single-threaded:
+// T& combined = PerThread<T>::Reduce();
+// Use(combined);
+// PerThread<T>::Destroy();
+// }
+//
+// T is duck-typed and implements the following interface:
+//
+// // Returns true if T is default-initialized or Destroy was called without
+// // any subsequent re-initialization.
+// bool IsNull() const;
+//
+// // Releases any resources. Postcondition: IsNull() == true.
+// void Destroy();
+//
+// // Merges in data from "victim". Precondition: !IsNull() && !victim.IsNull().
+// void Assimilate(const T& victim);
+template <class T>
+class PerThread {
+ public:
+ // Returns reference to this thread's T instance (dynamically allocated,
+ // so its address is unique). Callers are responsible for any initialization
+ // beyond the default ctor.
+ static T& Get() {
+ static thread_local T* t;
+ if (t == nullptr) {
+ t = new T;
+ static std::mutex mutex;
+ std::lock_guard<std::mutex> lock(mutex);
+ Threads().push_back(t);
+ }
+ return *t;
+ }
+
+ // Returns vector of all per-thread T. Used inside Reduce() or by clients
+ // that require direct access to T instead of Assimilating them.
+ // Function wrapper avoids separate static member variable definition.
+ static std::vector<T*>& Threads() {
+ static std::vector<T*> threads;
+ return threads;
+ }
+
+ // Returns the first non-null T after assimilating all other threads' T
+ // into it. Precondition: at least one non-null T exists (caller must have
+ // called Get() and initialized the result).
+ static T& Reduce() {
+ std::vector<T*>& threads = Threads();
+
+ // Find first non-null T
+ const auto it = std::find_if(threads.begin(), threads.end(),
+ [](const T* t) { return !t->IsNull(); });
+ if (it == threads.end()) {
+ abort();
+ }
+ T* const first = *it;
+
+ for (const T* t : threads) {
+ if (t != first && !t->IsNull()) {
+ first->Assimilate(*t);
+ }
+ }
+ return *first;
+ }
+
+ // Calls each thread's T::Destroy to release resources and/or prepare for
+ // reuse by the same threads/ThreadPool. Note that all T remain allocated
+ // (we need thread-independent pointers for iterating over each thread's T,
+ // and deleting them would leave dangling pointers in each thread, which is
+ // unacceptable because the same thread may call Get() again later.)
+ static void Destroy() {
+ for (T* t : Threads()) {
+ t->Destroy();
+ }
+ }
+};
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_DATA_PARALLEL_H_
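
The two classes compose naturally: each worker accumulates into its own
PerThread copy during Run, and the main thread reduces afterwards. A minimal
sketch against the duck-typed interface above; the Sum payload and the task
count are illustrative, not part of the library:

    #include <cstdint>
    #include <cstdio>
    #include "highwayhash/data_parallel.h"

    using highwayhash::PerThread;
    using highwayhash::ThreadPool;

    // Payload implementing IsNull/Destroy/Assimilate as documented above.
    struct Sum {
      bool IsNull() const { return count == 0; }
      void Destroy() { total = count = 0; }
      void Assimilate(const Sum& victim) {
        total += victim.total;
        count += victim.count;
      }
      uint64_t total = 0;
      uint64_t count = 0;
    };

    int main() {
      ThreadPool pool;  // one worker per hyperthread by default
      pool.Run(0, 1000, [](const int i) {
        Sum& my_sum = PerThread<Sum>::Get();  // this thread's private copy
        my_sum.total += static_cast<uint64_t>(i);
        my_sum.count += 1;
      });
      // Run has returned, so it is safe to reduce single-threaded.
      const Sum& combined = PerThread<Sum>::Reduce();
      printf("sum=%llu over %llu tasks\n",
             static_cast<unsigned long long>(combined.total),
             static_cast<unsigned long long>(combined.count));
      PerThread<Sum>::Destroy();  // allows reuse by a subsequent Run
      return 0;
    }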
diff --git a/contrib/libs/highwayhash/highwayhash/data_parallel_benchmark.cc b/contrib/libs/highwayhash/highwayhash/data_parallel_benchmark.cc
index ddc88b067f..fafdd93dbd 100644
--- a/contrib/libs/highwayhash/highwayhash/data_parallel_benchmark.cc
+++ b/contrib/libs/highwayhash/highwayhash/data_parallel_benchmark.cc
@@ -1,151 +1,151 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cmath>
-#include <cstdio>
-#include <future> //NOLINT
-#include <set>
-#include "testing/base/public/gunit.h"
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/data_parallel.h"
-#include "thread/threadpool.h"
-
-namespace highwayhash {
-namespace {
-
-constexpr int kBenchmarkTasks = 1000000;
-
-// Returns elapsed time [nanoseconds] for std::async.
-double BenchmarkAsync(uint64_t* total) {
- const base::Time t0 = base::Now();
- std::atomic<uint64_t> sum1{0};
- std::atomic<uint64_t> sum2{0};
-
- std::vector<std::future<void>> futures;
- futures.reserve(kBenchmarkTasks);
- for (int i = 0; i < kBenchmarkTasks; ++i) {
- futures.push_back(std::async(
- [&sum1, &sum2](const int i) {
- sum1.fetch_add(i);
- sum2.fetch_add(1);
- },
- i));
- }
-
- for (auto& future : futures) {
- future.get();
- }
-
- const base::Time t1 = base::Now();
- *total = sum1.load() + sum2.load();
- return base::ToDoubleNanoseconds(t1 - t0);
-}
-
-// Returns elapsed time [nanoseconds] for (atomic) ThreadPool.
-double BenchmarkPoolA(uint64_t* total) {
- const base::Time t0 = base::Now();
- std::atomic<uint64_t> sum1{0};
- std::atomic<uint64_t> sum2{0};
-
- ThreadPool pool;
- pool.Run(0, kBenchmarkTasks, [&sum1, &sum2](const int i) {
- sum1.fetch_add(i);
- sum2.fetch_add(1);
- });
-
- const base::Time t1 = base::Now();
- *total = sum1.load() + sum2.load();
- return base::ToDoubleNanoseconds(t1 - t0);
-}
-
-// Returns elapsed time [nanoseconds] for ::ThreadPool.
-double BenchmarkPoolG(uint64_t* total) {
- const base::Time t0 = base::Now();
- std::atomic<uint64_t> sum1{0};
- std::atomic<uint64_t> sum2{0};
-
- {
- ::ThreadPool pool(std::thread::hardware_concurrency());
- pool.StartWorkers();
- for (int i = 0; i < kBenchmarkTasks; ++i) {
- pool.Schedule([&sum1, &sum2, i]() {
- sum1.fetch_add(i);
- sum2.fetch_add(1);
- });
- }
- }
-
- const base::Time t1 = base::Now();
- *total = sum1.load() + sum2.load();
- return base::ToDoubleNanoseconds(t1 - t0);
-}
-
-// Compares ThreadPool speed to std::async and ::ThreadPool.
-TEST(DataParallelTest, Benchmarks) {
- uint64_t sum1, sum2, sum3;
- const double async_ns = BenchmarkAsync(&sum1);
- const double poolA_ns = BenchmarkPoolA(&sum2);
- const double poolG_ns = BenchmarkPoolG(&sum3);
-
- printf("Async %11.0f ns\nPoolA %11.0f ns\nPoolG %11.0f ns\n", async_ns,
- poolA_ns, poolG_ns);
- // baseline 20x, 10x with asan or msan, 5x with tsan
- EXPECT_GT(async_ns, poolA_ns * 4);
- // baseline 200x, 180x with asan, 70x with msan, 50x with tsan.
- EXPECT_GT(poolG_ns, poolA_ns * 20);
-
- // Should reach same result.
- EXPECT_EQ(sum1, sum2);
- EXPECT_EQ(sum2, sum3);
-}
-
-// Ensures multiple hardware threads are used (decided by the OS scheduler).
-TEST(DataParallelTest, TestApicIds) {
- for (int num_threads = 1; num_threads <= std::thread::hardware_concurrency();
- ++num_threads) {
- ThreadPool pool(num_threads);
-
- std::mutex mutex;
- std::set<unsigned> ids;
- double total = 0.0;
- pool.Run(0, 2 * num_threads, [&mutex, &ids, &total](const int i) {
- // Useless computations to keep the processor busy so that threads
- // can't just reuse the same processor.
- double sum = 0.0;
- for (int rep = 0; rep < 900 * (i + 30); ++rep) {
- sum += pow(rep, 0.5);
- }
-
- mutex.lock();
- ids.insert(ApicId());
- total += sum;
- mutex.unlock();
- });
-
- // No core ID / APIC ID available
- if (num_threads > 1 && ids.size() == 1) {
- EXPECT_EQ(0, *ids.begin());
- } else {
- // (The Linux scheduler doesn't use all available HTs, but the
- // computations should at least keep most cores busy.)
- EXPECT_GT(ids.size() + 2, num_threads / 4);
- }
-
- // (Ensure the busy-work is not elided.)
- EXPECT_GT(total, 1E4);
- }
-}
-
-} // namespace
-} // namespace highwayhash
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cmath>
+#include <cstdio>
+#include <future> //NOLINT
+#include <set>
+#include "testing/base/public/gunit.h"
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/data_parallel.h"
+#include "thread/threadpool.h"
+
+namespace highwayhash {
+namespace {
+
+constexpr int kBenchmarkTasks = 1000000;
+
+// Returns elapsed time [nanoseconds] for std::async.
+double BenchmarkAsync(uint64_t* total) {
+ const base::Time t0 = base::Now();
+ std::atomic<uint64_t> sum1{0};
+ std::atomic<uint64_t> sum2{0};
+
+ std::vector<std::future<void>> futures;
+ futures.reserve(kBenchmarkTasks);
+ for (int i = 0; i < kBenchmarkTasks; ++i) {
+ futures.push_back(std::async(
+ [&sum1, &sum2](const int i) {
+ sum1.fetch_add(i);
+ sum2.fetch_add(1);
+ },
+ i));
+ }
+
+ for (auto& future : futures) {
+ future.get();
+ }
+
+ const base::Time t1 = base::Now();
+ *total = sum1.load() + sum2.load();
+ return base::ToDoubleNanoseconds(t1 - t0);
+}
+
+// Returns elapsed time [nanoseconds] for (atomic) ThreadPool.
+double BenchmarkPoolA(uint64_t* total) {
+ const base::Time t0 = base::Now();
+ std::atomic<uint64_t> sum1{0};
+ std::atomic<uint64_t> sum2{0};
+
+ ThreadPool pool;
+ pool.Run(0, kBenchmarkTasks, [&sum1, &sum2](const int i) {
+ sum1.fetch_add(i);
+ sum2.fetch_add(1);
+ });
+
+ const base::Time t1 = base::Now();
+ *total = sum1.load() + sum2.load();
+ return base::ToDoubleNanoseconds(t1 - t0);
+}
+
+// Returns elapsed time [nanoseconds] for ::ThreadPool.
+double BenchmarkPoolG(uint64_t* total) {
+ const base::Time t0 = base::Now();
+ std::atomic<uint64_t> sum1{0};
+ std::atomic<uint64_t> sum2{0};
+
+ {
+ ::ThreadPool pool(std::thread::hardware_concurrency());
+ pool.StartWorkers();
+ for (int i = 0; i < kBenchmarkTasks; ++i) {
+ pool.Schedule([&sum1, &sum2, i]() {
+ sum1.fetch_add(i);
+ sum2.fetch_add(1);
+ });
+ }
+ }
+
+ const base::Time t1 = base::Now();
+ *total = sum1.load() + sum2.load();
+ return base::ToDoubleNanoseconds(t1 - t0);
+}
+
+// Compares ThreadPool speed to std::async and ::ThreadPool.
+TEST(DataParallelTest, Benchmarks) {
+ uint64_t sum1, sum2, sum3;
+ const double async_ns = BenchmarkAsync(&sum1);
+ const double poolA_ns = BenchmarkPoolA(&sum2);
+ const double poolG_ns = BenchmarkPoolG(&sum3);
+
+ printf("Async %11.0f ns\nPoolA %11.0f ns\nPoolG %11.0f ns\n", async_ns,
+ poolA_ns, poolG_ns);
+ // baseline 20x, 10x with asan or msan, 5x with tsan
+ EXPECT_GT(async_ns, poolA_ns * 4);
+ // baseline 200x, 180x with asan, 70x with msan, 50x with tsan.
+ EXPECT_GT(poolG_ns, poolA_ns * 20);
+
+ // Should reach same result.
+ EXPECT_EQ(sum1, sum2);
+ EXPECT_EQ(sum2, sum3);
+}
+
+// Ensures multiple hardware threads are used (decided by the OS scheduler).
+TEST(DataParallelTest, TestApicIds) {
+ for (int num_threads = 1; num_threads <= std::thread::hardware_concurrency();
+ ++num_threads) {
+ ThreadPool pool(num_threads);
+
+ std::mutex mutex;
+ std::set<unsigned> ids;
+ double total = 0.0;
+ pool.Run(0, 2 * num_threads, [&mutex, &ids, &total](const int i) {
+ // Useless computations to keep the processor busy so that threads
+ // can't just reuse the same processor.
+ double sum = 0.0;
+ for (int rep = 0; rep < 900 * (i + 30); ++rep) {
+ sum += pow(rep, 0.5);
+ }
+
+ mutex.lock();
+ ids.insert(ApicId());
+ total += sum;
+ mutex.unlock();
+ });
+
+ // No core ID / APIC ID available
+ if (num_threads > 1 && ids.size() == 1) {
+ EXPECT_EQ(0, *ids.begin());
+ } else {
+ // (The Linux scheduler doesn't use all available HTs, but the
+ // computations should at least keep most cores busy.)
+ EXPECT_GT(ids.size() + 2, num_threads / 4);
+ }
+
+ // (Ensure the busy-work is not elided.)
+ EXPECT_GT(total, 1E4);
+ }
+}
+
+} // namespace
+} // namespace highwayhash
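
base::Now, base::ToDoubleNanoseconds, and thread/threadpool.h are internal
dependencies, so this benchmark does not build as-is outside that
environment. A portable stand-in for the timing pair, assuming std::chrono
resolution suffices at these durations, might look like:

    #include <chrono>

    // Returns elapsed wall time in nanoseconds for one invocation of "func".
    // steady_clock is immune to system clock adjustments mid-measurement.
    template <class Func>
    double ElapsedNanos(const Func& func) {
      const auto t0 = std::chrono::steady_clock::now();
      func();
      const auto t1 = std::chrono::steady_clock::now();
      return std::chrono::duration<double, std::nano>(t1 - t0).count();
    }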
diff --git a/contrib/libs/highwayhash/highwayhash/data_parallel_test.cc b/contrib/libs/highwayhash/highwayhash/data_parallel_test.cc
index 2728b7d3ad..d733620099 100644
--- a/contrib/libs/highwayhash/highwayhash/data_parallel_test.cc
+++ b/contrib/libs/highwayhash/highwayhash/data_parallel_test.cc
@@ -1,175 +1,175 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <unistd.h>
-#include <cstdint>
-
-#include "testing/base/public/gunit.h"
-#include "highwayhash/data_parallel.h"
-
-namespace highwayhash {
-namespace {
-
-int PopulationCount(uint64_t bits) {
- int num_set = 0;
- while (bits != 0) {
- num_set += bits & 1;
- bits >>= 1;
- }
- return num_set;
-}
-
-std::atomic<int> func_counts{0};
-
-void Func2() {
- usleep(200000);
- func_counts.fetch_add(4);
-}
-
-void Func3() {
- usleep(300000);
- func_counts.fetch_add(16);
-}
-
-void Func4() {
- usleep(400000);
- func_counts.fetch_add(256);
-}
-
-// Exercises the RunTasks feature (running arbitrary tasks/closures)
-TEST(DataParallelTest, TestRunTasks) {
- ThreadPool pool(4);
- pool.RunTasks({Func2, Func3, Func4});
- EXPECT_EQ(276, func_counts.load());
-}
-
-// Ensures task parameter is in bounds, every parameter is reached,
-// pool can be reused (multiple consecutive Run calls), pool can be destroyed
-// (joining with its threads).
-TEST(DataParallelTest, TestPool) {
- for (int num_threads = 1; num_threads <= 18; ++num_threads) {
- ThreadPool pool(num_threads);
- for (int num_tasks = 0; num_tasks < 32; ++num_tasks) {
- std::vector<int> mementos(num_tasks, 0);
- for (int begin = 0; begin < 32; ++begin) {
- std::fill(mementos.begin(), mementos.end(), 0);
- pool.Run(begin, begin + num_tasks,
- [begin, num_tasks, &mementos](const int i) {
- // Parameter is in the given range
- EXPECT_GE(i, begin);
- EXPECT_LT(i, begin + num_tasks);
-
- // Store mementos to be sure we visited each i.
- mementos.at(i - begin) = 1000 + i;
- });
- for (int i = begin; i < begin + num_tasks; ++i) {
- EXPECT_EQ(1000 + i, mementos.at(i - begin));
- }
- }
- }
- }
-}
-
-TEST(DataParallelTest, TestRunRanges) {
- for (int num_threads = 1; num_threads <= 18; ++num_threads) {
- ThreadPool pool(num_threads);
- for (int num_tasks = 0; num_tasks < 32; ++num_tasks) {
- std::vector<int> mementos(num_tasks, 0);
- for (int begin = 0; begin < 32; ++begin) {
- std::fill(mementos.begin(), mementos.end(), 0);
- pool.RunRanges(begin, begin + num_tasks,
- [begin, num_tasks, &mementos](const int chunk,
- const uint32_t my_begin,
- const uint32_t my_end) {
- for (uint32_t i = my_begin; i < my_end; ++i) {
- // Parameter is in the given range
- EXPECT_GE(i, begin);
- EXPECT_LT(i, begin + num_tasks);
-
- // Store mementos to be sure we visited each i.
- mementos.at(i - begin) = 1000 + i;
- }
- });
- for (int i = begin; i < begin + num_tasks; ++i) {
- EXPECT_EQ(1000 + i, mementos.at(i - begin));
- }
- }
- }
- }
-}
-
-// Ensures each of N threads processes exactly 1 of N tasks, i.e. the
-// work distribution is perfectly fair for small counts.
-TEST(DataParallelTest, TestSmallAssignments) {
- for (int num_threads = 1; num_threads <= 64; ++num_threads) {
- ThreadPool pool(num_threads);
-
- std::atomic<int> counter{0};
- // (Avoid mutex because it may perturb the worker thread scheduling)
- std::atomic<uint64_t> id_bits{0};
-
- pool.Run(0, num_threads, [&counter, num_threads, &id_bits](const int i) {
- const int id = counter.fetch_add(1);
- EXPECT_LT(id, num_threads);
- uint64_t bits = id_bits.load(std::memory_order_relaxed);
- while (!id_bits.compare_exchange_weak(bits, bits | (1ULL << id))) {
- }
- });
-
- const int num_participants = PopulationCount(id_bits.load());
- EXPECT_EQ(num_threads, num_participants);
- }
-}
-
-// Test payload for PerThread.
-struct CheckUniqueIDs {
- bool IsNull() const { return false; }
- void Destroy() { id_bits = 0; }
- void Assimilate(const CheckUniqueIDs& victim) {
- // Cannot overlap because each PerThread has unique bits.
- EXPECT_EQ(0, id_bits & victim.id_bits);
- id_bits |= victim.id_bits;
- }
-
- uint64_t id_bits = 0;
-};
-
-// Ensures each thread has a PerThread instance, that they are successfully
-// combined/reduced into a single result, and that reuse is possible after
-// Destroy().
-TEST(DataParallelTest, TestPerThread) {
- // We use a uint64_t bit array for convenience => no more than 64 threads.
- const int max_threads = std::min(64U, std::thread::hardware_concurrency());
- for (int num_threads = 1; num_threads <= max_threads; ++num_threads) {
- ThreadPool pool(num_threads);
-
- std::atomic<int> counter{0};
- pool.Run(0, num_threads, [&counter, num_threads](const int i) {
- const int id = counter.fetch_add(1);
- EXPECT_LT(id, num_threads);
- PerThread<CheckUniqueIDs>::Get().id_bits |= 1ULL << id;
- });
-
- // Verify each thread's bit is set.
- const uint64_t all_bits = PerThread<CheckUniqueIDs>::Reduce().id_bits;
- // Avoid shifting by 64 (undefined).
- const uint64_t expected =
- num_threads == 64 ? ~0ULL : (1ULL << num_threads) - 1;
- EXPECT_EQ(expected, all_bits);
- PerThread<CheckUniqueIDs>::Destroy();
- }
-}
-
-} // namespace
-} // namespace highwayhash
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+#include <cstdint>
+
+#include "testing/base/public/gunit.h"
+#include "highwayhash/data_parallel.h"
+
+namespace highwayhash {
+namespace {
+
+int PopulationCount(uint64_t bits) {
+ int num_set = 0;
+ while (bits != 0) {
+ num_set += bits & 1;
+ bits >>= 1;
+ }
+ return num_set;
+}
+
+std::atomic<int> func_counts{0};
+
+void Func2() {
+ usleep(200000);
+ func_counts.fetch_add(4);
+}
+
+void Func3() {
+ usleep(300000);
+ func_counts.fetch_add(16);
+}
+
+void Func4() {
+ usleep(400000);
+ func_counts.fetch_add(256);
+}
+
+// Exercises the RunTasks feature (running arbitrary tasks/closures)
+TEST(DataParallelTest, TestRunTasks) {
+ ThreadPool pool(4);
+ pool.RunTasks({Func2, Func3, Func4});
+ EXPECT_EQ(276, func_counts.load());
+}
+
+// Ensures task parameter is in bounds, every parameter is reached,
+// pool can be reused (multiple consecutive Run calls), pool can be destroyed
+// (joining with its threads).
+TEST(DataParallelTest, TestPool) {
+ for (int num_threads = 1; num_threads <= 18; ++num_threads) {
+ ThreadPool pool(num_threads);
+ for (int num_tasks = 0; num_tasks < 32; ++num_tasks) {
+ std::vector<int> mementos(num_tasks, 0);
+ for (int begin = 0; begin < 32; ++begin) {
+ std::fill(mementos.begin(), mementos.end(), 0);
+ pool.Run(begin, begin + num_tasks,
+ [begin, num_tasks, &mementos](const int i) {
+ // Parameter is in the given range
+ EXPECT_GE(i, begin);
+ EXPECT_LT(i, begin + num_tasks);
+
+ // Store mementos to be sure we visited each i.
+ mementos.at(i - begin) = 1000 + i;
+ });
+ for (int i = begin; i < begin + num_tasks; ++i) {
+ EXPECT_EQ(1000 + i, mementos.at(i - begin));
+ }
+ }
+ }
+ }
+}
+
+TEST(DataParallelTest, TestRunRanges) {
+ for (int num_threads = 1; num_threads <= 18; ++num_threads) {
+ ThreadPool pool(num_threads);
+ for (int num_tasks = 0; num_tasks < 32; ++num_tasks) {
+ std::vector<int> mementos(num_tasks, 0);
+ for (int begin = 0; begin < 32; ++begin) {
+ std::fill(mementos.begin(), mementos.end(), 0);
+ pool.RunRanges(begin, begin + num_tasks,
+ [begin, num_tasks, &mementos](const int chunk,
+ const uint32_t my_begin,
+ const uint32_t my_end) {
+ for (uint32_t i = my_begin; i < my_end; ++i) {
+ // Parameter is in the given range
+ EXPECT_GE(i, begin);
+ EXPECT_LT(i, begin + num_tasks);
+
+ // Store mementos to be sure we visited each i.
+ mementos.at(i - begin) = 1000 + i;
+ }
+ });
+ for (int i = begin; i < begin + num_tasks; ++i) {
+ EXPECT_EQ(1000 + i, mementos.at(i - begin));
+ }
+ }
+ }
+ }
+}
+
+// Ensures each of N threads processes exactly 1 of N tasks, i.e. the
+// work distribution is perfectly fair for small counts.
+TEST(DataParallelTest, TestSmallAssignments) {
+ for (int num_threads = 1; num_threads <= 64; ++num_threads) {
+ ThreadPool pool(num_threads);
+
+ std::atomic<int> counter{0};
+ // (Avoid mutex because it may perturb the worker thread scheduling)
+ std::atomic<uint64_t> id_bits{0};
+
+ pool.Run(0, num_threads, [&counter, num_threads, &id_bits](const int i) {
+ const int id = counter.fetch_add(1);
+ EXPECT_LT(id, num_threads);
+ uint64_t bits = id_bits.load(std::memory_order_relaxed);
+ while (!id_bits.compare_exchange_weak(bits, bits | (1ULL << id))) {
+ }
+ });
+
+ const int num_participants = PopulationCount(id_bits.load());
+ EXPECT_EQ(num_threads, num_participants);
+ }
+}
+
+// Test payload for PerThread.
+struct CheckUniqueIDs {
+ bool IsNull() const { return false; }
+ void Destroy() { id_bits = 0; }
+ void Assimilate(const CheckUniqueIDs& victim) {
+ // Cannot overlap because each PerThread has unique bits.
+ EXPECT_EQ(0, id_bits & victim.id_bits);
+ id_bits |= victim.id_bits;
+ }
+
+ uint64_t id_bits = 0;
+};
+
+// Ensures each thread has a PerThread instance, that they are successfully
+// combined/reduced into a single result, and that reuse is possible after
+// Destroy().
+TEST(DataParallelTest, TestPerThread) {
+ // We use a uint64_t bit array for convenience => no more than 64 threads.
+ const int max_threads = std::min(64U, std::thread::hardware_concurrency());
+ for (int num_threads = 1; num_threads <= max_threads; ++num_threads) {
+ ThreadPool pool(num_threads);
+
+ std::atomic<int> counter{0};
+ pool.Run(0, num_threads, [&counter, num_threads](const int i) {
+ const int id = counter.fetch_add(1);
+ EXPECT_LT(id, num_threads);
+ PerThread<CheckUniqueIDs>::Get().id_bits |= 1ULL << id;
+ });
+
+ // Verify each thread's bit is set.
+ const uint64_t all_bits = PerThread<CheckUniqueIDs>::Reduce().id_bits;
+ // Avoid shifting by 64 (undefined).
+ const uint64_t expected =
+ num_threads == 64 ? ~0ULL : (1ULL << num_threads) - 1;
+ EXPECT_EQ(expected, all_bits);
+ PerThread<CheckUniqueIDs>::Destroy();
+ }
+}
+
+} // namespace
+} // namespace highwayhash
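
PopulationCount above loops once per bit, which is plenty for a test. Where
GCC/Clang builtins are available, a drop-in alternative (an assumption of
this sketch, not what the test uses) is:

    #include <cstdint>

    // Same result as PopulationCount; one instruction on most targets.
    static inline int PopCount64(uint64_t bits) {
    #if defined(__GNUC__) || defined(__clang__)
      return __builtin_popcountll(bits);
    #else
      int num_set = 0;
      for (; bits != 0; bits >>= 1) {
        num_set += static_cast<int>(bits & 1);
      }
      return num_set;
    #endif
    }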
diff --git a/contrib/libs/highwayhash/highwayhash/endianess.h b/contrib/libs/highwayhash/highwayhash/endianess.h
index 776a02fa21..6c82d6e50c 100644
--- a/contrib/libs/highwayhash/highwayhash/endianess.h
+++ b/contrib/libs/highwayhash/highwayhash/endianess.h
@@ -1,108 +1,108 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_ENDIANESS_H_
-#define HIGHWAYHASH_ENDIANESS_H_
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include <stdint.h>
-
-#if defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN)
-
- /* Someone has already included <endian.h> or equivalent. */
-
-#elif defined(__LITTLE_ENDIAN__)
-
-# define HH_IS_LITTLE_ENDIAN 1
-# define HH_IS_BIG_ENDIAN 0
-# ifdef __BIG_ENDIAN__
-# error "Platform is both little and big endian?"
-# endif
-
-#elif defined(__BIG_ENDIAN__)
-
-# define HH_IS_LITTLE_ENDIAN 0
-# define HH_IS_BIG_ENDIAN 1
-
-#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
- defined(__ORDER_LITTLE_ENDIAN__)
-
-# define HH_IS_LITTLE_ENDIAN (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-# define HH_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-
-#elif defined(__linux__) || defined(__CYGWIN__) || defined( __GNUC__ ) || \
- defined( __GNU_LIBRARY__ )
-
-# include <endian.h>
-
-#elif defined(__OpenBSD__) || defined(__NetBSD__) || defined(__FreeBSD__) || \
- defined(__DragonFly__)
-
-# include <sys/endian.h>
-
-#elif defined(_WIN32)
-
-#define HH_IS_LITTLE_ENDIAN 1
-#define HH_IS_BIG_ENDIAN 0
-
-#else
-
-# error "Unsupported platform. Cannot determine byte order."
-
-#endif
-
-
-#ifndef HH_IS_LITTLE_ENDIAN
-# define HH_IS_LITTLE_ENDIAN (BYTE_ORDER == LITTLE_ENDIAN)
-# define HH_IS_BIG_ENDIAN (BYTE_ORDER == BIG_ENDIAN)
-#endif
-
-
-namespace highwayhash {
-
-#if HH_IS_LITTLE_ENDIAN
-
-static inline uint32_t le32_from_host(uint32_t x) { return x; }
-static inline uint32_t host_from_le32(uint32_t x) { return x; }
-static inline uint64_t le64_from_host(uint64_t x) { return x; }
-static inline uint64_t host_from_le64(uint64_t x) { return x; }
-
-#elif !HH_IS_BIG_ENDIAN
-
-# error "Unsupported byte order."
-
-#elif defined(_WIN16) || defined(_WIN32) || defined(_WIN64)
-
-#include <intrin.h>
-static inline uint32_t host_from_le32(uint32_t x) { return _byteswap_ulong(x); }
-static inline uint32_t le32_from_host(uint32_t x) { return _byteswap_ulong(x); }
-static inline uint64_t host_from_le64(uint64_t x) { return _byteswap_uint64(x);}
-static inline uint64_t le64_from_host(uint64_t x) { return _byteswap_uint64(x);}
-
-#else
-
-static inline uint32_t host_from_le32(uint32_t x) {return __builtin_bswap32(x);}
-static inline uint32_t le32_from_host(uint32_t x) {return __builtin_bswap32(x);}
-static inline uint64_t host_from_le64(uint64_t x) {return __builtin_bswap64(x);}
-static inline uint64_t le64_from_host(uint64_t x) {return __builtin_bswap64(x);}
-
-#endif
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_ENDIANESS_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_ENDIANESS_H_
+#define HIGHWAYHASH_ENDIANESS_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stdint.h>
+
+#if defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN)
+
+ /* Someone has already included <endian.h> or equivalent. */
+
+#elif defined(__LITTLE_ENDIAN__)
+
+# define HH_IS_LITTLE_ENDIAN 1
+# define HH_IS_BIG_ENDIAN 0
+# ifdef __BIG_ENDIAN__
+# error "Platform is both little and big endian?"
+# endif
+
+#elif defined(__BIG_ENDIAN__)
+
+# define HH_IS_LITTLE_ENDIAN 0
+# define HH_IS_BIG_ENDIAN 1
+
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
+ defined(__ORDER_LITTLE_ENDIAN__)
+
+# define HH_IS_LITTLE_ENDIAN (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+# define HH_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+
+#elif defined(__linux__) || defined(__CYGWIN__) || defined( __GNUC__ ) || \
+ defined( __GNU_LIBRARY__ )
+
+# include <endian.h>
+
+#elif defined(__OpenBSD__) || defined(__NetBSD__) || defined(__FreeBSD__) || \
+ defined(__DragonFly__)
+
+# include <sys/endian.h>
+
+#elif defined(_WIN32)
+
+#define HH_IS_LITTLE_ENDIAN 1
+#define HH_IS_BIG_ENDIAN 0
+
+#else
+
+# error "Unsupported platform. Cannot determine byte order."
+
+#endif
+
+
+#ifndef HH_IS_LITTLE_ENDIAN
+# define HH_IS_LITTLE_ENDIAN (BYTE_ORDER == LITTLE_ENDIAN)
+# define HH_IS_BIG_ENDIAN (BYTE_ORDER == BIG_ENDIAN)
+#endif
+
+
+namespace highwayhash {
+
+#if HH_IS_LITTLE_ENDIAN
+
+static inline uint32_t le32_from_host(uint32_t x) { return x; }
+static inline uint32_t host_from_le32(uint32_t x) { return x; }
+static inline uint64_t le64_from_host(uint64_t x) { return x; }
+static inline uint64_t host_from_le64(uint64_t x) { return x; }
+
+#elif !HH_IS_BIG_ENDIAN
+
+# error "Unsupported byte order."
+
+#elif defined(_WIN16) || defined(_WIN32) || defined(_WIN64)
+
+#include <intrin.h>
+static inline uint32_t host_from_le32(uint32_t x) { return _byteswap_ulong(x); }
+static inline uint32_t le32_from_host(uint32_t x) { return _byteswap_ulong(x); }
+static inline uint64_t host_from_le64(uint64_t x) { return _byteswap_uint64(x);}
+static inline uint64_t le64_from_host(uint64_t x) { return _byteswap_uint64(x);}
+
+#else
+
+static inline uint32_t host_from_le32(uint32_t x) {return __builtin_bswap32(x);}
+static inline uint32_t le32_from_host(uint32_t x) {return __builtin_bswap32(x);}
+static inline uint64_t host_from_le64(uint64_t x) {return __builtin_bswap64(x);}
+static inline uint64_t le64_from_host(uint64_t x) {return __builtin_bswap64(x);}
+
+#endif
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_ENDIANESS_H_
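
On little-endian hosts these helpers are identity functions; on big-endian
hosts they byte-swap, so a store/load pair always round-trips. A minimal
serialization sketch; StoreLE64/LoadLE64 are illustrative names, not part of
the header:

    #include <stdint.h>
    #include <string.h>
    #include "highwayhash/endianess.h"

    // Writes "x" to "bytes" in little-endian order regardless of host order.
    static inline void StoreLE64(uint64_t x, char* bytes) {
      const uint64_t le = highwayhash::le64_from_host(x);
      memcpy(bytes, &le, sizeof(le));
    }

    // Reads a little-endian uint64_t back into host byte order.
    static inline uint64_t LoadLE64(const char* bytes) {
      uint64_t le;
      memcpy(&le, bytes, sizeof(le));
      return highwayhash::host_from_le64(le);
    }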
diff --git a/contrib/libs/highwayhash/highwayhash/example.cc b/contrib/libs/highwayhash/highwayhash/example.cc
index 587e3c5985..ed7c6a3173 100644
--- a/contrib/libs/highwayhash/highwayhash/example.cc
+++ b/contrib/libs/highwayhash/highwayhash/example.cc
@@ -1,30 +1,30 @@
-// Minimal usage example: prints a hash. Tested on x86, ppc, arm.
-
-#include "highwayhash/highwayhash.h"
-
-#include <algorithm>
-#include <iostream>
-
-using namespace highwayhash;
-
-int main(int argc, char* argv[]) {
- // Please use a different key to ensure your hashes aren't identical.
- const HHKey key HH_ALIGNAS(32) = {1, 2, 3, 4};
- // Aligning inputs to 32 bytes may help but is not required.
- const char in[] = "bytes_to_hash";
- // Type determines the hash size; can also be HHResult128 or HHResult256.
- HHResult64 result;
- // HH_TARGET_PREFERRED expands to the best specialization available for the
- // CPU detected via compiler flags (e.g. AVX2 #ifdef __AVX2__).
- HHStateT<HH_TARGET_PREFERRED> state(key);
- // Using argc prevents the compiler from eliding the hash computations.
- const size_t size = std::min(sizeof(in), static_cast<size_t>(argc));
- HighwayHashT(&state, in, size, &result);
- std::cout << "Hash : " << result << std::endl;
-
- HighwayHashCatT<HH_TARGET_PREFERRED> cat(key);
- cat.Append(in, size);
- cat.Finalize(&result);
- std::cout << "HashCat: " << result << std::endl;
- return 0;
-}
+// Minimal usage example: prints a hash. Tested on x86, ppc, arm.
+
+#include "highwayhash/highwayhash.h"
+
+#include <algorithm>
+#include <iostream>
+
+using namespace highwayhash;
+
+int main(int argc, char* argv[]) {
+ // Please use a different key to ensure your hashes aren't identical.
+ const HHKey key HH_ALIGNAS(32) = {1, 2, 3, 4};
+ // Aligning inputs to 32 bytes may help but is not required.
+ const char in[] = "bytes_to_hash";
+ // Type determines the hash size; can also be HHResult128 or HHResult256.
+ HHResult64 result;
+ // HH_TARGET_PREFERRED expands to the best specialization available for the
+ // CPU detected via compiler flags (e.g. AVX2 #ifdef __AVX2__).
+ HHStateT<HH_TARGET_PREFERRED> state(key);
+ // Using argc prevents the compiler from eliding the hash computations.
+ const size_t size = std::min(sizeof(in), static_cast<size_t>(argc));
+ HighwayHashT(&state, in, size, &result);
+ std::cout << "Hash : " << result << std::endl;
+
+ HighwayHashCatT<HH_TARGET_PREFERRED> cat(key);
+ cat.Append(in, size);
+ cat.Finalize(&result);
+ std::cout << "HashCat: " << result << std::endl;
+ return 0;
+}
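
Only the result type selects the hash width. A 256-bit variant under the same
headers, sketched as an illustrative wrapper (Hash256 is not a library
function):

    #include <cstddef>
    #include "highwayhash/highwayhash.h"

    // HHResult256 is an array type, so the result is returned via out-param.
    void Hash256(const highwayhash::HHKey key, const char* in,
                 const size_t size, highwayhash::HHResult256* result) {
      highwayhash::HHStateT<HH_TARGET_PREFERRED> state(key);
      HighwayHashT(&state, in, size, result);  // found via ADL
    }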
diff --git a/contrib/libs/highwayhash/highwayhash/hh_avx2.cc b/contrib/libs/highwayhash/highwayhash/hh_avx2.cc
index 7e3ddff0d4..b4477ad2e2 100644
--- a/contrib/libs/highwayhash/highwayhash/hh_avx2.cc
+++ b/contrib/libs/highwayhash/highwayhash/hh_avx2.cc
@@ -1,19 +1,19 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// WARNING: this is a "restricted" source file; avoid including any headers
-// unless they are also restricted. See arch_specific.h for details.
-
-#define HH_TARGET_NAME AVX2
-#include "highwayhash/highwayhash_target.cc"
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME AVX2
+#include "highwayhash/highwayhash_target.cc"
diff --git a/contrib/libs/highwayhash/highwayhash/hh_avx2.h b/contrib/libs/highwayhash/highwayhash/hh_avx2.h
index 2912a31b11..dfb85ab32a 100644
--- a/contrib/libs/highwayhash/highwayhash/hh_avx2.h
+++ b/contrib/libs/highwayhash/highwayhash/hh_avx2.h
@@ -1,383 +1,383 @@
-// Copyright 2015-2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_HH_AVX2_H_
-#define HIGHWAYHASH_HH_AVX2_H_
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-#include "highwayhash/hh_buffer.h"
-#include "highwayhash/hh_types.h"
-#include "highwayhash/load3.h"
-#include "highwayhash/vector128.h"
-#include "highwayhash/vector256.h"
-
-// For auto-dependency generation, we need to include all headers but not their
-// contents (otherwise compilation fails because -mavx2 is not specified).
-#ifndef HH_DISABLE_TARGET_SPECIFIC
-
-namespace highwayhash {
-// See vector128.h for why this namespace is necessary; matching it here makes
-// it easier to use the vector128 symbols, but requires textual inclusion.
-namespace HH_TARGET_NAME {
-
-class HHStateAVX2 {
- public:
- explicit HH_INLINE HHStateAVX2(const HHKey key_lanes) { Reset(key_lanes); }
-
- HH_INLINE void Reset(const HHKey key_lanes) {
- // "Nothing up my sleeve" numbers, concatenated hex digits of Pi from
- // http://www.numberworld.org/digits/Pi/, retrieved Feb 22, 2016.
- //
- // We use this Python code to generate the fourth number, to obtain a
- // more even mixture of bits:
- /*
-def x(a,b,c):
- retval = 0
- for i in range(64):
- count = ((a >> i) & 1) + ((b >> i) & 1) + ((c >> i) & 1)
- if (count <= 1):
- retval |= 1 << i
- return retval
- */
- const V4x64U init0(0x243f6a8885a308d3ull, 0x13198a2e03707344ull,
- 0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full);
- const V4x64U init1(0x452821e638d01377ull, 0xbe5466cf34e90c6cull,
- 0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull);
- const V4x64U key = LoadUnaligned<V4x64U>(key_lanes);
- v0 = key ^ init0;
- v1 = Rotate64By32(key) ^ init1;
- mul0 = init0;
- mul1 = init1;
- }
-
- HH_INLINE void Update(const HHPacket& packet_bytes) {
- const uint64_t* HH_RESTRICT packet =
- reinterpret_cast<const uint64_t * HH_RESTRICT>(packet_bytes);
- Update(LoadUnaligned<V4x64U>(packet));
- }
-
- HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
- // 'Length padding' differentiates zero-valued inputs that have the same
- // size/32. mod32 is sufficient because each Update behaves as if a
- // counter were injected, because the state is large and mixed thoroughly.
- const V8x32U size256(
- _mm256_broadcastd_epi32(_mm_cvtsi64_si128(size_mod32)));
- // Equivalent to storing size_mod32 in packet.
- v0 += V4x64U(size256);
- // Boosts the avalanche effect of mod32.
- v1 = Rotate32By(v1, size256);
-
- const char* remainder = bytes + (size_mod32 & ~3);
- const size_t size_mod4 = size_mod32 & 3;
-
- const V4x32U size(_mm256_castsi256_si128(size256));
-
- // (Branching is faster than a single _mm256_maskload_epi32.)
- if (HH_UNLIKELY(size_mod32 & 16)) { // 16..31 bytes left
- const V4x32U packetL =
- LoadUnaligned<V4x32U>(reinterpret_cast<const uint32_t*>(bytes));
-
- const V4x32U int_mask = IntMask<16>()(size);
- const V4x32U int_lanes = MaskedLoadInt(bytes + 16, int_mask);
- const uint32_t last4 =
- Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
-
- // The upper four bytes of packetH are zero, so insert there.
- const V4x32U packetH(_mm_insert_epi32(int_lanes, last4, 3));
- Update(packetH, packetL);
- } else { // size_mod32 < 16
- const V4x32U int_mask = IntMask<0>()(size);
- const V4x32U packetL = MaskedLoadInt(bytes, int_mask);
- const uint64_t last3 =
- Load3()(Load3::AllowUnordered(), remainder, size_mod4);
-
- // Rather than insert into packetL[3], it is faster to initialize
- // the otherwise empty packetH.
- const V4x32U packetH(_mm_cvtsi64_si128(last3));
- Update(packetH, packetL);
- }
- }
-
- HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
- // Mix together all lanes. It is slightly better to permute v0 than v1;
- // it will be added to v1.
- Update(Permute(v0));
- Update(Permute(v0));
- Update(Permute(v0));
- Update(Permute(v0));
-
- const V2x64U sum0(_mm256_castsi256_si128(v0 + mul0));
- const V2x64U sum1(_mm256_castsi256_si128(v1 + mul1));
- const V2x64U hash = sum0 + sum1;
- // Each lane is sufficiently mixed, so just truncate to 64 bits.
- _mm_storel_epi64(reinterpret_cast<__m128i*>(result), hash);
- }
-
- HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
- Update(Permute(v0));
- Update(Permute(v0));
- Update(Permute(v0));
- Update(Permute(v0));
-
- const V2x64U sum0(_mm256_castsi256_si128(v0 + mul0));
- const V2x64U sum1(_mm256_extracti128_si256(v1 + mul1, 1));
- const V2x64U hash = sum0 + sum1;
- _mm_storeu_si128(reinterpret_cast<__m128i*>(result), hash);
- }
-
- HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
- Update(Permute(v0));
- Update(Permute(v0));
- Update(Permute(v0));
- Update(Permute(v0));
-
- const V4x64U sum0 = v0 + mul0;
- const V4x64U sum1 = v1 + mul1;
- const V4x64U hash = ModularReduction(sum1, sum0);
- StoreUnaligned(hash, &(*result)[0]);
- }
-
- // "buffer" must be 32-byte aligned.
- static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer) {
- const __m256i zero = _mm256_setzero_si256();
- _mm256_store_si256(reinterpret_cast<__m256i*>(buffer), zero);
- }
-
- // "buffer" must be 32-byte aligned.
- static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
- const size_t size_mod32,
- char* HH_RESTRICT buffer) {
- const V4x32U size(size_mod32);
- const uint32_t* const HH_RESTRICT from_u32 =
- reinterpret_cast<const uint32_t * HH_RESTRICT>(from);
- uint32_t* const HH_RESTRICT buffer_u32 =
- reinterpret_cast<uint32_t * HH_RESTRICT>(buffer);
- if (HH_UNLIKELY(size_mod32 & 16)) { // Copying 16..31 bytes
- const V4x32U inL = LoadUnaligned<V4x32U>(from_u32);
- Store(inL, buffer_u32);
- const V4x32U inH = Load0To16<16, Load3::AllowReadBefore>(
- from + 16, size_mod32 - 16, size);
- Store(inH, buffer_u32 + V4x32U::N);
- } else { // Copying 0..15 bytes
- const V4x32U inL = Load0To16<>(from, size_mod32, size);
- Store(inL, buffer_u32);
- // No need to change upper 16 bytes of buffer.
- }
- }
-
- // "buffer" must be 32-byte aligned.
- static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
- const size_t size_mod32,
- char* HH_RESTRICT buffer,
- const size_t buffer_valid) {
- const V4x32U size(size_mod32);
- uint32_t* const HH_RESTRICT buffer_u32 =
- reinterpret_cast<uint32_t * HH_RESTRICT>(buffer);
- // buffer_valid + size <= 32 => appending 0..16 bytes inside upper 16 bytes.
- if (HH_UNLIKELY(buffer_valid & 16)) {
- const V4x32U suffix = Load0To16<>(from, size_mod32, size);
- const V4x32U bufferH = Load<V4x32U>(buffer_u32 + V4x32U::N);
- const V4x32U outH = Concatenate(bufferH, buffer_valid - 16, suffix);
- Store(outH, buffer_u32 + V4x32U::N);
- } else { // Appending 0..32 bytes starting at offset 0..15.
- const V4x32U bufferL = Load<V4x32U>(buffer_u32);
- const V4x32U suffixL = Load0To16<>(from, size_mod32, size);
- const V4x32U outL = Concatenate(bufferL, buffer_valid, suffixL);
- Store(outL, buffer_u32);
- const size_t offsetH = sizeof(V4x32U) - buffer_valid;
- // Do we have enough input to start filling the upper 16 buffer bytes?
- if (size_mod32 > offsetH) {
- const size_t sizeH = size_mod32 - offsetH;
- const V4x32U outH = Load0To16<>(from + offsetH, sizeH, V4x32U(sizeH));
- Store(outH, buffer_u32 + V4x32U::N);
- }
- }
- }
-
- // "buffer" must be 32-byte aligned.
- HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
- const size_t size_mod32,
- const char* HH_RESTRICT buffer,
- const size_t buffer_valid) {
- const V4x32U size(size_mod32);
- const uint32_t* const HH_RESTRICT buffer_u32 =
- reinterpret_cast<const uint32_t * HH_RESTRICT>(buffer);
- // buffer_valid + size <= 32 => appending 0..16 bytes inside upper 16 bytes.
- if (HH_UNLIKELY(buffer_valid & 16)) {
- const V4x32U suffix = Load0To16<>(from, size_mod32, size);
- const V4x32U packetL = Load<V4x32U>(buffer_u32);
- const V4x32U bufferH = Load<V4x32U>(buffer_u32 + V4x32U::N);
- const V4x32U packetH = Concatenate(bufferH, buffer_valid - 16, suffix);
- Update(packetH, packetL);
- } else { // Appending 0..32 bytes starting at offset 0..15.
- const V4x32U bufferL = Load<V4x32U>(buffer_u32);
- const V4x32U suffixL = Load0To16<>(from, size_mod32, size);
- const V4x32U packetL = Concatenate(bufferL, buffer_valid, suffixL);
- const size_t offsetH = sizeof(V4x32U) - buffer_valid;
- V4x32U packetH = packetL - packetL;
- // Do we have enough input to start filling the upper 16 packet bytes?
- if (size_mod32 > offsetH) {
- const size_t sizeH = size_mod32 - offsetH;
- packetH = Load0To16<>(from + offsetH, sizeH, V4x32U(sizeH));
- }
-
- Update(packetH, packetL);
- }
- }
-
- private:
- static HH_INLINE V4x32U MaskedLoadInt(const char* from,
- const V4x32U& int_mask) {
- // No faults will be raised when reading n=0..3 ints from "from" provided
- // int_mask[n] = 0.
- const int* HH_RESTRICT int_from = reinterpret_cast<const int*>(from);
- return V4x32U(_mm_maskload_epi32(int_from, int_mask));
- }
-
- // Loads <= 16 bytes without accessing any byte outside [from, from + size).
- // from[i] is loaded into lane i; from[i >= size] is undefined.
- template <uint32_t kSizeOffset = 0, class Load3Policy = Load3::AllowNone>
- static HH_INLINE V4x32U Load0To16(const char* from, const size_t size_mod32,
- const V4x32U& size) {
- const char* remainder = from + (size_mod32 & ~3);
- const uint64_t last3 = Load3()(Load3Policy(), remainder, size_mod32 & 3);
- const V4x32U int_mask = IntMask<kSizeOffset>()(size);
- const V4x32U int_lanes = MaskedLoadInt(from, int_mask);
- return Insert4AboveMask(last3, int_mask, int_lanes);
- }
-
- static HH_INLINE V4x64U Rotate64By32(const V4x64U& v) {
- return V4x64U(_mm256_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
- }
-
- // Rotates 32-bit lanes by "count" bits.
- static HH_INLINE V4x64U Rotate32By(const V4x64U& v, const V8x32U& count) {
- // Use variable shifts because sll_epi32 has 4 cycle latency (presumably
- // to broadcast the shift count).
- const V4x64U shifted_left(_mm256_sllv_epi32(v, count));
- const V4x64U shifted_right(_mm256_srlv_epi32(v, V8x32U(32) - count));
- return shifted_left | shifted_right;
- }
-
- static HH_INLINE V4x64U Permute(const V4x64U& v) {
- // For complete mixing, we need to swap the upper and lower 128-bit halves;
- // we also swap all 32-bit halves. This is faster than extracti128 plus
- // inserti128 followed by Rotate64By32.
- const V4x64U indices(0x0000000200000003ull, 0x0000000000000001ull,
- 0x0000000600000007ull, 0x0000000400000005ull);
- return V4x64U(_mm256_permutevar8x32_epi32(v, indices));
- }
-
- static HH_INLINE V4x64U MulLow32(const V4x64U& a, const V4x64U& b) {
- return V4x64U(_mm256_mul_epu32(a, b));
- }
-
- static HH_INLINE V4x64U ZipperMerge(const V4x64U& v) {
- // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
- // varying degrees. In descending order of goodness, bytes
- // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
- // As expected, the upper and lower bytes are much worse.
- // For each 64-bit lane, our objectives are:
- // 1) maximizing and equalizing total goodness across the four lanes.
- // 2) mixing with bytes from the neighboring lane (AVX-2 makes it difficult
- // to cross the 128-bit wall, but PermuteAndUpdate takes care of that);
- // 3) placing the worst bytes in the upper 32 bits because those will not
- // be used in the next 32x32 multiplication.
- const uint64_t hi = 0x070806090D0A040Bull;
- const uint64_t lo = 0x000F010E05020C03ull;
- return V4x64U(_mm256_shuffle_epi8(v, V4x64U(hi, lo, hi, lo)));
- }
-
- // Updates four hash lanes in parallel by injecting four 64-bit packets.
- HH_INLINE void Update(const V4x64U& packet) {
- v1 += packet;
- v1 += mul0;
- mul0 ^= MulLow32(v1, v0 >> 32);
- HH_COMPILER_FENCE;
- v0 += mul1;
- mul1 ^= MulLow32(v0, v1 >> 32);
- HH_COMPILER_FENCE;
- v0 += ZipperMerge(v1);
- v1 += ZipperMerge(v0);
- }
-
- HH_INLINE void Update(const V4x32U& packetH, const V4x32U& packetL) {
- const __m256i packetL256 = _mm256_castsi128_si256(packetL);
- Update(V4x64U(_mm256_inserti128_si256(packetL256, packetH, 1)));
- }
-
- // XORs a << 1 and a << 2 into *out after clearing the upper two bits of a.
- // Also does the same for the upper 128 bit lane "b". Bit shifts are only
- // possible on independent 64-bit lanes. We therefore insert the upper bits
- // of a[0] that were lost into a[1]. Thanks to D. Lemire for helpful comments!
- static HH_INLINE void XorByShift128Left12(const V4x64U& ba,
- V4x64U* HH_RESTRICT out) {
- const V4x64U zero = ba ^ ba;
- const V4x64U top_bits2 = ba >> (64 - 2);
- const V4x64U ones = ba == ba; // FF .. FF
- const V4x64U shifted1_unmasked = ba + ba; // (avoids needing port0)
- HH_COMPILER_FENCE;
-
- // Only the lower halves of top_bits1's 128 bit lanes will be used, so we
- // can compute it before clearing the upper two bits of ba.
- const V4x64U top_bits1 = ba >> (64 - 1);
- const V4x64U upper_8bytes(_mm256_slli_si256(ones, 8)); // F 0 F 0
- const V4x64U shifted2 = shifted1_unmasked + shifted1_unmasked;
- HH_COMPILER_FENCE;
-
- const V4x64U upper_bit_of_128 = upper_8bytes << 63; // 80..00 80..00
- const V4x64U new_low_bits2(_mm256_unpacklo_epi64(zero, top_bits2));
- *out ^= shifted2;
- HH_COMPILER_FENCE;
-
- // The result must be as if the upper two bits of the input had been clear,
- // otherwise we're no longer computing a reduction.
- const V4x64U shifted1 = AndNot(upper_bit_of_128, shifted1_unmasked);
- *out ^= new_low_bits2;
- HH_COMPILER_FENCE;
-
- const V4x64U new_low_bits1(_mm256_unpacklo_epi64(zero, top_bits1));
- *out ^= shifted1;
-
- *out ^= new_low_bits1;
- }
-
- // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
- // Input: two 256-bit numbers a3210 and b3210, interleaved in 2 vectors.
- // The upper and lower 128-bit halves are processed independently.
- static HH_INLINE V4x64U ModularReduction(const V4x64U& b32a32,
- const V4x64U& b10a10) {
- // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
- V4x64U out = b10a10;
- XorByShift128Left12(b32a32, &out);
- return out;
- }
-
- V4x64U v0;
- V4x64U v1;
- V4x64U mul0;
- V4x64U mul1;
-};
-
-} // namespace HH_TARGET_NAME
-} // namespace highwayhash
-
-#endif // HH_DISABLE_TARGET_SPECIFIC
-#endif // HIGHWAYHASH_HH_AVX2_H_
+// Copyright 2015-2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_AVX2_H_
+#define HIGHWAYHASH_HH_AVX2_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_buffer.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/load3.h"
+#include "highwayhash/vector128.h"
+#include "highwayhash/vector256.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents (otherwise compilation fails because -mavx2 is not specified).
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+namespace highwayhash {
+// See vector128.h for why this namespace is necessary; matching it here makes
+// it easier to use the vector128 symbols, but requires textual inclusion.
+namespace HH_TARGET_NAME {
+
+class HHStateAVX2 {
+ public:
+ explicit HH_INLINE HHStateAVX2(const HHKey key_lanes) { Reset(key_lanes); }
+
+ HH_INLINE void Reset(const HHKey key_lanes) {
+ // "Nothing up my sleeve" numbers, concatenated hex digits of Pi from
+ // http://www.numberworld.org/digits/Pi/, retrieved Feb 22, 2016.
+ //
+ // We use this Python code to generate the fourth number, to obtain a
+ // more even mixture of bits:
+ /*
+def x(a,b,c):
+ retval = 0
+ for i in range(64):
+ count = ((a >> i) & 1) + ((b >> i) & 1) + ((c >> i) & 1)
+ if (count <= 1):
+ retval |= 1 << i
+ return retval
+ */
+ const V4x64U init0(0x243f6a8885a308d3ull, 0x13198a2e03707344ull,
+ 0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full);
+ const V4x64U init1(0x452821e638d01377ull, 0xbe5466cf34e90c6cull,
+ 0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull);
+ const V4x64U key = LoadUnaligned<V4x64U>(key_lanes);
+ v0 = key ^ init0;
+ v1 = Rotate64By32(key) ^ init1;
+ mul0 = init0;
+ mul1 = init1;
+ }
+
+ HH_INLINE void Update(const HHPacket& packet_bytes) {
+ const uint64_t* HH_RESTRICT packet =
+ reinterpret_cast<const uint64_t * HH_RESTRICT>(packet_bytes);
+ Update(LoadUnaligned<V4x64U>(packet));
+ }
+
+ HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+ // 'Length padding' differentiates zero-valued inputs that have the same
+ // size/32. mod32 is sufficient because each Update behaves as if a
+ // counter were injected, because the state is large and mixed thoroughly.
+ const V8x32U size256(
+ _mm256_broadcastd_epi32(_mm_cvtsi64_si128(size_mod32)));
+ // Equivalent to storing size_mod32 in packet.
+ v0 += V4x64U(size256);
+ // Boosts the avalanche effect of mod32.
+ v1 = Rotate32By(v1, size256);
+
+ const char* remainder = bytes + (size_mod32 & ~3);
+ const size_t size_mod4 = size_mod32 & 3;
+
+ const V4x32U size(_mm256_castsi256_si128(size256));
+
+ // (Branching is faster than a single _mm256_maskload_epi32.)
+ if (HH_UNLIKELY(size_mod32 & 16)) { // 16..31 bytes left
+ const V4x32U packetL =
+ LoadUnaligned<V4x32U>(reinterpret_cast<const uint32_t*>(bytes));
+
+ const V4x32U int_mask = IntMask<16>()(size);
+ const V4x32U int_lanes = MaskedLoadInt(bytes + 16, int_mask);
+ const uint32_t last4 =
+ Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+
+ // The upper four bytes of packetH are zero, so insert there.
+ const V4x32U packetH(_mm_insert_epi32(int_lanes, last4, 3));
+ Update(packetH, packetL);
+ } else { // size_mod32 < 16
+ const V4x32U int_mask = IntMask<0>()(size);
+ const V4x32U packetL = MaskedLoadInt(bytes, int_mask);
+ const uint64_t last3 =
+ Load3()(Load3::AllowUnordered(), remainder, size_mod4);
+
+ // Rather than insert into packetL[3], it is faster to initialize
+ // the otherwise empty packetH.
+ const V4x32U packetH(_mm_cvtsi64_si128(last3));
+ Update(packetH, packetL);
+ }
+ }
+
+ HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
+ // Mix together all lanes. It is slightly better to permute v0 than v1;
+ // it will be added to v1.
+ Update(Permute(v0));
+ Update(Permute(v0));
+ Update(Permute(v0));
+ Update(Permute(v0));
+
+ const V2x64U sum0(_mm256_castsi256_si128(v0 + mul0));
+ const V2x64U sum1(_mm256_castsi256_si128(v1 + mul1));
+ const V2x64U hash = sum0 + sum1;
+ // Each lane is sufficiently mixed, so just truncate to 64 bits.
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(result), hash);
+ }
+
+ HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
+ Update(Permute(v0));
+ Update(Permute(v0));
+ Update(Permute(v0));
+ Update(Permute(v0));
+
+ const V2x64U sum0(_mm256_castsi256_si128(v0 + mul0));
+ const V2x64U sum1(_mm256_extracti128_si256(v1 + mul1, 1));
+ const V2x64U hash = sum0 + sum1;
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(result), hash);
+ }
+
+ HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
+ Update(Permute(v0));
+ Update(Permute(v0));
+ Update(Permute(v0));
+ Update(Permute(v0));
+
+ const V4x64U sum0 = v0 + mul0;
+ const V4x64U sum1 = v1 + mul1;
+ const V4x64U hash = ModularReduction(sum1, sum0);
+ StoreUnaligned(hash, &(*result)[0]);
+ }
+
+ // "buffer" must be 32-byte aligned.
+ static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_store_si256(reinterpret_cast<__m256i*>(buffer), zero);
+ }
+
+ // "buffer" must be 32-byte aligned.
+ static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
+ const size_t size_mod32,
+ char* HH_RESTRICT buffer) {
+ const V4x32U size(size_mod32);
+ const uint32_t* const HH_RESTRICT from_u32 =
+ reinterpret_cast<const uint32_t * HH_RESTRICT>(from);
+ uint32_t* const HH_RESTRICT buffer_u32 =
+ reinterpret_cast<uint32_t * HH_RESTRICT>(buffer);
+ if (HH_UNLIKELY(size_mod32 & 16)) { // Copying 16..31 bytes
+ const V4x32U inL = LoadUnaligned<V4x32U>(from_u32);
+ Store(inL, buffer_u32);
+ const V4x32U inH = Load0To16<16, Load3::AllowReadBefore>(
+ from + 16, size_mod32 - 16, size);
+ Store(inH, buffer_u32 + V4x32U::N);
+ } else { // Copying 0..15 bytes
+ const V4x32U inL = Load0To16<>(from, size_mod32, size);
+ Store(inL, buffer_u32);
+ // No need to change upper 16 bytes of buffer.
+ }
+ }
+
+ // "buffer" must be 32-byte aligned.
+ static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
+ const size_t size_mod32,
+ char* HH_RESTRICT buffer,
+ const size_t buffer_valid) {
+ const V4x32U size(size_mod32);
+ uint32_t* const HH_RESTRICT buffer_u32 =
+ reinterpret_cast<uint32_t * HH_RESTRICT>(buffer);
+ // buffer_valid + size <= 32 => appending 0..16 bytes inside upper 16 bytes.
+ if (HH_UNLIKELY(buffer_valid & 16)) {
+ const V4x32U suffix = Load0To16<>(from, size_mod32, size);
+ const V4x32U bufferH = Load<V4x32U>(buffer_u32 + V4x32U::N);
+ const V4x32U outH = Concatenate(bufferH, buffer_valid - 16, suffix);
+ Store(outH, buffer_u32 + V4x32U::N);
+ } else { // Appending 0..32 bytes starting at offset 0..15.
+ const V4x32U bufferL = Load<V4x32U>(buffer_u32);
+ const V4x32U suffixL = Load0To16<>(from, size_mod32, size);
+ const V4x32U outL = Concatenate(bufferL, buffer_valid, suffixL);
+ Store(outL, buffer_u32);
+ const size_t offsetH = sizeof(V4x32U) - buffer_valid;
+ // Do we have enough input to start filling the upper 16 buffer bytes?
+ if (size_mod32 > offsetH) {
+ const size_t sizeH = size_mod32 - offsetH;
+ const V4x32U outH = Load0To16<>(from + offsetH, sizeH, V4x32U(sizeH));
+ Store(outH, buffer_u32 + V4x32U::N);
+ }
+ }
+ }
+
+ // "buffer" must be 32-byte aligned.
+ HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
+ const size_t size_mod32,
+ const char* HH_RESTRICT buffer,
+ const size_t buffer_valid) {
+ const V4x32U size(size_mod32);
+ const uint32_t* const HH_RESTRICT buffer_u32 =
+ reinterpret_cast<const uint32_t * HH_RESTRICT>(buffer);
+ // buffer_valid + size <= 32 => appending 0..16 bytes inside upper 16 bytes.
+ if (HH_UNLIKELY(buffer_valid & 16)) {
+ const V4x32U suffix = Load0To16<>(from, size_mod32, size);
+ const V4x32U packetL = Load<V4x32U>(buffer_u32);
+ const V4x32U bufferH = Load<V4x32U>(buffer_u32 + V4x32U::N);
+ const V4x32U packetH = Concatenate(bufferH, buffer_valid - 16, suffix);
+ Update(packetH, packetL);
+ } else { // Appending 0..32 bytes starting at offset 0..15.
+ const V4x32U bufferL = Load<V4x32U>(buffer_u32);
+ const V4x32U suffixL = Load0To16<>(from, size_mod32, size);
+ const V4x32U packetL = Concatenate(bufferL, buffer_valid, suffixL);
+ const size_t offsetH = sizeof(V4x32U) - buffer_valid;
+ V4x32U packetH = packetL - packetL;
+ // Do we have enough input to start filling the upper 16 packet bytes?
+ if (size_mod32 > offsetH) {
+ const size_t sizeH = size_mod32 - offsetH;
+ packetH = Load0To16<>(from + offsetH, sizeH, V4x32U(sizeH));
+ }
+
+ Update(packetH, packetL);
+ }
+ }
+
+ private:
+ static HH_INLINE V4x32U MaskedLoadInt(const char* from,
+ const V4x32U& int_mask) {
+ // No faults will be raised when reading n=0..3 ints from "from" provided
+ // int_mask[n] = 0.
+ const int* HH_RESTRICT int_from = reinterpret_cast<const int*>(from);
+ return V4x32U(_mm_maskload_epi32(int_from, int_mask));
+ }
+
+ // Loads <= 16 bytes without accessing any byte outside [from, from + size).
+ // from[i] is loaded into lane i; from[i >= size] is undefined.
+ template <uint32_t kSizeOffset = 0, class Load3Policy = Load3::AllowNone>
+ static HH_INLINE V4x32U Load0To16(const char* from, const size_t size_mod32,
+ const V4x32U& size) {
+ const char* remainder = from + (size_mod32 & ~3);
+ const uint64_t last3 = Load3()(Load3Policy(), remainder, size_mod32 & 3);
+ const V4x32U int_mask = IntMask<kSizeOffset>()(size);
+ const V4x32U int_lanes = MaskedLoadInt(from, int_mask);
+ return Insert4AboveMask(last3, int_mask, int_lanes);
+ }
+
+ static HH_INLINE V4x64U Rotate64By32(const V4x64U& v) {
+ return V4x64U(_mm256_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
+ }
+
+ // Rotates 32-bit lanes by "count" bits.
+ static HH_INLINE V4x64U Rotate32By(const V4x64U& v, const V8x32U& count) {
+ // Use variable shifts because sll_epi32 has 4 cycle latency (presumably
+ // to broadcast the shift count).
+ const V4x64U shifted_left(_mm256_sllv_epi32(v, count));
+ const V4x64U shifted_right(_mm256_srlv_epi32(v, V8x32U(32) - count));
+ return shifted_left | shifted_right;
+ }
+
+ static HH_INLINE V4x64U Permute(const V4x64U& v) {
+ // For complete mixing, we need to swap the upper and lower 128-bit halves;
+ // we also swap all 32-bit halves. This is faster than extracti128 plus
+ // inserti128 followed by Rotate64By32.
+ const V4x64U indices(0x0000000200000003ull, 0x0000000000000001ull,
+ 0x0000000600000007ull, 0x0000000400000005ull);
+ return V4x64U(_mm256_permutevar8x32_epi32(v, indices));
+ }
+
+ static HH_INLINE V4x64U MulLow32(const V4x64U& a, const V4x64U& b) {
+ return V4x64U(_mm256_mul_epu32(a, b));
+ }
+
+ static HH_INLINE V4x64U ZipperMerge(const V4x64U& v) {
+ // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+ // varying degrees. In descending order of goodness, bytes
+ // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+ // As expected, the upper and lower bytes are much worse.
+ // For each 64-bit lane, our objectives are:
+ // 1) maximizing and equalizing total goodness across the four lanes.
+ // 2) mixing with bytes from the neighboring lane (AVX-2 makes it difficult
+ // to cross the 128-bit wall, but PermuteAndUpdate takes care of that);
+ // 3) placing the worst bytes in the upper 32 bits because those will not
+ // be used in the next 32x32 multiplication.
+ const uint64_t hi = 0x070806090D0A040Bull;
+ const uint64_t lo = 0x000F010E05020C03ull;
+ return V4x64U(_mm256_shuffle_epi8(v, V4x64U(hi, lo, hi, lo)));
+ }
+
+ // Updates four hash lanes in parallel by injecting four 64-bit packets.
+ HH_INLINE void Update(const V4x64U& packet) {
+ v1 += packet;
+ v1 += mul0;
+ mul0 ^= MulLow32(v1, v0 >> 32);
+ HH_COMPILER_FENCE;
+ v0 += mul1;
+ mul1 ^= MulLow32(v0, v1 >> 32);
+ HH_COMPILER_FENCE;
+ v0 += ZipperMerge(v1);
+ v1 += ZipperMerge(v0);
+ }
+
+ HH_INLINE void Update(const V4x32U& packetH, const V4x32U& packetL) {
+ const __m256i packetL256 = _mm256_castsi128_si256(packetL);
+ Update(V4x64U(_mm256_inserti128_si256(packetL256, packetH, 1)));
+ }
+
+ // XORs a << 1 and a << 2 into *out after clearing the upper two bits of a.
+ // Also does the same for the upper 128 bit lane "b". Bit shifts are only
+ // possible on independent 64-bit lanes. We therefore insert the upper bits
+ // of a[0] that were lost into a[1]. Thanks to D. Lemire for helpful comments!
+ static HH_INLINE void XorByShift128Left12(const V4x64U& ba,
+ V4x64U* HH_RESTRICT out) {
+ const V4x64U zero = ba ^ ba;
+ const V4x64U top_bits2 = ba >> (64 - 2);
+ const V4x64U ones = ba == ba; // FF .. FF
+ const V4x64U shifted1_unmasked = ba + ba; // (avoids needing port0)
+ HH_COMPILER_FENCE;
+
+ // Only the lower halves of top_bits1's 128 bit lanes will be used, so we
+ // can compute it before clearing the upper two bits of ba.
+ const V4x64U top_bits1 = ba >> (64 - 1);
+ const V4x64U upper_8bytes(_mm256_slli_si256(ones, 8)); // F 0 F 0
+ const V4x64U shifted2 = shifted1_unmasked + shifted1_unmasked;
+ HH_COMPILER_FENCE;
+
+ const V4x64U upper_bit_of_128 = upper_8bytes << 63; // 80..00 80..00
+ const V4x64U new_low_bits2(_mm256_unpacklo_epi64(zero, top_bits2));
+ *out ^= shifted2;
+ HH_COMPILER_FENCE;
+
+ // The result must be as if the upper two bits of the input had been clear,
+ // otherwise we're no longer computing a reduction.
+ const V4x64U shifted1 = AndNot(upper_bit_of_128, shifted1_unmasked);
+ *out ^= new_low_bits2;
+ HH_COMPILER_FENCE;
+
+ const V4x64U new_low_bits1(_mm256_unpacklo_epi64(zero, top_bits1));
+ *out ^= shifted1;
+
+ *out ^= new_low_bits1;
+ }
+
+ // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
+ // Input: two 256-bit numbers a3210 and b3210, interleaved in 2 vectors.
+ // The upper and lower 128-bit halves are processed independently.
+ static HH_INLINE V4x64U ModularReduction(const V4x64U& b32a32,
+ const V4x64U& b10a10) {
+ // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
+ V4x64U out = b10a10;
+ XorByShift128Left12(b32a32, &out);
+ return out;
+ }
+
+ V4x64U v0;
+ V4x64U v1;
+ V4x64U mul0;
+ V4x64U mul1;
+};
+
+} // namespace HH_TARGET_NAME
+} // namespace highwayhash
+
+#endif // HH_DISABLE_TARGET_SPECIFIC
+#endif // HIGHWAYHASH_HH_AVX2_H_
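
The ZipperMerge shuffle constants above are easier to audit in scalar form.
The following sketch applies the usual _mm256_shuffle_epi8 semantics to one
16-byte lane (my restatement for illustration, not library code): control
byte i selects which input byte lands in output byte i.

#include <cstdint>

// Scalar model of the ZipperMerge byte permutation. The control bytes are
// the constants lo = 0x000F010E05020C03 and hi = 0x070806090D0A040B above,
// listed least-significant byte first.
void ZipperMerge16(const uint8_t in[16], uint8_t out[16]) {
  static const uint8_t kControl[16] = {
      0x03, 0x0C, 0x02, 0x05, 0x0E, 0x01, 0x0F, 0x00,   // from lo
      0x0B, 0x04, 0x0A, 0x0D, 0x09, 0x06, 0x08, 0x07};  // from hi
  for (int i = 0; i < 16; ++i) out[i] = in[kControl[i]];
}

Tracing the table confirms objective 3) in the comment: the highest-quality
byte 3 is routed to output byte 0, where it feeds the next 32x32
multiplication, while the weakest bytes (0 and 7 of each 64-bit half) all
land in the upper 32 bits of an output word.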
diff --git a/contrib/libs/highwayhash/highwayhash/hh_buffer.h b/contrib/libs/highwayhash/highwayhash/hh_buffer.h
index 5b1c83f95b..83b0fa6b8e 100644
--- a/contrib/libs/highwayhash/highwayhash/hh_buffer.h
+++ b/contrib/libs/highwayhash/highwayhash/hh_buffer.h
@@ -1,103 +1,103 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_HH_BUFFER_H_
-#define HIGHWAYHASH_HH_BUFFER_H_
-
-// Helper functions used by hh_avx2 and hh_sse41.
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include "highwayhash/vector128.h"
-
-// For auto-dependency generation, we need to include all headers but not their
-// contents (otherwise compilation fails because -msse4.1 is not specified).
-#ifndef HH_DISABLE_TARGET_SPECIFIC
-
-namespace highwayhash {
-// To prevent ODR violations when including this from multiple translation
-// units (TU) that are compiled with different flags, the contents must reside
-// in a namespace whose name is unique to the TU. NOTE: this behavior is
-// incompatible with precompiled modules and requires textual inclusion instead.
-namespace HH_TARGET_NAME {
-
-template <uint32_t kSizeOffset>
-struct IntMask {}; // primary template
-
-template <>
-struct IntMask<0> {
- // Returns 32-bit lanes: ~0U if that lane can be loaded given "size" bytes.
- // Typical case: size = 0..16, nothing deducted.
- HH_INLINE V4x32U operator()(const V4x32U& size) const {
- // Lane n is valid if size >= (n + 1) * 4; subtract one because we only have
- // greater-than comparisons and don't want a negated mask.
- return V4x32U(_mm_cmpgt_epi32(size, V4x32U(15, 11, 7, 3)));
- }
-};
-
-template <>
-struct IntMask<16> {
- // "size" is 16..31; this is for loading the upper half of a packet, so
- // effectively deduct 16 from size by changing the comparands.
- HH_INLINE V4x32U operator()(const V4x32U& size) const {
- return V4x32U(_mm_cmpgt_epi32(size, V4x32U(31, 27, 23, 19)));
- }
-};
-
-// Inserts "bytes4" into "prev" at the lowest i such that mask[i] = 0.
-// Assumes prev[j] == 0 if mask[j] = 0.
-HH_INLINE V4x32U Insert4AboveMask(const uint32_t bytes4, const V4x32U& mask,
- const V4x32U& prev) {
- // There is no 128-bit shift by a variable count. Using shuffle_epi8 with a
- // control mask requires a table lookup. We know the shift count is a
- // multiple of 4 bytes, so we can broadcastd_epi32 and clear all lanes except
- // those where mask != 0. This works because any upper output lanes need not
- // be zero.
- return prev | AndNot(mask, V4x32U(bytes4));
-}
-
-// Shifts "suffix" left by "prefix_len" = 0..15 bytes, clears upper bytes of
-// "prefix", and returns the merged/concatenated bytes.
-HH_INLINE V4x32U Concatenate(const V4x32U& prefix, const size_t prefix_len,
- const V4x32U& suffix) {
- static const uint64_t table[V16x8U::N][V2x64U::N] = {
- {0x0706050403020100ull, 0x0F0E0D0C0B0A0908ull},
- {0x06050403020100FFull, 0x0E0D0C0B0A090807ull},
- {0x050403020100FFFFull, 0x0D0C0B0A09080706ull},
- {0x0403020100FFFFFFull, 0x0C0B0A0908070605ull},
- {0x03020100FFFFFFFFull, 0x0B0A090807060504ull},
- {0x020100FFFFFFFFFFull, 0x0A09080706050403ull},
- {0x0100FFFFFFFFFFFFull, 0x0908070605040302ull},
- {0x00FFFFFFFFFFFFFFull, 0x0807060504030201ull},
- {0xFFFFFFFFFFFFFFFFull, 0x0706050403020100ull},
- {0xFFFFFFFFFFFFFFFFull, 0x06050403020100FFull},
- {0xFFFFFFFFFFFFFFFFull, 0x050403020100FFFFull},
- {0xFFFFFFFFFFFFFFFFull, 0x0403020100FFFFFFull},
- {0xFFFFFFFFFFFFFFFFull, 0x03020100FFFFFFFFull},
- {0xFFFFFFFFFFFFFFFFull, 0x020100FFFFFFFFFFull},
- {0xFFFFFFFFFFFFFFFFull, 0x0100FFFFFFFFFFFFull},
- {0xFFFFFFFFFFFFFFFFull, 0x00FFFFFFFFFFFFFFull}};
- const V2x64U control = Load<V2x64U>(&table[prefix_len][0]);
- const V2x64U shifted_suffix(_mm_shuffle_epi8(suffix, control));
- return V4x32U(_mm_blendv_epi8(shifted_suffix, prefix, control));
-}
-
-} // namespace HH_TARGET_NAME
-} // namespace highwayhash
-
-#endif // HH_DISABLE_TARGET_SPECIFIC
-#endif // HIGHWAYHASH_HH_BUFFER_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_BUFFER_H_
+#define HIGHWAYHASH_HH_BUFFER_H_
+
+// Helper functions used by hh_avx2 and hh_sse41.
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/vector128.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents (otherwise compilation fails because -msse4.1 is not specified).
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+namespace highwayhash {
+// To prevent ODR violations when including this from multiple translation
+// units (TU) that are compiled with different flags, the contents must reside
+// in a namespace whose name is unique to the TU. NOTE: this behavior is
+// incompatible with precompiled modules and requires textual inclusion instead.
+namespace HH_TARGET_NAME {
+
+template <uint32_t kSizeOffset>
+struct IntMask {}; // primary template
+
+template <>
+struct IntMask<0> {
+ // Returns 32-bit lanes: ~0U if that lane can be loaded given "size" bytes.
+ // Typical case: size = 0..16, nothing deducted.
+ HH_INLINE V4x32U operator()(const V4x32U& size) const {
+ // Lane n is valid if size >= (n + 1) * 4; subtract one because we only have
+ // greater-than comparisons and don't want a negated mask.
+ return V4x32U(_mm_cmpgt_epi32(size, V4x32U(15, 11, 7, 3)));
+ }
+};
+
+template <>
+struct IntMask<16> {
+ // "size" is 16..31; this is for loading the upper half of a packet, so
+ // effectively deduct 16 from size by changing the comparands.
+ HH_INLINE V4x32U operator()(const V4x32U& size) const {
+ return V4x32U(_mm_cmpgt_epi32(size, V4x32U(31, 27, 23, 19)));
+ }
+};
+
+// Inserts "bytes4" into "prev" at the lowest i such that mask[i] = 0.
+// Assumes prev[j] == 0 if mask[j] = 0.
+HH_INLINE V4x32U Insert4AboveMask(const uint32_t bytes4, const V4x32U& mask,
+ const V4x32U& prev) {
+ // There is no 128-bit shift by a variable count. Using shuffle_epi8 with a
+ // control mask requires a table lookup. We know the shift count is a
+ // multiple of 4 bytes, so we can broadcastd_epi32 and clear all lanes except
+ // those where mask != 0. This works because any upper output lanes need not
+ // be zero.
+ return prev | AndNot(mask, V4x32U(bytes4));
+}
+
+// Shifts "suffix" left by "prefix_len" = 0..15 bytes, clears upper bytes of
+// "prefix", and returns the merged/concatenated bytes.
+HH_INLINE V4x32U Concatenate(const V4x32U& prefix, const size_t prefix_len,
+ const V4x32U& suffix) {
+ static const uint64_t table[V16x8U::N][V2x64U::N] = {
+ {0x0706050403020100ull, 0x0F0E0D0C0B0A0908ull},
+ {0x06050403020100FFull, 0x0E0D0C0B0A090807ull},
+ {0x050403020100FFFFull, 0x0D0C0B0A09080706ull},
+ {0x0403020100FFFFFFull, 0x0C0B0A0908070605ull},
+ {0x03020100FFFFFFFFull, 0x0B0A090807060504ull},
+ {0x020100FFFFFFFFFFull, 0x0A09080706050403ull},
+ {0x0100FFFFFFFFFFFFull, 0x0908070605040302ull},
+ {0x00FFFFFFFFFFFFFFull, 0x0807060504030201ull},
+ {0xFFFFFFFFFFFFFFFFull, 0x0706050403020100ull},
+ {0xFFFFFFFFFFFFFFFFull, 0x06050403020100FFull},
+ {0xFFFFFFFFFFFFFFFFull, 0x050403020100FFFFull},
+ {0xFFFFFFFFFFFFFFFFull, 0x0403020100FFFFFFull},
+ {0xFFFFFFFFFFFFFFFFull, 0x03020100FFFFFFFFull},
+ {0xFFFFFFFFFFFFFFFFull, 0x020100FFFFFFFFFFull},
+ {0xFFFFFFFFFFFFFFFFull, 0x0100FFFFFFFFFFFFull},
+ {0xFFFFFFFFFFFFFFFFull, 0x00FFFFFFFFFFFFFFull}};
+ const V2x64U control = Load<V2x64U>(&table[prefix_len][0]);
+ const V2x64U shifted_suffix(_mm_shuffle_epi8(suffix, control));
+ return V4x32U(_mm_blendv_epi8(shifted_suffix, prefix, control));
+}
+
+} // namespace HH_TARGET_NAME
+} // namespace highwayhash
+
+#endif // HH_DISABLE_TARGET_SPECIFIC
+#endif // HIGHWAYHASH_HH_BUFFER_H_
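
The 16-row pshufb table in Concatenate is dense, so a scalar restatement of
the documented contract may help when verifying its entries (this mirrors my
reading of the comment above and is not library code):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Keeps the first prefix_len (0..15) bytes of "prefix" and appends "suffix"
// starting at that offset within a 16-byte packet.
void Concatenate16(const uint8_t prefix[16], size_t prefix_len,
                   const uint8_t suffix[16], uint8_t out[16]) {
  std::memcpy(out, prefix, prefix_len);                    // valid prefix
  std::memcpy(out + prefix_len, suffix, 16 - prefix_len);  // shifted suffix
}

For example, row 1 of the table is {0x06050403020100FF, 0x0E0D0C0B0A090807}:
control byte 0 is 0xFF (top bit set), so the blend keeps prefix[0], and the
remaining controls 0x00..0x0E shift suffix[0..14] into bytes 1..15, exactly
matching the memcpy model with prefix_len = 1.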
diff --git a/contrib/libs/highwayhash/highwayhash/hh_portable.cc b/contrib/libs/highwayhash/highwayhash/hh_portable.cc
index 3e0de9ed9c..1c8072aebe 100644
--- a/contrib/libs/highwayhash/highwayhash/hh_portable.cc
+++ b/contrib/libs/highwayhash/highwayhash/hh_portable.cc
@@ -1,19 +1,19 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// WARNING: this is a "restricted" source file; avoid including any headers
-// unless they are also restricted. See arch_specific.h for details.
-
-#define HH_TARGET_NAME Portable
-#include "highwayhash/highwayhash_target.cc"
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME Portable
+#include "highwayhash/highwayhash_target.cc"
diff --git a/contrib/libs/highwayhash/highwayhash/hh_portable.h b/contrib/libs/highwayhash/highwayhash/hh_portable.h
index 11284deae8..150ecdee7c 100644
--- a/contrib/libs/highwayhash/highwayhash/hh_portable.h
+++ b/contrib/libs/highwayhash/highwayhash/hh_portable.h
@@ -1,301 +1,301 @@
-// Copyright 2015-2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_HH_PORTABLE_H_
-#define HIGHWAYHASH_HH_PORTABLE_H_
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-#include "highwayhash/endianess.h"
-#include "highwayhash/hh_types.h"
-#include "highwayhash/load3.h"
-
-namespace highwayhash {
-// See vector128.h for why this namespace is necessary; we match it here for
-// consistency. As a result, this header requires textual inclusion.
-namespace HH_TARGET_NAME {
-
-class HHStatePortable {
- public:
- static const int kNumLanes = 4;
- using Lanes = uint64_t[kNumLanes];
-
- explicit HH_INLINE HHStatePortable(const HHKey keys) { Reset(keys); }
-
- HH_INLINE void Reset(const HHKey keys) {
- static const Lanes init0 = {0xdbe6d5d5fe4cce2full, 0xa4093822299f31d0ull,
- 0x13198a2e03707344ull, 0x243f6a8885a308d3ull};
- static const Lanes init1 = {0x3bd39e10cb0ef593ull, 0xc0acf169b5f18a8cull,
- 0xbe5466cf34e90c6cull, 0x452821e638d01377ull};
- Lanes rotated_keys;
- Rotate64By32(keys, &rotated_keys);
- Copy(init0, &mul0);
- Copy(init1, &mul1);
- Xor(init0, keys, &v0);
- Xor(init1, rotated_keys, &v1);
- }
-
- HH_INLINE void Update(const HHPacket& packet) {
- Lanes packet_lanes;
- CopyPartial(&packet[0], sizeof(HHPacket),
- reinterpret_cast<char*>(&packet_lanes));
- for (int lane = 0; lane < kNumLanes; ++lane) {
- packet_lanes[lane] = host_from_le64(packet_lanes[lane]);
- }
- Update(packet_lanes);
- }
-
- HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
- // 'Length padding' differentiates zero-valued inputs that have the same
- // size/32. mod32 is sufficient because each Update behaves as if a
- // counter were injected, because the state is large and mixed thoroughly.
- const uint64_t mod32_pair = (size_mod32 << 32) + size_mod32;
- for (int lane = 0; lane < kNumLanes; ++lane) {
- v0[lane] += mod32_pair;
- }
- Rotate32By(reinterpret_cast<uint32_t*>(&v1), size_mod32);
-
- const size_t size_mod4 = size_mod32 & 3;
- const char* remainder = bytes + (size_mod32 & ~3);
-
- HHPacket packet HH_ALIGNAS(32) = {0};
- CopyPartial(bytes, remainder - bytes, &packet[0]);
-
- if (size_mod32 & 16) { // 16..31 bytes left
- // Read the last 0..3 bytes and previous 1..4 into the upper bits.
- // Insert into the upper four bytes of packet, which are zero.
- uint32_t last4 =
- Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
- CopyPartial(reinterpret_cast<const char*>(&last4), 4, &packet[28]);
- } else { // size_mod32 < 16
- uint64_t last4 = Load3()(Load3::AllowUnordered(), remainder, size_mod4);
-
- // Rather than insert at packet + 28, it is faster to initialize
- // the otherwise empty packet + 16 with up to 64 bits of padding.
- CopyPartial(reinterpret_cast<const char*>(&last4), sizeof(last4),
- &packet[16]);
- }
- Update(packet);
- }
-
- HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
- PermuteAndUpdate();
- PermuteAndUpdate();
- PermuteAndUpdate();
- PermuteAndUpdate();
-
- *result = v0[0] + v1[0] + mul0[0] + mul1[0];
- }
-
- HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
- PermuteAndUpdate();
- PermuteAndUpdate();
- PermuteAndUpdate();
- PermuteAndUpdate();
-
- (*result)[0] = v0[0] + mul0[0] + v1[2] + mul1[2];
- (*result)[1] = v0[1] + mul0[1] + v1[3] + mul1[3];
- }
-
- HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
- PermuteAndUpdate();
- PermuteAndUpdate();
- PermuteAndUpdate();
- PermuteAndUpdate();
-
- ModularReduction(v1[1] + mul1[1], v1[0] + mul1[0], v0[1] + mul0[1],
- v0[0] + mul0[0], &(*result)[1], &(*result)[0]);
- ModularReduction(v1[3] + mul1[3], v1[2] + mul1[2], v0[3] + mul0[3],
- v0[2] + mul0[2], &(*result)[3], &(*result)[2]);
- }
-
- static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer) {
- for (size_t i = 0; i < sizeof(HHPacket); ++i) {
- buffer[i] = 0;
- }
- }
-
- static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
- const size_t size_mod32,
- char* HH_RESTRICT buffer) {
- for (size_t i = 0; i < size_mod32; ++i) {
- buffer[i] = from[i];
- }
- }
-
- static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
- const size_t size_mod32,
- char* HH_RESTRICT buffer,
- const size_t buffer_valid) {
- for (size_t i = 0; i < size_mod32; ++i) {
- buffer[buffer_valid + i] = from[i];
- }
- }
-
- HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
- const size_t size_mod32,
- const char* HH_RESTRICT buffer,
- const size_t buffer_valid) {
- HHPacket tmp HH_ALIGNAS(32);
- for (size_t i = 0; i < buffer_valid; ++i) {
- tmp[i] = buffer[i];
- }
- for (size_t i = 0; i < size_mod32; ++i) {
- tmp[buffer_valid + i] = from[i];
- }
- Update(tmp);
- }
-
- private:
- static HH_INLINE void Copy(const Lanes& source, Lanes* HH_RESTRICT dest) {
- for (int lane = 0; lane < kNumLanes; ++lane) {
- (*dest)[lane] = source[lane];
- }
- }
-
- static HH_INLINE void Add(const Lanes& source, Lanes* HH_RESTRICT dest) {
- for (int lane = 0; lane < kNumLanes; ++lane) {
- (*dest)[lane] += source[lane];
- }
- }
-
- template <typename LanesOrPointer>
- static HH_INLINE void Xor(const Lanes& op1, const LanesOrPointer& op2,
- Lanes* HH_RESTRICT dest) {
- for (int lane = 0; lane < kNumLanes; ++lane) {
- (*dest)[lane] = op1[lane] ^ op2[lane];
- }
- }
-
-// Clears all bits except one byte at the given offset.
-#define MASK(v, bytes) ((v) & (0xFFull << ((bytes)*8)))
-
- // 16-byte permutation; shifting is about 10% faster than byte loads.
- // Adds zipper-merge result to add*.
- static HH_INLINE void ZipperMergeAndAdd(const uint64_t v1, const uint64_t v0,
- uint64_t* HH_RESTRICT add1,
- uint64_t* HH_RESTRICT add0) {
- *add0 += ((MASK(v0, 3) + MASK(v1, 4)) >> 24) +
- ((MASK(v0, 5) + MASK(v1, 6)) >> 16) + MASK(v0, 2) +
- (MASK(v0, 1) << 32) + (MASK(v1, 7) >> 8) + (v0 << 56);
-
- *add1 += ((MASK(v1, 3) + MASK(v0, 4)) >> 24) + MASK(v1, 2) +
- (MASK(v1, 5) >> 16) + (MASK(v1, 1) << 24) + (MASK(v0, 6) >> 8) +
- (MASK(v1, 0) << 48) + MASK(v0, 7);
- }
-
-#undef MASK
-
- // For inputs that are already in native byte order (e.g. PermuteAndAdd)
- HH_INLINE void Update(const Lanes& packet_lanes) {
- Add(packet_lanes, &v1);
- Add(mul0, &v1);
-
- // (Loop is faster than unrolling)
- for (int lane = 0; lane < kNumLanes; ++lane) {
- const uint32_t v1_32 = static_cast<uint32_t>(v1[lane]);
- mul0[lane] ^= v1_32 * (v0[lane] >> 32);
- v0[lane] += mul1[lane];
- const uint32_t v0_32 = static_cast<uint32_t>(v0[lane]);
- mul1[lane] ^= v0_32 * (v1[lane] >> 32);
- }
-
- ZipperMergeAndAdd(v1[1], v1[0], &v0[1], &v0[0]);
- ZipperMergeAndAdd(v1[3], v1[2], &v0[3], &v0[2]);
-
- ZipperMergeAndAdd(v0[1], v0[0], &v1[1], &v1[0]);
- ZipperMergeAndAdd(v0[3], v0[2], &v1[3], &v1[2]);
- }
-
- static HH_INLINE uint64_t Rotate64By32(const uint64_t x) {
- return (x >> 32) | (x << 32);
- }
-
- template <typename LanesOrPointer>
- static HH_INLINE void Rotate64By32(const LanesOrPointer& v,
- Lanes* HH_RESTRICT rotated) {
- for (int i = 0; i < kNumLanes; ++i) {
- (*rotated)[i] = Rotate64By32(v[i]);
- }
- }
-
- static HH_INLINE void Rotate32By(uint32_t* halves, const uint64_t count) {
- for (int i = 0; i < 2 * kNumLanes; ++i) {
- const uint32_t x = halves[i];
- halves[i] = (x << count) | (x >> (32 - count));
- }
- }
-
- static HH_INLINE void Permute(const Lanes& v, Lanes* HH_RESTRICT permuted) {
- (*permuted)[0] = Rotate64By32(v[2]);
- (*permuted)[1] = Rotate64By32(v[3]);
- (*permuted)[2] = Rotate64By32(v[0]);
- (*permuted)[3] = Rotate64By32(v[1]);
- }
-
- HH_INLINE void PermuteAndUpdate() {
- Lanes permuted;
- Permute(v0, &permuted);
- Update(permuted);
- }
-
- // Computes a << kBits for 128-bit a = (a1, a0).
- // Bit shifts are only possible on independent 64-bit lanes. We therefore
- // insert the upper bits of a0 that were lost into a1. This is slightly
- // shorter than Lemire's (a << 1) | (((a >> 8) << 1) << 8) approach.
- template <int kBits>
- static HH_INLINE void Shift128Left(uint64_t* HH_RESTRICT a1,
- uint64_t* HH_RESTRICT a0) {
- const uint64_t shifted1 = (*a1) << kBits;
- const uint64_t top_bits = (*a0) >> (64 - kBits);
- *a0 <<= kBits;
- *a1 = shifted1 | top_bits;
- }
-
- // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
- // Input: a 256-bit number a3210.
- static HH_INLINE void ModularReduction(const uint64_t a3_unmasked,
- const uint64_t a2, const uint64_t a1,
- const uint64_t a0,
- uint64_t* HH_RESTRICT m1,
- uint64_t* HH_RESTRICT m0) {
- // The upper two bits must be clear, otherwise a3 << 2 would lose bits,
- // in which case we're no longer computing a reduction.
- const uint64_t a3 = a3_unmasked & 0x3FFFFFFFFFFFFFFFull;
- // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
- uint64_t a3_shl1 = a3;
- uint64_t a2_shl1 = a2;
- uint64_t a3_shl2 = a3;
- uint64_t a2_shl2 = a2;
- Shift128Left<1>(&a3_shl1, &a2_shl1);
- Shift128Left<2>(&a3_shl2, &a2_shl2);
- *m1 = a1 ^ a3_shl1 ^ a3_shl2;
- *m0 = a0 ^ a2_shl1 ^ a2_shl2;
- }
-
- Lanes v0;
- Lanes v1;
- Lanes mul0;
- Lanes mul1;
-};
-
-} // namespace HH_TARGET_NAME
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_HH_PORTABLE_H_
+// Copyright 2015-2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_PORTABLE_H_
+#define HIGHWAYHASH_HH_PORTABLE_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/endianess.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/load3.h"
+
+namespace highwayhash {
+// See vector128.h for why this namespace is necessary; we match it here for
+// consistency. As a result, this header requires textual inclusion.
+namespace HH_TARGET_NAME {
+
+class HHStatePortable {
+ public:
+ static const int kNumLanes = 4;
+ using Lanes = uint64_t[kNumLanes];
+
+ explicit HH_INLINE HHStatePortable(const HHKey keys) { Reset(keys); }
+
+ HH_INLINE void Reset(const HHKey keys) {
+ static const Lanes init0 = {0xdbe6d5d5fe4cce2full, 0xa4093822299f31d0ull,
+ 0x13198a2e03707344ull, 0x243f6a8885a308d3ull};
+ static const Lanes init1 = {0x3bd39e10cb0ef593ull, 0xc0acf169b5f18a8cull,
+ 0xbe5466cf34e90c6cull, 0x452821e638d01377ull};
+ Lanes rotated_keys;
+ Rotate64By32(keys, &rotated_keys);
+ Copy(init0, &mul0);
+ Copy(init1, &mul1);
+ Xor(init0, keys, &v0);
+ Xor(init1, rotated_keys, &v1);
+ }
+
+ HH_INLINE void Update(const HHPacket& packet) {
+ Lanes packet_lanes;
+ CopyPartial(&packet[0], sizeof(HHPacket),
+ reinterpret_cast<char*>(&packet_lanes));
+ for (int lane = 0; lane < kNumLanes; ++lane) {
+ packet_lanes[lane] = host_from_le64(packet_lanes[lane]);
+ }
+ Update(packet_lanes);
+ }
+
+ HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+ // 'Length padding' differentiates zero-valued inputs that have the same
+ // size/32. mod32 is sufficient because each Update behaves as if a
+ // counter were injected, because the state is large and mixed thoroughly.
+ const uint64_t mod32_pair = (size_mod32 << 32) + size_mod32;
+ for (int lane = 0; lane < kNumLanes; ++lane) {
+ v0[lane] += mod32_pair;
+ }
+ Rotate32By(reinterpret_cast<uint32_t*>(&v1), size_mod32);
+
+ const size_t size_mod4 = size_mod32 & 3;
+ const char* remainder = bytes + (size_mod32 & ~3);
+
+ HHPacket packet HH_ALIGNAS(32) = {0};
+ CopyPartial(bytes, remainder - bytes, &packet[0]);
+
+ if (size_mod32 & 16) { // 16..31 bytes left
+ // Read the last 0..3 bytes and previous 1..4 into the upper bits.
+ // Insert into the upper four bytes of packet, which are zero.
+ uint32_t last4 =
+ Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+ CopyPartial(reinterpret_cast<const char*>(&last4), 4, &packet[28]);
+ } else { // size_mod32 < 16
+ uint64_t last4 = Load3()(Load3::AllowUnordered(), remainder, size_mod4);
+
+ // Rather than insert at packet + 28, it is faster to initialize
+ // the otherwise empty packet + 16 with up to 64 bits of padding.
+ CopyPartial(reinterpret_cast<const char*>(&last4), sizeof(last4),
+ &packet[16]);
+ }
+ Update(packet);
+ }
+
+ HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+
+ *result = v0[0] + v1[0] + mul0[0] + mul1[0];
+ }
+
+ HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+
+ (*result)[0] = v0[0] + mul0[0] + v1[2] + mul1[2];
+ (*result)[1] = v0[1] + mul0[1] + v1[3] + mul1[3];
+ }
+
+ HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+
+ ModularReduction(v1[1] + mul1[1], v1[0] + mul1[0], v0[1] + mul0[1],
+ v0[0] + mul0[0], &(*result)[1], &(*result)[0]);
+ ModularReduction(v1[3] + mul1[3], v1[2] + mul1[2], v0[3] + mul0[3],
+ v0[2] + mul0[2], &(*result)[3], &(*result)[2]);
+ }
+
+ static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer) {
+ for (size_t i = 0; i < sizeof(HHPacket); ++i) {
+ buffer[i] = 0;
+ }
+ }
+
+ static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
+ const size_t size_mod32,
+ char* HH_RESTRICT buffer) {
+ for (size_t i = 0; i < size_mod32; ++i) {
+ buffer[i] = from[i];
+ }
+ }
+
+ static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
+ const size_t size_mod32,
+ char* HH_RESTRICT buffer,
+ const size_t buffer_valid) {
+ for (size_t i = 0; i < size_mod32; ++i) {
+ buffer[buffer_valid + i] = from[i];
+ }
+ }
+
+ HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
+ const size_t size_mod32,
+ const char* HH_RESTRICT buffer,
+ const size_t buffer_valid) {
+ HHPacket tmp HH_ALIGNAS(32);
+ for (size_t i = 0; i < buffer_valid; ++i) {
+ tmp[i] = buffer[i];
+ }
+ for (size_t i = 0; i < size_mod32; ++i) {
+ tmp[buffer_valid + i] = from[i];
+ }
+ Update(tmp);
+ }
+
+ private:
+ static HH_INLINE void Copy(const Lanes& source, Lanes* HH_RESTRICT dest) {
+ for (int lane = 0; lane < kNumLanes; ++lane) {
+ (*dest)[lane] = source[lane];
+ }
+ }
+
+ static HH_INLINE void Add(const Lanes& source, Lanes* HH_RESTRICT dest) {
+ for (int lane = 0; lane < kNumLanes; ++lane) {
+ (*dest)[lane] += source[lane];
+ }
+ }
+
+ template <typename LanesOrPointer>
+ static HH_INLINE void Xor(const Lanes& op1, const LanesOrPointer& op2,
+ Lanes* HH_RESTRICT dest) {
+ for (int lane = 0; lane < kNumLanes; ++lane) {
+ (*dest)[lane] = op1[lane] ^ op2[lane];
+ }
+ }
+
+// Clears all bits except one byte at the given offset.
+#define MASK(v, bytes) ((v) & (0xFFull << ((bytes)*8)))
+
+ // 16-byte permutation; shifting is about 10% faster than byte loads.
+ // Adds zipper-merge result to add*.
+ static HH_INLINE void ZipperMergeAndAdd(const uint64_t v1, const uint64_t v0,
+ uint64_t* HH_RESTRICT add1,
+ uint64_t* HH_RESTRICT add0) {
+ *add0 += ((MASK(v0, 3) + MASK(v1, 4)) >> 24) +
+ ((MASK(v0, 5) + MASK(v1, 6)) >> 16) + MASK(v0, 2) +
+ (MASK(v0, 1) << 32) + (MASK(v1, 7) >> 8) + (v0 << 56);
+
+ *add1 += ((MASK(v1, 3) + MASK(v0, 4)) >> 24) + MASK(v1, 2) +
+ (MASK(v1, 5) >> 16) + (MASK(v1, 1) << 24) + (MASK(v0, 6) >> 8) +
+ (MASK(v1, 0) << 48) + MASK(v0, 7);
+ }
+
+#undef MASK
+
+ // For inputs that are already in native byte order (e.g. PermuteAndUpdate)
+ HH_INLINE void Update(const Lanes& packet_lanes) {
+ Add(packet_lanes, &v1);
+ Add(mul0, &v1);
+
+ // (Loop is faster than unrolling)
+ for (int lane = 0; lane < kNumLanes; ++lane) {
+ const uint32_t v1_32 = static_cast<uint32_t>(v1[lane]);
+ mul0[lane] ^= v1_32 * (v0[lane] >> 32);
+ v0[lane] += mul1[lane];
+ const uint32_t v0_32 = static_cast<uint32_t>(v0[lane]);
+ mul1[lane] ^= v0_32 * (v1[lane] >> 32);
+ }
+
+ ZipperMergeAndAdd(v1[1], v1[0], &v0[1], &v0[0]);
+ ZipperMergeAndAdd(v1[3], v1[2], &v0[3], &v0[2]);
+
+ ZipperMergeAndAdd(v0[1], v0[0], &v1[1], &v1[0]);
+ ZipperMergeAndAdd(v0[3], v0[2], &v1[3], &v1[2]);
+ }
+
+ static HH_INLINE uint64_t Rotate64By32(const uint64_t x) {
+ return (x >> 32) | (x << 32);
+ }
+
+ template <typename LanesOrPointer>
+ static HH_INLINE void Rotate64By32(const LanesOrPointer& v,
+ Lanes* HH_RESTRICT rotated) {
+ for (int i = 0; i < kNumLanes; ++i) {
+ (*rotated)[i] = Rotate64By32(v[i]);
+ }
+ }
+
+ static HH_INLINE void Rotate32By(uint32_t* halves, const uint64_t count) {
+ for (int i = 0; i < 2 * kNumLanes; ++i) {
+ const uint32_t x = halves[i];
+ halves[i] = (x << count) | (x >> (32 - count));
+ }
+ }
+
+ static HH_INLINE void Permute(const Lanes& v, Lanes* HH_RESTRICT permuted) {
+ (*permuted)[0] = Rotate64By32(v[2]);
+ (*permuted)[1] = Rotate64By32(v[3]);
+ (*permuted)[2] = Rotate64By32(v[0]);
+ (*permuted)[3] = Rotate64By32(v[1]);
+ }
+
+ HH_INLINE void PermuteAndUpdate() {
+ Lanes permuted;
+ Permute(v0, &permuted);
+ Update(permuted);
+ }
+
+ // Computes a << kBits for 128-bit a = (a1, a0).
+ // Bit shifts are only possible on independent 64-bit lanes. We therefore
+ // insert the upper bits of a0 that were lost into a1. This is slightly
+ // shorter than Lemire's (a << 1) | (((a >> 8) << 1) << 8) approach.
+ template <int kBits>
+ static HH_INLINE void Shift128Left(uint64_t* HH_RESTRICT a1,
+ uint64_t* HH_RESTRICT a0) {
+ const uint64_t shifted1 = (*a1) << kBits;
+ const uint64_t top_bits = (*a0) >> (64 - kBits);
+ *a0 <<= kBits;
+ *a1 = shifted1 | top_bits;
+ }
+
+ // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
+ // Input: a 256-bit number a3210.
+ static HH_INLINE void ModularReduction(const uint64_t a3_unmasked,
+ const uint64_t a2, const uint64_t a1,
+ const uint64_t a0,
+ uint64_t* HH_RESTRICT m1,
+ uint64_t* HH_RESTRICT m0) {
+ // The upper two bits must be clear, otherwise a3 << 2 would lose bits,
+ // in which case we're no longer computing a reduction.
+ const uint64_t a3 = a3_unmasked & 0x3FFFFFFFFFFFFFFFull;
+ // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
+ uint64_t a3_shl1 = a3;
+ uint64_t a2_shl1 = a2;
+ uint64_t a3_shl2 = a3;
+ uint64_t a2_shl2 = a2;
+ Shift128Left<1>(&a3_shl1, &a2_shl1);
+ Shift128Left<2>(&a3_shl2, &a2_shl2);
+ *m1 = a1 ^ a3_shl1 ^ a3_shl2;
+ *m0 = a0 ^ a2_shl1 ^ a2_shl2;
+ }
+
+ Lanes v0;
+ Lanes v1;
+ Lanes mul0;
+ Lanes mul1;
+};
+
+} // namespace HH_TARGET_NAME
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_HH_PORTABLE_H_
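The portable ModularReduction above folds the upper 128 bits back in via two carry-propagating 128-bit shifts. As a standalone aid, here is a minimal sketch of that shift over a (high, low) pair of 64-bit words, assuming nothing beyond the standard headers; it mirrors Shift128Left but is not part of the library:

    #include <cstdint>
    #include <cstdio>

    // 128-bit left shift of (a1:a0): the top kBits of *a0 carry into *a1,
    // as in Shift128Left above. Valid for 0 < kBits < 64.
    template <int kBits>
    void Shift128Left(uint64_t* a1, uint64_t* a0) {
      *a1 = (*a1 << kBits) | (*a0 >> (64 - kBits));  // read *a0 before shifting it
      *a0 <<= kBits;
    }

    int main() {
      uint64_t a1 = 0, a0 = 1ull << 63;  // the 128-bit value 2^63
      Shift128Left<1>(&a1, &a0);         // now 2^64
      std::printf("%llx %llx\n", (unsigned long long)a1,
                  (unsigned long long)a0);  // prints "1 0"
      return 0;
    }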
diff --git a/contrib/libs/highwayhash/highwayhash/hh_sse41.cc b/contrib/libs/highwayhash/highwayhash/hh_sse41.cc
index 9d6a0b968f..0bf13ab4f5 100644
--- a/contrib/libs/highwayhash/highwayhash/hh_sse41.cc
+++ b/contrib/libs/highwayhash/highwayhash/hh_sse41.cc
@@ -1,19 +1,19 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// WARNING: this is a "restricted" source file; avoid including any headers
-// unless they are also restricted. See arch_specific.h for details.
-
-#define HH_TARGET_NAME SSE41
-#include "highwayhash/highwayhash_target.cc"
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME SSE41
+#include "highwayhash/highwayhash_target.cc"
diff --git a/contrib/libs/highwayhash/highwayhash/hh_sse41.h b/contrib/libs/highwayhash/highwayhash/hh_sse41.h
index a2a86da9b6..c4d56697e2 100644
--- a/contrib/libs/highwayhash/highwayhash/hh_sse41.h
+++ b/contrib/libs/highwayhash/highwayhash/hh_sse41.h
@@ -1,330 +1,330 @@
-// Copyright 2015-2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_HH_SSE41_H_
-#define HIGHWAYHASH_HH_SSE41_H_
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-#include "highwayhash/hh_buffer.h"
-#include "highwayhash/hh_types.h"
-#include "highwayhash/load3.h"
-#include "highwayhash/vector128.h"
-
-// For auto-dependency generation, we need to include all headers but not their
-// contents (otherwise compilation fails because -msse4.1 is not specified).
-#ifndef HH_DISABLE_TARGET_SPECIFIC
-
-namespace highwayhash {
-// See vector128.h for why this namespace is necessary; matching it here makes
-// it easier to use the vector128 symbols, but requires textual inclusion.
-namespace HH_TARGET_NAME {
-
-// J-lanes tree hashing: see http://dx.doi.org/10.4236/jis.2014.53010
-// Uses pairs of SSE4.1 instructions to emulate the AVX-2 algorithm.
-class HHStateSSE41 {
- public:
- explicit HH_INLINE HHStateSSE41(const HHKey key) { Reset(key); }
-
- HH_INLINE void Reset(const HHKey key) {
- // "Nothing up my sleeve numbers"; see HHStateTAVX2.
- const V2x64U init0L(0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full);
- const V2x64U init0H(0x243f6a8885a308d3ull, 0x13198a2e03707344ull);
- const V2x64U init1L(0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull);
- const V2x64U init1H(0x452821e638d01377ull, 0xbe5466cf34e90c6cull);
- const V2x64U keyL = LoadUnaligned<V2x64U>(key + 0);
- const V2x64U keyH = LoadUnaligned<V2x64U>(key + 2);
- v0L = keyL ^ init0L;
- v0H = keyH ^ init0H;
- v1L = Rotate64By32(keyL) ^ init1L;
- v1H = Rotate64By32(keyH) ^ init1H;
- mul0L = init0L;
- mul0H = init0H;
- mul1L = init1L;
- mul1H = init1H;
- }
-
- HH_INLINE void Update(const HHPacket& packet_bytes) {
- const uint64_t* HH_RESTRICT packet =
- reinterpret_cast<const uint64_t * HH_RESTRICT>(packet_bytes);
- const V2x64U packetL = LoadUnaligned<V2x64U>(packet + 0);
- const V2x64U packetH = LoadUnaligned<V2x64U>(packet + 2);
- Update(packetH, packetL);
- }
-
- HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
- // 'Length padding' differentiates zero-valued inputs that have the same
- // size/32. mod32 is sufficient because each Update behaves as if a
- // counter were injected, because the state is large and mixed thoroughly.
- const V4x32U vsize_mod32(static_cast<uint32_t>(size_mod32));
- // Equivalent to storing size_mod32 in packet.
- v0L += V2x64U(vsize_mod32);
- v0H += V2x64U(vsize_mod32);
- // Boosts the avalanche effect of mod32.
- Rotate32By(&v1H, &v1L, size_mod32);
-
- const size_t size_mod4 = size_mod32 & 3;
- const char* HH_RESTRICT remainder = bytes + (size_mod32 & ~3);
-
- if (HH_UNLIKELY(size_mod32 & 16)) { // 16..31 bytes left
- const V2x64U packetL =
- LoadUnaligned<V2x64U>(reinterpret_cast<const uint64_t*>(bytes));
-
- V2x64U packetH = LoadMultipleOfFour(bytes + 16, size_mod32);
-
- const uint32_t last4 =
- Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
-
- // The upper four bytes of packetH are zero, so insert there.
- packetH = V2x64U(_mm_insert_epi32(packetH, last4, 3));
- Update(packetH, packetL);
- } else { // size_mod32 < 16
- const V2x64U packetL = LoadMultipleOfFour(bytes, size_mod32);
-
- const uint64_t last4 =
- Load3()(Load3::AllowUnordered(), remainder, size_mod4);
-
- // Rather than insert into packetL[3], it is faster to initialize
- // the otherwise empty packetH.
- const V2x64U packetH(_mm_cvtsi64_si128(last4));
- Update(packetH, packetL);
- }
- }
-
- HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
- // Mix together all lanes.
- PermuteAndUpdate();
- PermuteAndUpdate();
- PermuteAndUpdate();
- PermuteAndUpdate();
-
- const V2x64U sum0 = v0L + mul0L;
- const V2x64U sum1 = v1L + mul1L;
- const V2x64U hash = sum0 + sum1;
- _mm_storel_epi64(reinterpret_cast<__m128i*>(result), hash);
- }
-
- HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
- PermuteAndUpdate();
- PermuteAndUpdate();
- PermuteAndUpdate();
- PermuteAndUpdate();
-
- const V2x64U sum0 = v0L + mul0L;
- const V2x64U sum1 = v1H + mul1H;
- const V2x64U hash = sum0 + sum1;
- StoreUnaligned(hash, &(*result)[0]);
- }
-
- HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
- PermuteAndUpdate();
- PermuteAndUpdate();
- PermuteAndUpdate();
- PermuteAndUpdate();
-
- const V2x64U sum0L = v0L + mul0L;
- const V2x64U sum1L = v1L + mul1L;
- const V2x64U sum0H = v0H + mul0H;
- const V2x64U sum1H = v1H + mul1H;
- const V2x64U hashL = ModularReduction(sum1L, sum0L);
- const V2x64U hashH = ModularReduction(sum1H, sum0H);
- StoreUnaligned(hashL, &(*result)[0]);
- StoreUnaligned(hashH, &(*result)[2]);
- }
-
- static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer_bytes) {
- __m128i* buffer = reinterpret_cast<__m128i*>(buffer_bytes);
- const __m128i zero = _mm_setzero_si128();
- _mm_store_si128(buffer + 0, zero);
- _mm_store_si128(buffer + 1, zero);
- }
-
- static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
- const size_t size_mod32,
- char* HH_RESTRICT buffer) {
- for (size_t i = 0; i < size_mod32; ++i) {
- buffer[i] = from[i];
- }
- }
-
- static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
- const size_t size_mod32,
- char* HH_RESTRICT buffer,
- const size_t buffer_valid) {
- for (size_t i = 0; i < size_mod32; ++i) {
- buffer[buffer_valid + i] = from[i];
- }
- }
-
- HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
- const size_t size_mod32,
- const char* HH_RESTRICT buffer,
- const size_t buffer_valid) {
- HHPacket tmp HH_ALIGNAS(32);
- for (size_t i = 0; i < buffer_valid; ++i) {
- tmp[i] = buffer[i];
- }
- for (size_t i = 0; i < size_mod32; ++i) {
- tmp[buffer_valid + i] = from[i];
- }
- Update(tmp);
- }
-
- private:
- // Swap 32-bit halves of each lane (caller swaps 128-bit halves)
- static HH_INLINE V2x64U Rotate64By32(const V2x64U& v) {
- return V2x64U(_mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
- }
-
- // Rotates 32-bit lanes by "count" bits.
- static HH_INLINE void Rotate32By(V2x64U* HH_RESTRICT vH,
- V2x64U* HH_RESTRICT vL,
- const uint64_t count) {
- // WARNING: the shift count is 64 bits, so we can't reuse vsize_mod32,
- // which is broadcast into 32-bit lanes.
- const __m128i count_left = _mm_cvtsi64_si128(count);
- const __m128i count_right = _mm_cvtsi64_si128(32 - count);
- const V2x64U shifted_leftL(_mm_sll_epi32(*vL, count_left));
- const V2x64U shifted_leftH(_mm_sll_epi32(*vH, count_left));
- const V2x64U shifted_rightL(_mm_srl_epi32(*vL, count_right));
- const V2x64U shifted_rightH(_mm_srl_epi32(*vH, count_right));
- *vL = shifted_leftL | shifted_rightL;
- *vH = shifted_leftH | shifted_rightH;
- }
-
- static HH_INLINE V2x64U ZipperMerge(const V2x64U& v) {
- // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
- // varying degrees. In descending order of goodness, bytes
- // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
- // As expected, the upper and lower bytes are much worse.
- // For each 64-bit lane, our objectives are:
- // 1) maximizing and equalizing total goodness across each lane's bytes;
- // 2) mixing with bytes from the neighboring lane;
- // 3) placing the worst bytes in the upper 32 bits because those will not
- // be used in the next 32x32 multiplication.
- const uint64_t hi = 0x070806090D0A040Bull;
- const uint64_t lo = 0x000F010E05020C03ull;
- return V2x64U(_mm_shuffle_epi8(v, V2x64U(hi, lo)));
- }
-
- HH_INLINE void Update(const V2x64U& packetH, const V2x64U& packetL) {
- v1L += packetL;
- v1H += packetH;
- v1L += mul0L;
- v1H += mul0H;
- mul0L ^= V2x64U(_mm_mul_epu32(v1L, Rotate64By32(v0L)));
- mul0H ^= V2x64U(_mm_mul_epu32(v1H, v0H >> 32));
- v0L += mul1L;
- v0H += mul1H;
- mul1L ^= V2x64U(_mm_mul_epu32(v0L, Rotate64By32(v1L)));
- mul1H ^= V2x64U(_mm_mul_epu32(v0H, v1H >> 32));
- v0L += ZipperMerge(v1L);
- v0H += ZipperMerge(v1H);
- v1L += ZipperMerge(v0L);
- v1H += ZipperMerge(v0H);
- }
-
- HH_INLINE void PermuteAndUpdate() {
- // It is slightly better to permute v0 than v1; it will be added to v1.
- // AVX-2 Permute also swaps 128-bit halves, so swap input operands.
- Update(Rotate64By32(v0L), Rotate64By32(v0H));
- }
-
- // Returns a zero-initialized vector with the lower "size" = 0, 4, 8 or 12
- // bytes loaded from "bytes". Serves as a replacement for AVX2 maskload_epi32.
- static HH_INLINE V2x64U LoadMultipleOfFour(const char* bytes,
- const size_t size) {
- const uint32_t* words = reinterpret_cast<const uint32_t*>(bytes);
- // Mask of 1-bits where the final 4 bytes should be inserted (replacement
- // for variable shift/insert using broadcast+blend).
- V2x64U mask4(_mm_cvtsi64_si128(0xFFFFFFFFULL)); // 'insert' into lane 0
- V2x64U ret(0);
- if (size & 8) {
- ret = V2x64U(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(words)));
- // mask4 = 0 ~0 0 0 ('insert' into lane 2)
- mask4 = V2x64U(_mm_slli_si128(mask4, 8));
- words += 2;
- }
- // Final 4 (possibly after the 8 above); 'insert' into lane 0 or 2 of ret.
- if (size & 4) {
- const __m128i word2 = _mm_cvtsi32_si128(words[0]);
- // = 0 word2 0 word2; mask4 will select which lane to keep.
- const V2x64U broadcast(_mm_shuffle_epi32(word2, 0x00));
- // (slightly faster than blendv_epi8)
- ret |= V2x64U(broadcast & mask4);
- }
- return ret;
- }
-
- // XORs x << 1 and x << 2 into *out after clearing the upper two bits of x.
- // Bit shifts are only possible on independent 64-bit lanes. We therefore
- // insert the upper bits of x[0] that were lost into x[1].
- // Thanks to D. Lemire for helpful comments!
- static HH_INLINE void XorByShift128Left12(const V2x64U& x,
- V2x64U* HH_RESTRICT out) {
- const V2x64U zero(_mm_setzero_si128());
- const V2x64U sign_bit128(_mm_insert_epi32(zero, 0x80000000u, 3));
- const V2x64U top_bits2 = x >> (64 - 2);
- HH_COMPILER_FENCE;
- const V2x64U shifted1_unmasked = x + x; // (avoids needing port0)
-
- // Only the lower half of top_bits1 will be used, so we
- // can compute it before clearing the upper two bits of x.
- const V2x64U top_bits1 = x >> (64 - 1);
- const V2x64U shifted2 = shifted1_unmasked + shifted1_unmasked;
- HH_COMPILER_FENCE;
-
- const V2x64U new_low_bits2(_mm_slli_si128(top_bits2, 8));
- *out ^= shifted2;
- // The result must be as if the upper two bits of the input had been clear,
- // otherwise we're no longer computing a reduction.
- const V2x64U shifted1 = AndNot(sign_bit128, shifted1_unmasked);
- HH_COMPILER_FENCE;
-
- const V2x64U new_low_bits1(_mm_slli_si128(top_bits1, 8));
- *out ^= new_low_bits2;
- *out ^= shifted1;
- *out ^= new_low_bits1;
- }
-
- // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
- // Input: a 256-bit number a3210.
- static HH_INLINE V2x64U ModularReduction(const V2x64U& a32_unmasked,
- const V2x64U& a10) {
- // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
- V2x64U out = a10;
- XorByShift128Left12(a32_unmasked, &out);
- return out;
- }
-
- V2x64U v0L;
- V2x64U v0H;
- V2x64U v1L;
- V2x64U v1H;
- V2x64U mul0L;
- V2x64U mul0H;
- V2x64U mul1L;
- V2x64U mul1H;
-};
-
-} // namespace HH_TARGET_NAME
-} // namespace highwayhash
-
-#endif // HH_DISABLE_TARGET_SPECIFIC
-#endif // HIGHWAYHASH_HH_SSE41_H_
+// Copyright 2015-2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_SSE41_H_
+#define HIGHWAYHASH_HH_SSE41_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_buffer.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/load3.h"
+#include "highwayhash/vector128.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents (otherwise compilation fails because -msse4.1 is not specified).
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+namespace highwayhash {
+// See vector128.h for why this namespace is necessary; matching it here makes
+// it easier to use the vector128 symbols, but requires textual inclusion.
+namespace HH_TARGET_NAME {
+
+// J-lanes tree hashing: see http://dx.doi.org/10.4236/jis.2014.53010
+// Uses pairs of SSE4.1 instructions to emulate the AVX-2 algorithm.
+class HHStateSSE41 {
+ public:
+ explicit HH_INLINE HHStateSSE41(const HHKey key) { Reset(key); }
+
+ HH_INLINE void Reset(const HHKey key) {
+ // "Nothing up my sleeve numbers"; see HHStateTAVX2.
+ const V2x64U init0L(0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full);
+ const V2x64U init0H(0x243f6a8885a308d3ull, 0x13198a2e03707344ull);
+ const V2x64U init1L(0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull);
+ const V2x64U init1H(0x452821e638d01377ull, 0xbe5466cf34e90c6cull);
+ const V2x64U keyL = LoadUnaligned<V2x64U>(key + 0);
+ const V2x64U keyH = LoadUnaligned<V2x64U>(key + 2);
+ v0L = keyL ^ init0L;
+ v0H = keyH ^ init0H;
+ v1L = Rotate64By32(keyL) ^ init1L;
+ v1H = Rotate64By32(keyH) ^ init1H;
+ mul0L = init0L;
+ mul0H = init0H;
+ mul1L = init1L;
+ mul1H = init1H;
+ }
+
+ HH_INLINE void Update(const HHPacket& packet_bytes) {
+ const uint64_t* HH_RESTRICT packet =
+ reinterpret_cast<const uint64_t * HH_RESTRICT>(packet_bytes);
+ const V2x64U packetL = LoadUnaligned<V2x64U>(packet + 0);
+ const V2x64U packetH = LoadUnaligned<V2x64U>(packet + 2);
+ Update(packetH, packetL);
+ }
+
+ HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+ // 'Length padding' differentiates zero-valued inputs that have the same
+ // size/32. mod32 is sufficient because each Update behaves as if a
+ // counter were injected, because the state is large and mixed thoroughly.
+ const V4x32U vsize_mod32(static_cast<uint32_t>(size_mod32));
+ // Equivalent to storing size_mod32 in packet.
+ v0L += V2x64U(vsize_mod32);
+ v0H += V2x64U(vsize_mod32);
+ // Boosts the avalanche effect of mod32.
+ Rotate32By(&v1H, &v1L, size_mod32);
+
+ const size_t size_mod4 = size_mod32 & 3;
+ const char* HH_RESTRICT remainder = bytes + (size_mod32 & ~3);
+
+ if (HH_UNLIKELY(size_mod32 & 16)) { // 16..31 bytes left
+ const V2x64U packetL =
+ LoadUnaligned<V2x64U>(reinterpret_cast<const uint64_t*>(bytes));
+
+ V2x64U packetH = LoadMultipleOfFour(bytes + 16, size_mod32);
+
+ const uint32_t last4 =
+ Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+
+ // The upper four bytes of packetH are zero, so insert there.
+ packetH = V2x64U(_mm_insert_epi32(packetH, last4, 3));
+ Update(packetH, packetL);
+ } else { // size_mod32 < 16
+ const V2x64U packetL = LoadMultipleOfFour(bytes, size_mod32);
+
+ const uint64_t last4 =
+ Load3()(Load3::AllowUnordered(), remainder, size_mod4);
+
+ // Rather than insert into packetL[3], it is faster to initialize
+ // the otherwise empty packetH.
+ const V2x64U packetH(_mm_cvtsi64_si128(last4));
+ Update(packetH, packetL);
+ }
+ }
+
+ HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
+ // Mix together all lanes.
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+
+ const V2x64U sum0 = v0L + mul0L;
+ const V2x64U sum1 = v1L + mul1L;
+ const V2x64U hash = sum0 + sum1;
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(result), hash);
+ }
+
+ HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+
+ const V2x64U sum0 = v0L + mul0L;
+ const V2x64U sum1 = v1H + mul1H;
+ const V2x64U hash = sum0 + sum1;
+ StoreUnaligned(hash, &(*result)[0]);
+ }
+
+ HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+ PermuteAndUpdate();
+
+ const V2x64U sum0L = v0L + mul0L;
+ const V2x64U sum1L = v1L + mul1L;
+ const V2x64U sum0H = v0H + mul0H;
+ const V2x64U sum1H = v1H + mul1H;
+ const V2x64U hashL = ModularReduction(sum1L, sum0L);
+ const V2x64U hashH = ModularReduction(sum1H, sum0H);
+ StoreUnaligned(hashL, &(*result)[0]);
+ StoreUnaligned(hashH, &(*result)[2]);
+ }
+
+ static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer_bytes) {
+ __m128i* buffer = reinterpret_cast<__m128i*>(buffer_bytes);
+ const __m128i zero = _mm_setzero_si128();
+ _mm_store_si128(buffer + 0, zero);
+ _mm_store_si128(buffer + 1, zero);
+ }
+
+ static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
+ const size_t size_mod32,
+ char* HH_RESTRICT buffer) {
+ for (size_t i = 0; i < size_mod32; ++i) {
+ buffer[i] = from[i];
+ }
+ }
+
+ static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
+ const size_t size_mod32,
+ char* HH_RESTRICT buffer,
+ const size_t buffer_valid) {
+ for (size_t i = 0; i < size_mod32; ++i) {
+ buffer[buffer_valid + i] = from[i];
+ }
+ }
+
+ HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
+ const size_t size_mod32,
+ const char* HH_RESTRICT buffer,
+ const size_t buffer_valid) {
+ HHPacket tmp HH_ALIGNAS(32);
+ for (size_t i = 0; i < buffer_valid; ++i) {
+ tmp[i] = buffer[i];
+ }
+ for (size_t i = 0; i < size_mod32; ++i) {
+ tmp[buffer_valid + i] = from[i];
+ }
+ Update(tmp);
+ }
+
+ private:
+ // Swap 32-bit halves of each lane (caller swaps 128-bit halves)
+ static HH_INLINE V2x64U Rotate64By32(const V2x64U& v) {
+ return V2x64U(_mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
+ }
+
+ // Rotates 32-bit lanes by "count" bits.
+ static HH_INLINE void Rotate32By(V2x64U* HH_RESTRICT vH,
+ V2x64U* HH_RESTRICT vL,
+ const uint64_t count) {
+ // WARNING: the shift count is 64 bits, so we can't reuse vsize_mod32,
+ // which is broadcast into 32-bit lanes.
+ const __m128i count_left = _mm_cvtsi64_si128(count);
+ const __m128i count_right = _mm_cvtsi64_si128(32 - count);
+ const V2x64U shifted_leftL(_mm_sll_epi32(*vL, count_left));
+ const V2x64U shifted_leftH(_mm_sll_epi32(*vH, count_left));
+ const V2x64U shifted_rightL(_mm_srl_epi32(*vL, count_right));
+ const V2x64U shifted_rightH(_mm_srl_epi32(*vH, count_right));
+ *vL = shifted_leftL | shifted_rightL;
+ *vH = shifted_leftH | shifted_rightH;
+ }
+
+ static HH_INLINE V2x64U ZipperMerge(const V2x64U& v) {
+ // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+ // varying degrees. In descending order of goodness, bytes
+ // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+ // As expected, the upper and lower bytes are much worse.
+ // For each 64-bit lane, our objectives are:
+ // 1) maximizing and equalizing total goodness across each lane's bytes;
+ // 2) mixing with bytes from the neighboring lane;
+ // 3) placing the worst bytes in the upper 32 bits because those will not
+ // be used in the next 32x32 multiplication.
+ const uint64_t hi = 0x070806090D0A040Bull;
+ const uint64_t lo = 0x000F010E05020C03ull;
+ return V2x64U(_mm_shuffle_epi8(v, V2x64U(hi, lo)));
+ }
+
+ HH_INLINE void Update(const V2x64U& packetH, const V2x64U& packetL) {
+ v1L += packetL;
+ v1H += packetH;
+ v1L += mul0L;
+ v1H += mul0H;
+ mul0L ^= V2x64U(_mm_mul_epu32(v1L, Rotate64By32(v0L)));
+ mul0H ^= V2x64U(_mm_mul_epu32(v1H, v0H >> 32));
+ v0L += mul1L;
+ v0H += mul1H;
+ mul1L ^= V2x64U(_mm_mul_epu32(v0L, Rotate64By32(v1L)));
+ mul1H ^= V2x64U(_mm_mul_epu32(v0H, v1H >> 32));
+ v0L += ZipperMerge(v1L);
+ v0H += ZipperMerge(v1H);
+ v1L += ZipperMerge(v0L);
+ v1H += ZipperMerge(v0H);
+ }
+
+ HH_INLINE void PermuteAndUpdate() {
+ // It is slightly better to permute v0 than v1; it will be added to v1.
+ // AVX-2 Permute also swaps 128-bit halves, so swap input operands.
+ Update(Rotate64By32(v0L), Rotate64By32(v0H));
+ }
+
+ // Returns a zero-initialized vector with the lower "size" = 0, 4, 8 or 12
+ // bytes loaded from "bytes". Serves as a replacement for AVX2 maskload_epi32.
+ static HH_INLINE V2x64U LoadMultipleOfFour(const char* bytes,
+ const size_t size) {
+ const uint32_t* words = reinterpret_cast<const uint32_t*>(bytes);
+ // Mask of 1-bits where the final 4 bytes should be inserted (replacement
+ // for variable shift/insert using broadcast+blend).
+ V2x64U mask4(_mm_cvtsi64_si128(0xFFFFFFFFULL)); // 'insert' into lane 0
+ V2x64U ret(0);
+ if (size & 8) {
+ ret = V2x64U(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(words)));
+ // mask4 = 0 ~0 0 0 ('insert' into lane 2)
+ mask4 = V2x64U(_mm_slli_si128(mask4, 8));
+ words += 2;
+ }
+ // Final 4 (possibly after the 8 above); 'insert' into lane 0 or 2 of ret.
+ if (size & 4) {
+ const __m128i word2 = _mm_cvtsi32_si128(words[0]);
+ // = 0 word2 0 word2; mask4 will select which lane to keep.
+ const V2x64U broadcast(_mm_shuffle_epi32(word2, 0x00));
+ // (slightly faster than blendv_epi8)
+ ret |= V2x64U(broadcast & mask4);
+ }
+ return ret;
+ }
+
+ // XORs x << 1 and x << 2 into *out after clearing the upper two bits of x.
+ // Bit shifts are only possible on independent 64-bit lanes. We therefore
+ // insert the upper bits of x[0] that were lost into x[1].
+ // Thanks to D. Lemire for helpful comments!
+ static HH_INLINE void XorByShift128Left12(const V2x64U& x,
+ V2x64U* HH_RESTRICT out) {
+ const V2x64U zero(_mm_setzero_si128());
+ const V2x64U sign_bit128(_mm_insert_epi32(zero, 0x80000000u, 3));
+ const V2x64U top_bits2 = x >> (64 - 2);
+ HH_COMPILER_FENCE;
+ const V2x64U shifted1_unmasked = x + x; // (avoids needing port0)
+
+ // Only the lower half of top_bits1 will be used, so we
+ // can compute it before clearing the upper two bits of x.
+ const V2x64U top_bits1 = x >> (64 - 1);
+ const V2x64U shifted2 = shifted1_unmasked + shifted1_unmasked;
+ HH_COMPILER_FENCE;
+
+ const V2x64U new_low_bits2(_mm_slli_si128(top_bits2, 8));
+ *out ^= shifted2;
+ // The result must be as if the upper two bits of the input had been clear,
+ // otherwise we're no longer computing a reduction.
+ const V2x64U shifted1 = AndNot(sign_bit128, shifted1_unmasked);
+ HH_COMPILER_FENCE;
+
+ const V2x64U new_low_bits1(_mm_slli_si128(top_bits1, 8));
+ *out ^= new_low_bits2;
+ *out ^= shifted1;
+ *out ^= new_low_bits1;
+ }
+
+ // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
+ // Input: a 256-bit number a3210.
+ static HH_INLINE V2x64U ModularReduction(const V2x64U& a32_unmasked,
+ const V2x64U& a10) {
+ // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
+ V2x64U out = a10;
+ XorByShift128Left12(a32_unmasked, &out);
+ return out;
+ }
+
+ V2x64U v0L;
+ V2x64U v0H;
+ V2x64U v1L;
+ V2x64U v1H;
+ V2x64U mul0L;
+ V2x64U mul0H;
+ V2x64U mul1L;
+ V2x64U mul1H;
+};
+
+} // namespace HH_TARGET_NAME
+} // namespace highwayhash
+
+#endif // HH_DISABLE_TARGET_SPECIFIC
+#endif // HIGHWAYHASH_HH_SSE41_H_
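LoadMultipleOfFour above is the subtlest loader in this file: it must fill the low size = 0, 4, 8 or 12 bytes of a zeroed vector without reading memory past them. A scalar reference with the same contract, useful as a mental model or test oracle (a sketch, not part of the library):

    #include <cstddef>
    #include <cstring>

    // Scalar analogue of LoadMultipleOfFour: fills 16 output bytes, the low
    // "size" of which (a multiple of four, at most 12) come from "bytes".
    void LoadMultipleOfFourScalar(const char* bytes, std::size_t size,
                                  unsigned char out[16]) {
      std::memset(out, 0, 16);
      std::memcpy(out, bytes, size & 12);  // reads exactly size & 12 bytes
    }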
diff --git a/contrib/libs/highwayhash/highwayhash/hh_types.h b/contrib/libs/highwayhash/highwayhash/hh_types.h
index f350d70f65..e5b0430f17 100644
--- a/contrib/libs/highwayhash/highwayhash/hh_types.h
+++ b/contrib/libs/highwayhash/highwayhash/hh_types.h
@@ -1,50 +1,50 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_HH_TYPES_H_
-#define HIGHWAYHASH_HH_TYPES_H_
-
-// WARNING: included from c_bindings => must be C-compatible.
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include <stddef.h> // size_t
-#include <stdint.h>
-
-#ifdef __cplusplus
-namespace highwayhash {
-#endif
-
-// 256-bit secret key that should remain unknown to attackers.
-// We recommend initializing it to a random value.
-typedef uint64_t HHKey[4];
-
-// How much input is hashed by one call to HHStateT::Update.
-typedef char HHPacket[32];
-
-// Hash 'return' types.
-typedef uint64_t HHResult64; // returned directly
-typedef uint64_t HHResult128[2];
-typedef uint64_t HHResult256[4];
-
-// Called if a test fails, indicating which target and size.
-typedef void (*HHNotify)(const char*, size_t);
-
-#ifdef __cplusplus
-} // namespace highwayhash
-#endif
-
-#endif // HIGHWAYHASH_HH_TYPES_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_TYPES_H_
+#define HIGHWAYHASH_HH_TYPES_H_
+
+// WARNING: included from c_bindings => must be C-compatible.
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stddef.h> // size_t
+#include <stdint.h>
+
+#ifdef __cplusplus
+namespace highwayhash {
+#endif
+
+// 256-bit secret key that should remain unknown to attackers.
+// We recommend initializing it to a random value.
+typedef uint64_t HHKey[4];
+
+// How much input is hashed by one call to HHStateT::Update.
+typedef char HHPacket[32];
+
+// Hash 'return' types.
+typedef uint64_t HHResult64; // returned directly
+typedef uint64_t HHResult128[2];
+typedef uint64_t HHResult256[4];
+
+// Called if a test fails, indicating which target and size.
+typedef void (*HHNotify)(const char*, size_t);
+
+#ifdef __cplusplus
+} // namespace highwayhash
+#endif
+
+#endif // HIGHWAYHASH_HH_TYPES_H_
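Because these are array typedefs rather than structs, the widths promised by the comments follow directly from sizeof; a compile-time sketch:

    #include "highwayhash/hh_types.h"

    static_assert(sizeof(highwayhash::HHKey) == 32, "256-bit key");
    static_assert(sizeof(highwayhash::HHPacket) == 32, "32-byte packet");
    static_assert(sizeof(highwayhash::HHResult256) == 32, "four u64 lanes");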
diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash.h b/contrib/libs/highwayhash/highwayhash/highwayhash.h
index cee1c31ba4..0cebc841fe 100644
--- a/contrib/libs/highwayhash/highwayhash/highwayhash.h
+++ b/contrib/libs/highwayhash/highwayhash/highwayhash.h
@@ -1,202 +1,202 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_HIGHWAYHASH_H_
-#define HIGHWAYHASH_HIGHWAYHASH_H_
-
-// This header's templates are useful for inlining into other CPU-specific code:
-// template<TargetBits Target> CodeUsingHash() { HighwayHashT<Target>(...); },
-// and can also be instantiated with HH_TARGET when callers don't care about the
-// exact implementation. Otherwise, they are implementation details of the
-// highwayhash_target wrapper. Use that instead if you need to detect the best
-// available implementation at runtime.
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-#include "highwayhash/hh_types.h"
-#include "highwayhash/iaca.h"
-
-// Include exactly one (see arch_specific.h) header, which defines a state
-// object in a target-specific namespace, e.g. AVX2::HHStateAVX2.
-// Attempts to use "computed includes" (#define MACRO "path/or_just_filename",
-// #include MACRO) fail with 'file not found', so we need an #if chain.
-#if HH_TARGET == HH_TARGET_AVX2
-#include "highwayhash/hh_avx2.h"
-#elif HH_TARGET == HH_TARGET_SSE41
-#include "highwayhash/hh_sse41.h"
-#elif HH_TARGET == HH_TARGET_Portable
-#include "highwayhash/hh_portable.h"
-#else
-#error "Unknown target, add its hh_*.h include here."
-#endif
-
-#ifndef HH_DISABLE_TARGET_SPECIFIC
-namespace highwayhash {
-
-// Alias templates (HHStateT) cannot be specialized, so we need a helper struct.
-// Note that hh_*.h don't just specialize HHStateT directly because vector128.h
-// must reside in a distinct namespace (to allow including it from multiple
-// translation units), and it is easier if its users, i.e. the concrete HHState,
-// also reside in that same namespace, which precludes specialization.
-template <TargetBits Target>
-struct HHStateForTarget {};
-
-template <>
-struct HHStateForTarget<HH_TARGET> {
- // (The namespace is sufficient and the additional HH_TARGET_NAME suffix is
- // technically redundant, but it makes searching easier.)
- using type = HH_TARGET_NAME::HH_ADD_TARGET_SUFFIX(HHState);
-};
-
-// Typically used as HHStateT<HH_TARGET>. It would be easier to just have a
-// concrete type HH_STATE, but this alias template is required by the
-// templates in highwayhash_target.cc.
-template <TargetBits Target>
-using HHStateT = typename HHStateForTarget<Target>::type;
-
-// Computes HighwayHash of "bytes" using the implementation chosen by "State".
-//
-// "state" is a HHStateT<> initialized with a key.
-// "bytes" is the data to hash (possibly unaligned).
-// "size" is the number of bytes to hash; we do not read any additional bytes.
-// "hash" is a HHResult* (either 64, 128 or 256 bits).
-//
-// HighwayHash is a strong pseudorandom function with security claims
-// [https://arxiv.org/abs/1612.06257]. It is intended as a safer general-purpose
-// hash, about 4x faster than SipHash and 10x faster than BLAKE2.
-//
-// This template allows callers (e.g. tests) to invoke a specific
-// implementation. It must be compiled with the flags required by the desired
-// implementation. If the entire program cannot be built with these flags, use
-// the wrapper in highwayhash_target.h instead.
-//
-// Callers wanting to hash multiple pieces of data should duplicate this
-// function, calling HHStateT::Update for each input and only Finalizing once.
-template <class State, typename Result>
-HH_INLINE void HighwayHashT(State* HH_RESTRICT state,
- const char* HH_RESTRICT bytes, const size_t size,
- Result* HH_RESTRICT hash) {
- // BeginIACA();
- const size_t remainder = size & (sizeof(HHPacket) - 1);
- const size_t truncated = size & ~(sizeof(HHPacket) - 1);
- for (size_t offset = 0; offset < truncated; offset += sizeof(HHPacket)) {
- state->Update(*reinterpret_cast<const HHPacket*>(bytes + offset));
- }
-
- if (remainder != 0) {
- state->UpdateRemainder(bytes + truncated, remainder);
- }
-
- state->Finalize(hash);
- // EndIACA();
-}
-
-// Wrapper class for incrementally hashing a series of data ranges. The final
-// result is the same as HighwayHashT of the concatenation of all the ranges.
-// This is useful for computing the hash of cords, iovecs, and similar
-// data structures.
-template <TargetBits Target>
-class HighwayHashCatT {
- public:
- HH_INLINE HighwayHashCatT(const HHKey& key) : state_(key) {
- // Avoids msan uninitialized-memory warnings.
- HHStateT<Target>::ZeroInitialize(buffer_);
- }
-
- // Resets the state of the hasher so it can be used to hash a new string.
- HH_INLINE void Reset(const HHKey& key) {
- state_.Reset(key);
- buffer_usage_ = 0;
- }
-
- // Adds "bytes" to the internal buffer, feeding it to HHStateT::Update as
- // required. Call this as often as desired. Only reads bytes within the
- // interval [bytes, bytes + num_bytes). "num_bytes" == 0 has no effect.
- // There are no alignment requirements.
- HH_INLINE void Append(const char* HH_RESTRICT bytes, size_t num_bytes) {
- // BeginIACA();
- const size_t capacity = sizeof(HHPacket) - buffer_usage_;
- // New bytes fit within buffer, but still not enough to Update.
- if (HH_UNLIKELY(num_bytes < capacity)) {
- HHStateT<Target>::AppendPartial(bytes, num_bytes, buffer_, buffer_usage_);
- buffer_usage_ += num_bytes;
- return;
- }
-
- // HACK: ensures the state is kept in SIMD registers; otherwise, Update
- // constantly load/stores its operands, which is much slower.
- // Restrict-qualified pointers to external state or the state_ member are
- // not sufficient for keeping this in registers.
- HHStateT<Target> state_copy = state_;
-
- // Have prior bytes to flush.
- const size_t buffer_usage = buffer_usage_;
- if (HH_LIKELY(buffer_usage != 0)) {
- // Calls Update with prior buffer contents plus new data. Does not modify
- // the buffer because some implementations can load into SIMD registers
- // and Append to them directly.
- state_copy.AppendAndUpdate(bytes, capacity, buffer_, buffer_usage);
- bytes += capacity;
- num_bytes -= capacity;
- }
-
- // Buffer currently empty => Update directly from the source.
- while (num_bytes >= sizeof(HHPacket)) {
- state_copy.Update(*reinterpret_cast<const HHPacket*>(bytes));
- bytes += sizeof(HHPacket);
- num_bytes -= sizeof(HHPacket);
- }
-
- // Unconditionally assign even if zero because we didn't reset to zero
- // after the AppendAndUpdate above.
- buffer_usage_ = num_bytes;
-
- state_ = state_copy;
-
- // Store any remainders in buffer, no-op if multiple of a packet.
- if (HH_LIKELY(num_bytes != 0)) {
- HHStateT<Target>::CopyPartial(bytes, num_bytes, buffer_);
- }
- // EndIACA();
- }
-
- // Stores the resulting 64, 128 or 256-bit hash of all data passed to Append.
- // Must be called exactly once, or after a prior Reset.
- template <typename Result> // HHResult*
- HH_INLINE void Finalize(Result* HH_RESTRICT hash) {
- // BeginIACA();
- HHStateT<Target> state_copy = state_;
- const size_t buffer_usage = buffer_usage_;
- if (HH_LIKELY(buffer_usage != 0)) {
- state_copy.UpdateRemainder(buffer_, buffer_usage);
- }
- state_copy.Finalize(hash);
- // EndIACA();
- }
-
- private:
- HHPacket buffer_ HH_ALIGNAS(64);
- HHStateT<Target> state_ HH_ALIGNAS(32);
- // How many bytes in buffer_ (starting with offset 0) are valid.
- size_t buffer_usage_ = 0;
-};
-
-} // namespace highwayhash
-#endif // HH_DISABLE_TARGET_SPECIFIC
-#endif // HIGHWAYHASH_HIGHWAYHASH_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HIGHWAYHASH_H_
+#define HIGHWAYHASH_HIGHWAYHASH_H_
+
+// This header's templates are useful for inlining into other CPU-specific code:
+// template<TargetBits Target> CodeUsingHash() { HighwayHashT<Target>(...); },
+// and can also be instantiated with HH_TARGET when callers don't care about the
+// exact implementation. Otherwise, they are implementation details of the
+// highwayhash_target wrapper. Use that instead if you need to detect the best
+// available implementation at runtime.
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/iaca.h"
+
+// Include exactly one (see arch_specific.h) header, which defines a state
+// object in a target-specific namespace, e.g. AVX2::HHStateAVX2.
+// Attempts to use "computed includes" (#define MACRO "path/or_just_filename",
+// #include MACRO) fail with 'file not found', so we need an #if chain.
+#if HH_TARGET == HH_TARGET_AVX2
+#include "highwayhash/hh_avx2.h"
+#elif HH_TARGET == HH_TARGET_SSE41
+#include "highwayhash/hh_sse41.h"
+#elif HH_TARGET == HH_TARGET_Portable
+#include "highwayhash/hh_portable.h"
+#else
+#error "Unknown target, add its hh_*.h include here."
+#endif
+
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+namespace highwayhash {
+
+// Alias templates (HHStateT) cannot be specialized, so we need a helper struct.
+// Note that hh_*.h don't just specialize HHStateT directly because vector128.h
+// must reside in a distinct namespace (to allow including it from multiple
+// translation units), and it is easier if its users, i.e. the concrete HHState,
+// also reside in that same namespace, which precludes specialization.
+template <TargetBits Target>
+struct HHStateForTarget {};
+
+template <>
+struct HHStateForTarget<HH_TARGET> {
+ // (The namespace is sufficient and the additional HH_TARGET_NAME suffix is
+ // technically redundant, but it makes searching easier.)
+ using type = HH_TARGET_NAME::HH_ADD_TARGET_SUFFIX(HHState);
+};
+
+// Typically used as HHStateT<HH_TARGET>. It would be easier to just have a
+// concrete type HH_STATE, but this alias template is required by the
+// templates in highwayhash_target.cc.
+template <TargetBits Target>
+using HHStateT = typename HHStateForTarget<Target>::type;
+
+// Computes HighwayHash of "bytes" using the implementation chosen by "State".
+//
+// "state" is a HHStateT<> initialized with a key.
+// "bytes" is the data to hash (possibly unaligned).
+// "size" is the number of bytes to hash; we do not read any additional bytes.
+// "hash" is a HHResult* (either 64, 128 or 256 bits).
+//
+// HighwayHash is a strong pseudorandom function with security claims
+// [https://arxiv.org/abs/1612.06257]. It is intended as a safer general-purpose
+// hash, about 4x faster than SipHash and 10x faster than BLAKE2.
+//
+// This template allows callers (e.g. tests) to invoke a specific
+// implementation. It must be compiled with the flags required by the desired
+// implementation. If the entire program cannot be built with these flags, use
+// the wrapper in highwayhash_target.h instead.
+//
+// Callers wanting to hash multiple pieces of data should duplicate this
+// function, calling HHStateT::Update for each input and only Finalizing once.
+template <class State, typename Result>
+HH_INLINE void HighwayHashT(State* HH_RESTRICT state,
+ const char* HH_RESTRICT bytes, const size_t size,
+ Result* HH_RESTRICT hash) {
+ // BeginIACA();
+ const size_t remainder = size & (sizeof(HHPacket) - 1);
+ const size_t truncated = size & ~(sizeof(HHPacket) - 1);
+ for (size_t offset = 0; offset < truncated; offset += sizeof(HHPacket)) {
+ state->Update(*reinterpret_cast<const HHPacket*>(bytes + offset));
+ }
+
+ if (remainder != 0) {
+ state->UpdateRemainder(bytes + truncated, remainder);
+ }
+
+ state->Finalize(hash);
+ // EndIACA();
+}
+
+// Wrapper class for incrementally hashing a series of data ranges. The final
+// result is the same as HighwayHashT of the concatenation of all the ranges.
+// This is useful for computing the hash of cords, iovecs, and similar
+// data structures.
+template <TargetBits Target>
+class HighwayHashCatT {
+ public:
+ HH_INLINE HighwayHashCatT(const HHKey& key) : state_(key) {
+ // Avoids msan uninitialized-memory warnings.
+ HHStateT<Target>::ZeroInitialize(buffer_);
+ }
+
+ // Resets the state of the hasher so it can be used to hash a new string.
+ HH_INLINE void Reset(const HHKey& key) {
+ state_.Reset(key);
+ buffer_usage_ = 0;
+ }
+
+ // Adds "bytes" to the internal buffer, feeding it to HHStateT::Update as
+ // required. Call this as often as desired. Only reads bytes within the
+ // interval [bytes, bytes + num_bytes). "num_bytes" == 0 has no effect.
+ // There are no alignment requirements.
+ HH_INLINE void Append(const char* HH_RESTRICT bytes, size_t num_bytes) {
+ // BeginIACA();
+ const size_t capacity = sizeof(HHPacket) - buffer_usage_;
+ // New bytes fit within buffer, but still not enough to Update.
+ if (HH_UNLIKELY(num_bytes < capacity)) {
+ HHStateT<Target>::AppendPartial(bytes, num_bytes, buffer_, buffer_usage_);
+ buffer_usage_ += num_bytes;
+ return;
+ }
+
+ // HACK: ensures the state is kept in SIMD registers; otherwise, Update
+ // constantly load/stores its operands, which is much slower.
+ // Restrict-qualified pointers to external state or the state_ member are
+ // not sufficient for keeping this in registers.
+ HHStateT<Target> state_copy = state_;
+
+ // Have prior bytes to flush.
+ const size_t buffer_usage = buffer_usage_;
+ if (HH_LIKELY(buffer_usage != 0)) {
+ // Calls Update with prior buffer contents plus new data. Does not modify
+ // the buffer because some implementations can load into SIMD registers
+ // and Append to them directly.
+ state_copy.AppendAndUpdate(bytes, capacity, buffer_, buffer_usage);
+ bytes += capacity;
+ num_bytes -= capacity;
+ }
+
+ // Buffer currently empty => Update directly from the source.
+ while (num_bytes >= sizeof(HHPacket)) {
+ state_copy.Update(*reinterpret_cast<const HHPacket*>(bytes));
+ bytes += sizeof(HHPacket);
+ num_bytes -= sizeof(HHPacket);
+ }
+
+ // Unconditionally assign even if zero because we didn't reset to zero
+ // after the AppendAndUpdate above.
+ buffer_usage_ = num_bytes;
+
+ state_ = state_copy;
+
+ // Store any remainders in buffer, no-op if multiple of a packet.
+ if (HH_LIKELY(num_bytes != 0)) {
+ HHStateT<Target>::CopyPartial(bytes, num_bytes, buffer_);
+ }
+ // EndIACA();
+ }
+
+ // Stores the resulting 64, 128 or 256-bit hash of all data passed to Append.
+ // Must be called exactly once, or after a prior Reset.
+ template <typename Result> // HHResult*
+ HH_INLINE void Finalize(Result* HH_RESTRICT hash) {
+ // BeginIACA();
+ HHStateT<Target> state_copy = state_;
+ const size_t buffer_usage = buffer_usage_;
+ if (HH_LIKELY(buffer_usage != 0)) {
+ state_copy.UpdateRemainder(buffer_, buffer_usage);
+ }
+ state_copy.Finalize(hash);
+ // EndIACA();
+ }
+
+ private:
+ HHPacket buffer_ HH_ALIGNAS(64);
+ HHStateT<Target> state_ HH_ALIGNAS(32);
+ // How many bytes in buffer_ (starting with offset 0) are valid.
+ size_t buffer_usage_ = 0;
+};
+
+} // namespace highwayhash
+#endif // HH_DISABLE_TARGET_SPECIFIC
+#endif // HIGHWAYHASH_HIGHWAYHASH_H_
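A minimal usage sketch of the two entry points above, compiled with the selected target's flags; the key values are placeholders (a real key should be a randomly generated secret):

    #include "highwayhash/highwayhash.h"

    using namespace highwayhash;

    void Demo(const char* data, size_t size) {
      const HHKey key HH_ALIGNAS(32) = {1, 2, 3, 4};  // placeholder key

      // One-shot: hash the whole buffer.
      HHStateT<HH_TARGET> state(key);
      HHResult64 hash1;
      HighwayHashT(&state, data, size, &hash1);

      // Incremental: yields the same result as hashing the concatenation.
      HighwayHashCatT<HH_TARGET> cat(key);
      cat.Append(data, size / 2);
      cat.Append(data + size / 2, size - size / 2);
      HHResult64 hash2;
      cat.Finalize(&hash2);
      // hash1 == hash2
    }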
diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_target.cc b/contrib/libs/highwayhash/highwayhash/highwayhash_target.cc
index 74022f64bf..f7dc4a0d54 100644
--- a/contrib/libs/highwayhash/highwayhash/highwayhash_target.cc
+++ b/contrib/libs/highwayhash/highwayhash/highwayhash_target.cc
@@ -1,104 +1,104 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// WARNING: this is a "restricted" source file; avoid including any headers
-// unless they are also restricted. See arch_specific.h for details.
-
-#include "highwayhash/highwayhash_target.h"
-
-#include "highwayhash/highwayhash.h"
-
-#ifndef HH_DISABLE_TARGET_SPECIFIC
-namespace highwayhash {
-
-extern "C" {
-uint64_t HH_ADD_TARGET_SUFFIX(HighwayHash64_)(const HHKey key,
- const char* bytes,
- const uint64_t size) {
- HHStateT<HH_TARGET> state(key);
- HHResult64 result;
- HighwayHashT(&state, bytes, size, &result);
- return result;
-}
-} // extern "C"
-
-template <TargetBits Target>
-void HighwayHash<Target>::operator()(const HHKey& key,
- const char* HH_RESTRICT bytes,
- const size_t size,
- HHResult64* HH_RESTRICT hash) const {
- HHStateT<Target> state(key);
- HighwayHashT(&state, bytes, size, hash);
-}
-
-template <TargetBits Target>
-void HighwayHash<Target>::operator()(const HHKey& key,
- const char* HH_RESTRICT bytes,
- const size_t size,
- HHResult128* HH_RESTRICT hash) const {
- HHStateT<Target> state(key);
- HighwayHashT(&state, bytes, size, hash);
-}
-
-template <TargetBits Target>
-void HighwayHash<Target>::operator()(const HHKey& key,
- const char* HH_RESTRICT bytes,
- const size_t size,
- HHResult256* HH_RESTRICT hash) const {
- HHStateT<Target> state(key);
- HighwayHashT(&state, bytes, size, hash);
-}
-
-template <TargetBits Target>
-void HighwayHashCat<Target>::operator()(const HHKey& key,
- const StringView* HH_RESTRICT fragments,
- const size_t num_fragments,
- HHResult64* HH_RESTRICT hash) const {
- HighwayHashCatT<Target> cat(key);
- for (size_t i = 0; i < num_fragments; ++i) {
- cat.Append(fragments[i].data, fragments[i].num_bytes);
- }
- cat.Finalize(hash);
-}
-
-template <TargetBits Target>
-void HighwayHashCat<Target>::operator()(const HHKey& key,
- const StringView* HH_RESTRICT fragments,
- const size_t num_fragments,
- HHResult128* HH_RESTRICT hash) const {
- HighwayHashCatT<Target> cat(key);
- for (size_t i = 0; i < num_fragments; ++i) {
- cat.Append(fragments[i].data, fragments[i].num_bytes);
- }
- cat.Finalize(hash);
-}
-
-template <TargetBits Target>
-void HighwayHashCat<Target>::operator()(const HHKey& key,
- const StringView* HH_RESTRICT fragments,
- const size_t num_fragments,
- HHResult256* HH_RESTRICT hash) const {
- HighwayHashCatT<Target> cat(key);
- for (size_t i = 0; i < num_fragments; ++i) {
- cat.Append(fragments[i].data, fragments[i].num_bytes);
- }
- cat.Finalize(hash);
-}
-
-// Instantiate for the current target.
-template struct HighwayHash<HH_TARGET>;
-template struct HighwayHashCat<HH_TARGET>;
-
-} // namespace highwayhash
-#endif // HH_DISABLE_TARGET_SPECIFIC
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#include "highwayhash/highwayhash_target.h"
+
+#include "highwayhash/highwayhash.h"
+
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+namespace highwayhash {
+
+extern "C" {
+uint64_t HH_ADD_TARGET_SUFFIX(HighwayHash64_)(const HHKey key,
+ const char* bytes,
+ const uint64_t size) {
+ HHStateT<HH_TARGET> state(key);
+ HHResult64 result;
+ HighwayHashT(&state, bytes, size, &result);
+ return result;
+}
+} // extern "C"
+
+template <TargetBits Target>
+void HighwayHash<Target>::operator()(const HHKey& key,
+ const char* HH_RESTRICT bytes,
+ const size_t size,
+ HHResult64* HH_RESTRICT hash) const {
+ HHStateT<Target> state(key);
+ HighwayHashT(&state, bytes, size, hash);
+}
+
+template <TargetBits Target>
+void HighwayHash<Target>::operator()(const HHKey& key,
+ const char* HH_RESTRICT bytes,
+ const size_t size,
+ HHResult128* HH_RESTRICT hash) const {
+ HHStateT<Target> state(key);
+ HighwayHashT(&state, bytes, size, hash);
+}
+
+template <TargetBits Target>
+void HighwayHash<Target>::operator()(const HHKey& key,
+ const char* HH_RESTRICT bytes,
+ const size_t size,
+ HHResult256* HH_RESTRICT hash) const {
+ HHStateT<Target> state(key);
+ HighwayHashT(&state, bytes, size, hash);
+}
+
+template <TargetBits Target>
+void HighwayHashCat<Target>::operator()(const HHKey& key,
+ const StringView* HH_RESTRICT fragments,
+ const size_t num_fragments,
+ HHResult64* HH_RESTRICT hash) const {
+ HighwayHashCatT<Target> cat(key);
+ for (size_t i = 0; i < num_fragments; ++i) {
+ cat.Append(fragments[i].data, fragments[i].num_bytes);
+ }
+ cat.Finalize(hash);
+}
+
+template <TargetBits Target>
+void HighwayHashCat<Target>::operator()(const HHKey& key,
+ const StringView* HH_RESTRICT fragments,
+ const size_t num_fragments,
+ HHResult128* HH_RESTRICT hash) const {
+ HighwayHashCatT<Target> cat(key);
+ for (size_t i = 0; i < num_fragments; ++i) {
+ cat.Append(fragments[i].data, fragments[i].num_bytes);
+ }
+ cat.Finalize(hash);
+}
+
+template <TargetBits Target>
+void HighwayHashCat<Target>::operator()(const HHKey& key,
+ const StringView* HH_RESTRICT fragments,
+ const size_t num_fragments,
+ HHResult256* HH_RESTRICT hash) const {
+ HighwayHashCatT<Target> cat(key);
+ for (size_t i = 0; i < num_fragments; ++i) {
+ cat.Append(fragments[i].data, fragments[i].num_bytes);
+ }
+ cat.Finalize(hash);
+}
+
+// Instantiate for the current target.
+template struct HighwayHash<HH_TARGET>;
+template struct HighwayHashCat<HH_TARGET>;
+
+} // namespace highwayhash
+#endif // HH_DISABLE_TARGET_SPECIFIC
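For reference, the functors restored above are thin adapters: each operator() builds the per-target state and forwards to HighwayHashT. A program compiled entirely for one CPU can skip the dispatcher and call the same primitives directly, as the header comment below notes. A minimal sketch under that assumption (Hash64Direct and the key constants are illustrative, not part of the library):

#include <stddef.h>
#include "highwayhash/highwayhash.h"

using namespace highwayhash;

// Illustrative only: hashes "size" bytes with the implementation this
// translation unit was compiled for (HH_TARGET).
uint64_t Hash64Direct(const char* bytes, const size_t size) {
  const HHKey key HH_ALIGNAS(32) = {1, 2, 3, 4};  // use a secret key in practice
  HHStateT<HH_TARGET> state(key);
  HHResult64 result;
  HighwayHashT(&state, bytes, size, &result);
  return result;
}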
diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_target.h b/contrib/libs/highwayhash/highwayhash/highwayhash_target.h
index 08b803f191..3d6f33f236 100644
--- a/contrib/libs/highwayhash/highwayhash/highwayhash_target.h
+++ b/contrib/libs/highwayhash/highwayhash/highwayhash_target.h
@@ -1,91 +1,91 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
-#define HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
-
-// Adapter for the InstructionSets::Run dispatcher, which invokes the best
-// implementations available on the current CPU.
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-#include "highwayhash/hh_types.h"
-
-namespace highwayhash {
-
-// Usage: InstructionSets::Run<HighwayHash>(key, bytes, size, hash).
-// This incurs some small dispatch overhead. If the entire program is compiled
-// for the target CPU, you can instead call HighwayHashT directly to avoid any
-// overhead. This template is instantiated in the source file, which is
-// compiled once for every target with the required flags (e.g. -mavx2).
-template <TargetBits Target>
-struct HighwayHash {
- // Stores a 64/128/256 bit hash of "bytes" using the HighwayHashT
- // implementation for the "Target" CPU. The hash result is identical
- // regardless of which implementation is used.
- //
- // "key" is a (randomly generated or hard-coded) HHKey.
- // "bytes" is the data to hash (possibly unaligned).
- // "size" is the number of bytes to hash; we do not read any additional bytes.
- // "hash" is a HHResult* (either 64, 128 or 256 bits).
- //
- // HighwayHash is a strong pseudorandom function with security claims
- // [https://arxiv.org/abs/1612.06257]. It is intended as a safer
- // general-purpose hash, 5x faster than SipHash and 10x faster than BLAKE2.
- void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
- const size_t size, HHResult64* HH_RESTRICT hash) const;
- void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
- const size_t size, HHResult128* HH_RESTRICT hash) const;
- void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
- const size_t size, HHResult256* HH_RESTRICT hash) const;
-};
-
-// Replacement for C++17 std::string_view that avoids dependencies.
-// A struct requires fewer allocations when calling HighwayHashCat with
-// non-const "num_fragments".
-struct StringView {
- const char* data; // not necessarily aligned/padded
- size_t num_bytes; // possibly zero
-};
-
-// Note: this interface avoids dispatch overhead per fragment.
-template <TargetBits Target>
-struct HighwayHashCat {
- // Stores a 64/128/256 bit hash of all "num_fragments" "fragments" using the
- // HighwayHashCatT implementation for "Target". The hash result is identical
- // to HighwayHash of the flattened data, regardless of Target.
- //
- // "key" is a (randomly generated or hard-coded) HHKey.
- // "fragments" contain unaligned pointers and the number of valid bytes.
- // "num_fragments" indicates the number of entries in "fragments".
- // "hash" is a HHResult* (either 64, 128 or 256 bits).
- void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments,
- const size_t num_fragments,
- HHResult64* HH_RESTRICT hash) const;
- void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments,
- const size_t num_fragments,
- HHResult128* HH_RESTRICT hash) const;
- void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments,
- const size_t num_fragments,
- HHResult256* HH_RESTRICT hash) const;
-};
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
+#define HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
+
+// Adapter for the InstructionSets::Run dispatcher, which invokes the best
+// implementations available on the current CPU.
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_types.h"
+
+namespace highwayhash {
+
+// Usage: InstructionSets::Run<HighwayHash>(key, bytes, size, hash).
+// This incurs some small dispatch overhead. If the entire program is compiled
+// for the target CPU, you can instead call HighwayHashT directly to avoid any
+// overhead. This template is instantiated in the source file, which is
+// compiled once for every target with the required flags (e.g. -mavx2).
+template <TargetBits Target>
+struct HighwayHash {
+ // Stores a 64/128/256 bit hash of "bytes" using the HighwayHashT
+ // implementation for the "Target" CPU. The hash result is identical
+ // regardless of which implementation is used.
+ //
+ // "key" is a (randomly generated or hard-coded) HHKey.
+ // "bytes" is the data to hash (possibly unaligned).
+ // "size" is the number of bytes to hash; we do not read any additional bytes.
+ // "hash" is a HHResult* (either 64, 128 or 256 bits).
+ //
+ // HighwayHash is a strong pseudorandom function with security claims
+ // [https://arxiv.org/abs/1612.06257]. It is intended as a safer
+ // general-purpose hash, 5x faster than SipHash and 10x faster than BLAKE2.
+ void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+ const size_t size, HHResult64* HH_RESTRICT hash) const;
+ void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+ const size_t size, HHResult128* HH_RESTRICT hash) const;
+ void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+ const size_t size, HHResult256* HH_RESTRICT hash) const;
+};
+
+// Replacement for C++17 std::string_view that avoids dependencies.
+// A struct requires fewer allocations when calling HighwayHashCat with
+// non-const "num_fragments".
+struct StringView {
+ const char* data; // not necessarily aligned/padded
+ size_t num_bytes; // possibly zero
+};
+
+// Note: this interface avoids dispatch overhead per fragment.
+template <TargetBits Target>
+struct HighwayHashCat {
+ // Stores a 64/128/256 bit hash of all "num_fragments" "fragments" using the
+ // HighwayHashCatT implementation for "Target". The hash result is identical
+ // to HighwayHash of the flattened data, regardless of Target.
+ //
+ // "key" is a (randomly generated or hard-coded) HHKey.
+ // "fragments" contain unaligned pointers and the number of valid bytes.
+ // "num_fragments" indicates the number of entries in "fragments".
+ // "hash" is a HHResult* (either 64, 128 or 256 bits).
+ void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments,
+ const size_t num_fragments,
+ HHResult64* HH_RESTRICT hash) const;
+ void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments,
+ const size_t num_fragments,
+ HHResult128* HH_RESTRICT hash) const;
+ void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments,
+ const size_t num_fragments,
+ HHResult256* HH_RESTRICT hash) const;
+};
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
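The header above is the dispatch-facing API. A hedged usage sketch follows; HashFragments is a hypothetical caller, and it assumes InstructionSets::Run forwards its arguments to HighwayHashCat's operator() the same way the header's comment shows for HighwayHash:

#include "highwayhash/highwayhash_target.h"
#include "highwayhash/instruction_sets.h"

using namespace highwayhash;

// Hypothetical caller: one dispatch for all fragments, matching the
// "avoids dispatch overhead per fragment" note above.
uint64_t HashFragments() {
  const HHKey key = {1, 2, 3, 4};  // illustrative; use a random key in practice
  const StringView fragments[2] = {{"hello", 5}, {" world", 6}};
  HHResult64 hash;
  InstructionSets::Run<HighwayHashCat>(key, fragments, 2, &hash);
  return hash;
}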
diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_test.cc b/contrib/libs/highwayhash/highwayhash/highwayhash_test.cc
index b0f8b88712..d7f914af78 100644
--- a/contrib/libs/highwayhash/highwayhash/highwayhash_test.cc
+++ b/contrib/libs/highwayhash/highwayhash/highwayhash_test.cc
@@ -1,388 +1,388 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Ensures each implementation of HighwayHash returns consistent and unchanging
-// hash values.
-
-#include "highwayhash/highwayhash_test_target.h"
-
-#include <stddef.h>
-#include <atomic>
-#include <cstdio>
-#include <cstdlib>
-#include <vector>
-
-#ifdef HH_GOOGLETEST
-#include "testing/base/public/gunit.h"
-#endif
-
-#include "highwayhash/data_parallel.h"
-#include "highwayhash/instruction_sets.h"
-
-// Define to nonzero in order to print the (new) golden outputs.
-#define PRINT_RESULTS 0
-
-namespace highwayhash {
-namespace {
-
-// Known-good outputs are verified for all lengths in [0, 64].
-const size_t kMaxSize = 64;
-
-#if PRINT_RESULTS
-void Print(const HHResult64 result) { printf("0x%016llXull,\n", static_cast<unsigned long long>(result)); }
-
-// For HHResult128/256.
-template <int kNumLanes>
-void Print(const HHResult64 (&result)[kNumLanes]) {
-  printf("{ ");
-  for (int i = 0; i < kNumLanes; ++i) {
-    if (i != 0) {
-      printf(", ");
-    }
-    // Cast for portability: uint64_t is not "unsigned long" on all platforms.
-    printf("0x%016llXull", static_cast<unsigned long long>(result[i]));
-  }
-  printf("},\n");
-#endif // PRINT_RESULTS
-
-// Called when any test fails; exits immediately because one mismatch usually
-// implies many others.
-void OnFailure(const char* target_name, const size_t size) {
-  printf("%s: mismatch at size %zu\n", target_name, size);
-#ifdef HH_GOOGLETEST
-  EXPECT_TRUE(false);
-#endif
-  exit(1);
-}
-
-// Verifies every combination of implementation and input size. Returns which
-// targets were run/verified.
-template <typename Result>
-TargetBits VerifyImplementations(const Result (&known_good)[kMaxSize + 1]) {
- const HHKey key = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL,
- 0x1716151413121110ULL, 0x1F1E1D1C1B1A1918ULL};
-
- TargetBits targets = ~0U;
-
- // For each test input: empty string, 00, 00 01, ...
- char in[kMaxSize + 1] = {0};
- // Fast enough that we don't need a thread pool.
- for (uint64_t size = 0; size <= kMaxSize; ++size) {
- in[size] = static_cast<char>(size);
-#if PRINT_RESULTS
- Result actual;
- targets &= InstructionSets::Run<HighwayHash>(key, in, size, &actual);
- Print(actual);
-#else
- const Result* expected = &known_good[size];
- targets &= InstructionSets::RunAll<HighwayHashTest>(key, in, size, expected,
- &OnFailure);
-#endif
- }
- return targets;
-}
-
-// Cat
-
-void OnCatFailure(const char* target_name, const size_t size) {
-  printf("%s: Cat mismatch at size %zu\n", target_name, size);
-#ifdef HH_GOOGLETEST
-  EXPECT_TRUE(false);
-#endif
-  exit(1);
-}
-
-// Returns which targets were run/verified.
-template <typename Result>
-TargetBits VerifyCat(ThreadPool* pool) {
- // Reversed order vs prior test.
- const HHKey key = {0x1F1E1D1C1B1A1918ULL, 0x1716151413121110ULL,
- 0x0F0E0D0C0B0A0908ULL, 0x0706050403020100ULL};
-
- const size_t kMaxSize = 3 * 35;
- std::vector<char> flat;
- flat.reserve(kMaxSize);
- srand(129);
- for (size_t size = 0; size < kMaxSize; ++size) {
- flat.push_back(static_cast<char>(rand() & 0xFF));
- }
-
- std::atomic<TargetBits> targets{~0U};
-
- pool->Run(0, kMaxSize, [&key, &flat, &targets](const uint32_t i) {
- Result dummy;
- targets.fetch_and(InstructionSets::RunAll<HighwayHashCatTest>(
- key, flat.data(), i, &dummy, &OnCatFailure));
- });
- return targets.load();
-}
-
-const HHResult64 kExpected64[kMaxSize + 1] = {
- 0x907A56DE22C26E53ull, 0x7EAB43AAC7CDDD78ull, 0xB8D0569AB0B53D62ull,
- 0x5C6BEFAB8A463D80ull, 0xF205A46893007EDAull, 0x2B8A1668E4A94541ull,
- 0xBD4CCC325BEFCA6Full, 0x4D02AE1738F59482ull, 0xE1205108E55F3171ull,
- 0x32D2644EC77A1584ull, 0xF6E10ACDB103A90Bull, 0xC3BBF4615B415C15ull,
- 0x243CC2040063FA9Cull, 0xA89A58CE65E641FFull, 0x24B031A348455A23ull,
- 0x40793F86A449F33Bull, 0xCFAB3489F97EB832ull, 0x19FE67D2C8C5C0E2ull,
- 0x04DD90A69C565CC2ull, 0x75D9518E2371C504ull, 0x38AD9B1141D3DD16ull,
- 0x0264432CCD8A70E0ull, 0xA9DB5A6288683390ull, 0xD7B05492003F028Cull,
- 0x205F615AEA59E51Eull, 0xEEE0C89621052884ull, 0x1BFC1A93A7284F4Full,
- 0x512175B5B70DA91Dull, 0xF71F8976A0A2C639ull, 0xAE093FEF1F84E3E7ull,
- 0x22CA92B01161860Full, 0x9FC7007CCF035A68ull, 0xA0C964D9ECD580FCull,
- 0x2C90F73CA03181FCull, 0x185CF84E5691EB9Eull, 0x4FC1F5EF2752AA9Bull,
- 0xF5B7391A5E0A33EBull, 0xB9B84B83B4E96C9Cull, 0x5E42FE712A5CD9B4ull,
- 0xA150F2F90C3F97DCull, 0x7FA522D75E2D637Dull, 0x181AD0CC0DFFD32Bull,
- 0x3889ED981E854028ull, 0xFB4297E8C586EE2Dull, 0x6D064A45BB28059Cull,
- 0x90563609B3EC860Cull, 0x7AA4FCE94097C666ull, 0x1326BAC06B911E08ull,
- 0xB926168D2B154F34ull, 0x9919848945B1948Dull, 0xA2A98FC534825EBEull,
- 0xE9809095213EF0B6ull, 0x582E5483707BC0E9ull, 0x086E9414A88A6AF5ull,
- 0xEE86B98D20F6743Dull, 0xF89B7FF609B1C0A7ull, 0x4C7D9CC19E22C3E8ull,
- 0x9A97005024562A6Full, 0x5DD41CF423E6EBEFull, 0xDF13609C0468E227ull,
- 0x6E0DA4F64188155Aull, 0xB755BA4B50D7D4A1ull, 0x887A3484647479BDull,
- 0xAB8EEBE9BF2139A0ull, 0x75542C5D4CD2A6FFull};
-
-const HHResult128 kExpected128[kMaxSize + 1] = {
- {0x0679D1E884C28A7Cull, 0x2BCA2547F904748Dull},
- {0x7F3A39BCC2D897B9ull, 0x4A7E113CA064D91Full},
- {0x6AB34B92C5AB85BFull, 0xED7AC546689D76C2ull},
- {0xAC6AF8405A4A7DBEull, 0xD78FB7953256C3E1ull},
- {0x5A6E8CF789B86448ull, 0x834EF47C1BEDC218ull},
- {0x8EBFE0B573F425A3ull, 0xBCFCC410CB84325Aull},
- {0xA1E19717CAB8F1D6ull, 0x2AA50671881F877Dull},
- {0x0B595302950DA1ECull, 0x46932DE27204B388ull},
- {0x02FB033F200F89D4ull, 0xFEC3D7BB3B421F92ull},
- {0x0A5479D46CC1EADEull, 0x0C16A2D5A0F1C3DEull},
- {0xF759E41DDD621106ull, 0xB43D70116E004750ull},
- {0x980010BC36A4E98Full, 0x27479317AE00BBD1ull},
- {0x3BABF3B23761A379ull, 0xACCDC28E0256F326ull},
- {0x5780CD04269E142Eull, 0xBB70EE3F23BDEDA9ull},
- {0x4A401F1937E99EC3ull, 0x4B3D1385D6B4E214ull},
- {0x045C6EDE080E2CB0ull, 0x7327B45D2132DC89ull},
- {0x97E1624BEB1C1756ull, 0xB7137E1B69D45024ull},
- {0x31DBA8E3DB0BF012ull, 0x3E66E6A78A729B16ull},
- {0x34D6DF1B5D8AF2A7ull, 0x4F1A47FCBC39EB55ull},
- {0xE2C6BE2D47E5DCBCull, 0xD2FF85284E307C1Full},
- {0xDA681E06098EC892ull, 0x71AD98355019FED1ull},
- {0xC4FBD72B1F2FC30Bull, 0x327549B6C9FDEDD5ull},
- {0x14F429D1C20F0EB5ull, 0x228B40C92F3FA369ull},
- {0xF5C9535333206D01ull, 0xB6FC46FCCA65F9CCull},
- {0x3049FAD9DB729D2Dull, 0xB84C931C45F781EAull},
- {0x7C6FFE6F3706DC04ull, 0x4F94583806AE3C62ull},
- {0x9EF95EB28BE1CCE0ull, 0xAD9D5B96A0D15BFEull},
- {0x63D0ED54AF2985E6ull, 0xDFAFB1B6485C1B01ull},
- {0xA46C8A2FE498D46Cull, 0xF4DBAEC0FF03BAD6ull},
- {0xED978A0FBB3E5158ull, 0x060D144D57FBE6FDull},
- {0x53F1D80C8922E4E5ull, 0x1324880D932140C9ull},
- {0xDD363B03563870CEull, 0x0DFDB79F4F34184Bull},
- {0x4E702701AE65DB38ull, 0x1B67E0A2E2DBFB04ull},
- {0x240DA388551D0822ull, 0x2FF1BB584AC4BD61ull},
- {0x3FAFB8B7C26499ABull, 0x072516308E889132ull},
- {0x0AB452339406AB22ull, 0x751DBB7FF9472D42ull},
- {0x83BA782DB6EB1186ull, 0x4391544D9318DC29ull},
- {0x25077ECDAAB201E8ull, 0x695E0E95446D63A2ull},
- {0x1AF0BF12F91F17D4ull, 0x5BB8FF299368D22Cull},
- {0x338C09CBAF701E38ull, 0xA7D24D5E7C06DC78ull},
- {0x5AB58D6555D28B56ull, 0xE781413A9AE1310Full},
- {0xB0281CD10BCA7B89ull, 0xF49873B45C0F7274ull},
- {0x67EEBD6D71E57B06ull, 0x9421CB1DB54EEDDFull},
- {0x00DAB867E37EDA65ull, 0x6477E454191E213Full},
- {0x9AF9C4817C24C82Eull, 0xAE3A73522F311EEBull},
- {0xD8A334E30D23C6E6ull, 0xAF57EF86CCCF12FFull},
- {0x0353A48FC9E139DDull, 0x27D5626170A7DD0Full},
- {0x0DA12E888EB61876ull, 0x67B17DF10CB365CDull},
- {0x967CD764883A5E85ull, 0x570D7C9A774A6AB4ull},
- {0xA8DF13980C81E533ull, 0x9C33FE4797F87F1Aull},
- {0xCABB59F53AE75FF2ull, 0x6D25512E77172E7Aull},
- {0xB24E7F0C7DA62BE7ull, 0x2442F94890F57D89ull},
- {0x7DCBA0A5B9689BBDull, 0x700FC8D13DA4CC60ull},
- {0x1E8E014B97A9F828ull, 0xF858EFCA33E8A502ull},
- {0x4DAF4E31F34D10C7ull, 0x47E382D0A5A8C613ull},
- {0x577CAB4EF626BB28ull, 0xF6ED27E594C5795Full},
- {0x989188C958586C96ull, 0x8B3A2CB0D5B48FD9ull},
- {0x13CC58F5A076C088ull, 0x932A0FD21D4B422Cull},
- {0xD067380DAD885647ull, 0xC1020E396B31BB4Aull},
- {0x47D05A73072758D0ull, 0x5CF6075A0AEB5D78ull},
- {0x54441D7AE94E2D4Eull, 0x3B4F67953ABD3EA4ull},
- {0xEDD4250C3733EEBCull, 0x26E365AA1167C723ull},
- {0x92D02D2A641DA598ull, 0x3DAF5EB24A0C2A94ull},
- {0xAE6CF7FE2D76CA56ull, 0xC7918532A42D2F5Dull},
- {0xAD24762A08D96F1Bull, 0x729083EC59FA8DF7ull}};
-
-const HHResult256 kExpected256[kMaxSize + 1] = {
- {0xC6DC0C823434863Full, 0x6A42CCB644CBFAD9ull, 0x18DEF6A60EA5D873ull,
- 0x3596F663D00D1225ull},
- {0x00518B3D2BD22424ull, 0xE5791619BF612E97ull, 0xF4DAF07017FAF99Dull,
- 0xE36AE62C5509B5D6ull},
- {0x81021CC5067D8526ull, 0xBEEFC1BC87A6911Aull, 0xE2AEC605F80657FEull,
- 0x3C6576B5DF982327ull},
- {0x118D72C0B5DB2C70ull, 0x0BE2E64BF538CA74ull, 0x667B33FE41DDAA74ull,
- 0xB6199539303E13E1ull},
- {0x4AC9B8B2E4FD873Bull, 0xDE0FE265A45FFC97ull, 0x1FC1476F896ADA3Bull,
- 0x7680B4AE30B371E7ull},
- {0x518ABC6B5E88214Full, 0xFD62A05B2B06026Bull, 0x9C978E8B38DBE795ull,
- 0x41412401886FF054ull},
- {0x2DEDEF0832BEA7D9ull, 0x44EFE0AEAB7944FCull, 0x09AA7C9374A1E980ull,
- 0x714DB8B507C507FBull},
- {0x6FA2135DE3D3D3AAull, 0xC0EEA9A890E36156ull, 0xFAC1DB8C817DB095ull,
- 0x7B42789096836327ull},
- {0x27257C518B1FFC5Cull, 0x26CC8E669DA1AB0Full, 0xCD7B17C661A0A680ull,
- 0x31D0A7EC0AA3B9BFull},
- {0xB91869900A1AF26Cull, 0x95B0D74B7FF20B43ull, 0x2A6CABF6F931B575ull,
- 0x69734DC9E66A1965ull},
- {0xDD7DA31F5C4DD30Full, 0x08940D249A0A7B69ull, 0xAE7D3AD1C5EA81F2ull,
- 0x96701DB5C6602B21ull},
- {0x2E4A230847E64687ull, 0xF96176C38E48B038ull, 0x9ED0B88A3026E1BCull,
- 0x9AAB5DCA46FCFE19ull},
- {0x3E5CF04BFBAC2642ull, 0x591A3581001709DFull, 0xA0288F5FA63C10A2ull,
- 0x85B94D3641A2C108ull},
- {0x454A95FAD8901350ull, 0x5546E8E75D2AC833ull, 0xCF5FF2ACB4B5F2C1ull,
- 0x14F314318028D62Eull},
- {0x0DED251FB81F34A9ull, 0xC42111DB31618AA6ull, 0xC1C3352B70B00C5Dull,
- 0xDC8947DBC398F0C2ull},
- {0xC591A100AB4E9E72ull, 0x4CCFD2A7B0D8D911ull, 0x6FEDFDDE1BA3F770ull,
- 0x03E5C5A2F6E708A1ull},
- {0x537C42CC5E7B448Aull, 0xA7343E04249B2231ull, 0x2CB51D697EFE9B6Dull,
- 0x589D83141A699A97ull},
- {0x3F7E6EA60343B870ull, 0x4E27E907E296D4D7ull, 0x87525BF1AABBF794ull,
- 0x6B03C4DC206EC628ull},
- {0x741BA4D7A856E03Cull, 0x3798422CB64C9AFAull, 0xB1D89C9720D33FDDull,
- 0x08DE607FC4E3B5C3ull},
- {0x77D77342C85BA466ull, 0xA01C603C58F6D97Eull, 0x342AF0A7309EA4EAull,
- 0x9C958EB3F6A64B94ull},
- {0x9EDCADDD1FFC763Full, 0xBD9BAA6E9BE936EFull, 0xAAB0F78F1A4A94F7ull,
- 0xE71D9CA601DA4C02ull},
- {0xE3AA0D0A422BF888ull, 0x07734C8173411035ull, 0x8A085019DE545AF6ull,
- 0xBC3C520B1221A779ull},
- {0x16170C02C5E5439Dull, 0x45C6004513BFC174ull, 0x35CF3AD65D225EC8ull,
- 0xE10BAA702D37C90Eull},
- {0x6BD63B47EA43ABC6ull, 0xCC08BE8A651E24C0ull, 0xB564F0FC6FF8998Aull,
- 0x3EE409A34232E589ull},
- {0xD6CEE5574355BB81ull, 0x8E31FF40B271A16Dull, 0xC3ECEDBEEACCCAE9ull,
- 0x19386CD3A23B92E9ull},
- {0x32475E05D248DBB1ull, 0xF2396A122830E72Cull, 0xB88395678C0DB899ull,
- 0x8BD410A22A247066ull},
- {0x0BFA3B3C4775EB43ull, 0x496596C36FB2A200ull, 0xA00F533EF150D7DDull,
- 0xB5D70BBCABB572C4ull},
- {0x932B0ED33ED691B1ull, 0xB58394EDCEA3C53Dull, 0xB935E0786B132755ull,
- 0x3E0998322B3F74BAull},
- {0xE21F2CE1BDD156A7ull, 0x764518A56E1363B5ull, 0x461251D3EC39B93Full,
- 0x33C1FE46C9664CC4ull},
- {0x8ABD3F6184C9CD7Dull, 0x8195816637017FC0ull, 0x284B3E93524765DEull,
- 0x56147BDBA9362D0Eull},
- {0x1F050672342807B6ull, 0x9B0AD1091A83910Dull, 0xF23AD4A58C3B1E21ull,
- 0xCC986EC0BEA16781ull},
- {0x053164DEF96B10CEull, 0x1D5ADA15E36D8F6Cull, 0x06FB43534C0472EFull,
- 0x021C0ED1FDEA0948ull},
- {0xF62BA4C5A665E602ull, 0x490D89FD89430C56ull, 0x18F423BE8A9B7E3Cull,
- 0x769E5DDA4DCAC619ull},
- {0xDABD25FAF07A6684ull, 0xACA85CD21536B927ull, 0xAC05E050B4E3D3D1ull,
- 0xBE427B2475CCD981ull},
- {0x89A2B35A34F89F8Cull, 0x1A0E51B2875D34E6ull, 0xBA573CF45E123919ull,
- 0x1C50815B08F1138Aull},
- {0x3390CCBE60F2AFF7ull, 0xD9E2D245643E79C2ull, 0x1104A78F85D3CDF5ull,
- 0x7E55F38F9C53A58Full},
- {0xC189AE1A9D456C0Eull, 0x06AA4C3D4204A40Full, 0x4B383405A9D451A9ull,
- 0x7EA34CBCAEF0C31Eull},
- {0xB45FA7CC19AE4DDFull, 0x306C418E9BA67420ull, 0xDF16D80D4D48C096ull,
- 0xD3169E50BC8D75CCull},
- {0x5894367013710C89ull, 0xD39EE6D584E76AF3ull, 0x5C55A414BCDDE505ull,
- 0x8FA97D561CB174BFull},
- {0x87355749D59F39DDull, 0x26B8B311E72C50F4ull, 0x1911A8CBCE53E37Bull,
- 0x5C256452C39B95F6ull},
- {0x8B9E87C9ABC82821ull, 0x12A5FC06B69CDC2Dull, 0xF95104FF805E5E1Dull,
- 0xE5D4D2257AD5592Eull},
- {0x5A89242B02E1E048ull, 0x771602AAD1880A7Eull, 0x0F34507608387843ull,
- 0x7AFB45F3EA4F0F24ull},
- {0x3BE3800150FDDE00ull, 0x7871908FF91AD81Aull, 0xA00E07F351BB15C1ull,
- 0x429658E7FD10D11Aull},
- {0x2B2B1A6CD1BA454Cull, 0xF19E8CA5C022308Aull, 0xAEFA0EB6F7C3CF74ull,
- 0x21F4330A5258E7C7ull},
- {0xD1C806622910A9BEull, 0xFE224EF598F541B1ull, 0xB95A435AEC4DD849ull,
- 0xD942A277AB57E68Eull},
- {0x16BF7116E8D2B328ull, 0xB37DC98EA931FC13ull, 0x18E8859A592C8C11ull,
- 0x11590F16C4C61716ull},
- {0xD046122D4C7B24AEull, 0xBD0899DFD7345611ull, 0x91AAECB50DE6DFF9ull,
- 0x6EDC4896BAA90FFAull},
- {0x2FE97B8135EA956Dull, 0xFBA50900FB4EF23Cull, 0x0BC907363F7EA368ull,
- 0xA5C982D3094BCEE2ull},
- {0x247BFB5BA3A0F245ull, 0x6ACBDD4AFFDB03EBull, 0xA4237427D373B619ull,
- 0xFA9C041D302B728Cull},
- {0xF93109909D6B80EFull, 0xD1321A6BEE302794ull, 0xD63E1E7985C458D3ull,
- 0x644CD44F6C6FDE95ull},
- {0xD0522C663FBE65B0ull, 0x78F366F302EA33F5ull, 0xB9ED66D1CB87C891ull,
- 0x0CEB2298BA9D1C1Aull},
- {0x60D60E9B569264E8ull, 0xE34447A5741417EAull, 0x04522108BDF3AFC3ull,
- 0x90F4FE2D585B25FAull},
- {0xAF411662AAB81B12ull, 0x3AD58EBBA1BA2F39ull, 0x73E0E8EB5879E37Dull,
- 0xCE0E8F8F613D3FC5ull},
- {0xCA756CB9E1FDF1C6ull, 0x89731D81712D34BDull, 0xBF520B2D830959C2ull,
- 0xD35ED12BB24CE9EFull},
- {0x5FB2B65ABF038045ull, 0x3F2D32F8532E14D6ull, 0x06443CC95CDD58C8ull,
- 0x30FC6FBE8CCE8EB8ull},
- {0x94A9774F02848D73ull, 0x83F9AFC4C0B48768ull, 0xDB7BF5FBD9B25A26ull,
- 0x7F7D50266FFA639Bull},
- {0x352A775C646259DDull, 0xB2B532B472539832ull, 0x9981AE050A2FB38Cull,
- 0xE13641E804F6DC00ull},
- {0x080E005A04E73352ull, 0x0314F6EA196A210Cull, 0x29EA80869CE307A4ull,
- 0x4FABEB9ADE04BE00ull},
- {0x5674A4A533335ADFull, 0x3C7C0650FF6C585Bull, 0x384E4F8246446812ull,
- 0xAE2DADA5E0EB6D81ull},
- {0xB6CE794A89B0A1F7ull, 0x0DC2B87EC9473CDDull, 0x349A006CA2899C88ull,
- 0x4B411CB7DF6BF33Cull},
- {0xD79BB5606CE6BDAFull, 0x4040EA447818A5C1ull, 0x53D58C5710475284ull,
- 0x3DA8730E092608BAull},
- {0x5900A2DAA12E085Cull, 0x80D490C510C493DDull, 0x4BDF17B0247C8D1Bull,
- 0xA8649490D6CFCE67ull},
- {0xFBDAB07B10180D47ull, 0xED6C196BDC43E292ull, 0xE7D494077FA2791Dull,
- 0xC7108D4FD01BBF85ull},
- {0x4365D6236E6AE467ull, 0xB3D540909D4308A5ull, 0xE38207ABD4588D68ull,
- 0xBBD42849A8C92313ull},
- {0x064DB5FE415126F5ull, 0x248AF8FB29A9C595ull, 0x508633A742B3FFF7ull,
- 0x24CFDCA800C34770ull}};
-
-void RunTests() {
- // TODO(janwas): detect number of cores.
- ThreadPool pool(4);
-
- TargetBits tested = ~0U;
- tested &= VerifyImplementations(kExpected64);
- tested &= VerifyImplementations(kExpected128);
- tested &= VerifyImplementations(kExpected256);
- // Any failure causes immediate exit, so apparently all succeeded.
- HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) {
- printf("%10s: OK\n", TargetName(target));
- });
-
- tested = ~0U;
- tested &= VerifyCat<HHResult64>(&pool);
- tested &= VerifyCat<HHResult128>(&pool);
- tested &= VerifyCat<HHResult256>(&pool);
- HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) {
- printf("%10sCat: OK\n", TargetName(target));
- });
-}
-
-#ifdef HH_GOOGLETEST
-TEST(HighwayhashTest, OutputMatchesExpectations) { RunTests(); }
-#endif
-
-} // namespace
-} // namespace highwayhash
-
-#ifndef HH_GOOGLETEST
-int main(int argc, char* argv[]) {
- highwayhash::RunTests();
- return 0;
-}
-#endif
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Ensures each implementation of HighwayHash returns consistent and unchanging
+// hash values.
+
+#include "highwayhash/highwayhash_test_target.h"
+
+#include <stddef.h>
+#include <atomic>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+#ifdef HH_GOOGLETEST
+#include "testing/base/public/gunit.h"
+#endif
+
+#include "highwayhash/data_parallel.h"
+#include "highwayhash/instruction_sets.h"
+
+// Define to nonzero in order to print the (new) golden outputs.
+#define PRINT_RESULTS 0
+
+namespace highwayhash {
+namespace {
+
+// Known-good outputs are verified for all lengths in [0, 64].
+const size_t kMaxSize = 64;
+
+#if PRINT_RESULTS
+void Print(const HHResult64 result) { printf("0x%016llXull,\n", static_cast<unsigned long long>(result)); }
+
+// For HHResult128/256.
+template <int kNumLanes>
+void Print(const HHResult64 (&result)[kNumLanes]) {
+  printf("{ ");
+  for (int i = 0; i < kNumLanes; ++i) {
+    if (i != 0) {
+      printf(", ");
+    }
+    // Cast for portability: uint64_t is not "unsigned long" on all platforms.
+    printf("0x%016llXull", static_cast<unsigned long long>(result[i]));
+  }
+  printf("},\n");
+#endif // PRINT_RESULTS
+
+// Called when any test fails; exits immediately because one mismatch usually
+// implies many others.
+void OnFailure(const char* target_name, const size_t size) {
+  printf("%s: mismatch at size %zu\n", target_name, size);
+#ifdef HH_GOOGLETEST
+  EXPECT_TRUE(false);
+#endif
+  exit(1);
+}
+
+// Verifies every combination of implementation and input size. Returns which
+// targets were run/verified.
+template <typename Result>
+TargetBits VerifyImplementations(const Result (&known_good)[kMaxSize + 1]) {
+ const HHKey key = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL,
+ 0x1716151413121110ULL, 0x1F1E1D1C1B1A1918ULL};
+
+ TargetBits targets = ~0U;
+
+ // For each test input: empty string, 00, 00 01, ...
+ char in[kMaxSize + 1] = {0};
+ // Fast enough that we don't need a thread pool.
+ for (uint64_t size = 0; size <= kMaxSize; ++size) {
+ in[size] = static_cast<char>(size);
+#if PRINT_RESULTS
+ Result actual;
+ targets &= InstructionSets::Run<HighwayHash>(key, in, size, &actual);
+ Print(actual);
+#else
+ const Result* expected = &known_good[size];
+ targets &= InstructionSets::RunAll<HighwayHashTest>(key, in, size, expected,
+ &OnFailure);
+#endif
+ }
+ return targets;
+}
+
+// Cat
+
+void OnCatFailure(const char* target_name, const size_t size) {
+  printf("%s: Cat mismatch at size %zu\n", target_name, size);
+#ifdef HH_GOOGLETEST
+  EXPECT_TRUE(false);
+#endif
+  exit(1);
+}
+
+// Returns which targets were run/verified.
+template <typename Result>
+TargetBits VerifyCat(ThreadPool* pool) {
+ // Reversed order vs prior test.
+ const HHKey key = {0x1F1E1D1C1B1A1918ULL, 0x1716151413121110ULL,
+ 0x0F0E0D0C0B0A0908ULL, 0x0706050403020100ULL};
+
+ const size_t kMaxSize = 3 * 35;
+ std::vector<char> flat;
+ flat.reserve(kMaxSize);
+ srand(129);
+ for (size_t size = 0; size < kMaxSize; ++size) {
+ flat.push_back(static_cast<char>(rand() & 0xFF));
+ }
+
+ std::atomic<TargetBits> targets{~0U};
+
+ pool->Run(0, kMaxSize, [&key, &flat, &targets](const uint32_t i) {
+ Result dummy;
+ targets.fetch_and(InstructionSets::RunAll<HighwayHashCatTest>(
+ key, flat.data(), i, &dummy, &OnCatFailure));
+ });
+ return targets.load();
+}
+
+const HHResult64 kExpected64[kMaxSize + 1] = {
+ 0x907A56DE22C26E53ull, 0x7EAB43AAC7CDDD78ull, 0xB8D0569AB0B53D62ull,
+ 0x5C6BEFAB8A463D80ull, 0xF205A46893007EDAull, 0x2B8A1668E4A94541ull,
+ 0xBD4CCC325BEFCA6Full, 0x4D02AE1738F59482ull, 0xE1205108E55F3171ull,
+ 0x32D2644EC77A1584ull, 0xF6E10ACDB103A90Bull, 0xC3BBF4615B415C15ull,
+ 0x243CC2040063FA9Cull, 0xA89A58CE65E641FFull, 0x24B031A348455A23ull,
+ 0x40793F86A449F33Bull, 0xCFAB3489F97EB832ull, 0x19FE67D2C8C5C0E2ull,
+ 0x04DD90A69C565CC2ull, 0x75D9518E2371C504ull, 0x38AD9B1141D3DD16ull,
+ 0x0264432CCD8A70E0ull, 0xA9DB5A6288683390ull, 0xD7B05492003F028Cull,
+ 0x205F615AEA59E51Eull, 0xEEE0C89621052884ull, 0x1BFC1A93A7284F4Full,
+ 0x512175B5B70DA91Dull, 0xF71F8976A0A2C639ull, 0xAE093FEF1F84E3E7ull,
+ 0x22CA92B01161860Full, 0x9FC7007CCF035A68ull, 0xA0C964D9ECD580FCull,
+ 0x2C90F73CA03181FCull, 0x185CF84E5691EB9Eull, 0x4FC1F5EF2752AA9Bull,
+ 0xF5B7391A5E0A33EBull, 0xB9B84B83B4E96C9Cull, 0x5E42FE712A5CD9B4ull,
+ 0xA150F2F90C3F97DCull, 0x7FA522D75E2D637Dull, 0x181AD0CC0DFFD32Bull,
+ 0x3889ED981E854028ull, 0xFB4297E8C586EE2Dull, 0x6D064A45BB28059Cull,
+ 0x90563609B3EC860Cull, 0x7AA4FCE94097C666ull, 0x1326BAC06B911E08ull,
+ 0xB926168D2B154F34ull, 0x9919848945B1948Dull, 0xA2A98FC534825EBEull,
+ 0xE9809095213EF0B6ull, 0x582E5483707BC0E9ull, 0x086E9414A88A6AF5ull,
+ 0xEE86B98D20F6743Dull, 0xF89B7FF609B1C0A7ull, 0x4C7D9CC19E22C3E8ull,
+ 0x9A97005024562A6Full, 0x5DD41CF423E6EBEFull, 0xDF13609C0468E227ull,
+ 0x6E0DA4F64188155Aull, 0xB755BA4B50D7D4A1ull, 0x887A3484647479BDull,
+ 0xAB8EEBE9BF2139A0ull, 0x75542C5D4CD2A6FFull};
+
+const HHResult128 kExpected128[kMaxSize + 1] = {
+ {0x0679D1E884C28A7Cull, 0x2BCA2547F904748Dull},
+ {0x7F3A39BCC2D897B9ull, 0x4A7E113CA064D91Full},
+ {0x6AB34B92C5AB85BFull, 0xED7AC546689D76C2ull},
+ {0xAC6AF8405A4A7DBEull, 0xD78FB7953256C3E1ull},
+ {0x5A6E8CF789B86448ull, 0x834EF47C1BEDC218ull},
+ {0x8EBFE0B573F425A3ull, 0xBCFCC410CB84325Aull},
+ {0xA1E19717CAB8F1D6ull, 0x2AA50671881F877Dull},
+ {0x0B595302950DA1ECull, 0x46932DE27204B388ull},
+ {0x02FB033F200F89D4ull, 0xFEC3D7BB3B421F92ull},
+ {0x0A5479D46CC1EADEull, 0x0C16A2D5A0F1C3DEull},
+ {0xF759E41DDD621106ull, 0xB43D70116E004750ull},
+ {0x980010BC36A4E98Full, 0x27479317AE00BBD1ull},
+ {0x3BABF3B23761A379ull, 0xACCDC28E0256F326ull},
+ {0x5780CD04269E142Eull, 0xBB70EE3F23BDEDA9ull},
+ {0x4A401F1937E99EC3ull, 0x4B3D1385D6B4E214ull},
+ {0x045C6EDE080E2CB0ull, 0x7327B45D2132DC89ull},
+ {0x97E1624BEB1C1756ull, 0xB7137E1B69D45024ull},
+ {0x31DBA8E3DB0BF012ull, 0x3E66E6A78A729B16ull},
+ {0x34D6DF1B5D8AF2A7ull, 0x4F1A47FCBC39EB55ull},
+ {0xE2C6BE2D47E5DCBCull, 0xD2FF85284E307C1Full},
+ {0xDA681E06098EC892ull, 0x71AD98355019FED1ull},
+ {0xC4FBD72B1F2FC30Bull, 0x327549B6C9FDEDD5ull},
+ {0x14F429D1C20F0EB5ull, 0x228B40C92F3FA369ull},
+ {0xF5C9535333206D01ull, 0xB6FC46FCCA65F9CCull},
+ {0x3049FAD9DB729D2Dull, 0xB84C931C45F781EAull},
+ {0x7C6FFE6F3706DC04ull, 0x4F94583806AE3C62ull},
+ {0x9EF95EB28BE1CCE0ull, 0xAD9D5B96A0D15BFEull},
+ {0x63D0ED54AF2985E6ull, 0xDFAFB1B6485C1B01ull},
+ {0xA46C8A2FE498D46Cull, 0xF4DBAEC0FF03BAD6ull},
+ {0xED978A0FBB3E5158ull, 0x060D144D57FBE6FDull},
+ {0x53F1D80C8922E4E5ull, 0x1324880D932140C9ull},
+ {0xDD363B03563870CEull, 0x0DFDB79F4F34184Bull},
+ {0x4E702701AE65DB38ull, 0x1B67E0A2E2DBFB04ull},
+ {0x240DA388551D0822ull, 0x2FF1BB584AC4BD61ull},
+ {0x3FAFB8B7C26499ABull, 0x072516308E889132ull},
+ {0x0AB452339406AB22ull, 0x751DBB7FF9472D42ull},
+ {0x83BA782DB6EB1186ull, 0x4391544D9318DC29ull},
+ {0x25077ECDAAB201E8ull, 0x695E0E95446D63A2ull},
+ {0x1AF0BF12F91F17D4ull, 0x5BB8FF299368D22Cull},
+ {0x338C09CBAF701E38ull, 0xA7D24D5E7C06DC78ull},
+ {0x5AB58D6555D28B56ull, 0xE781413A9AE1310Full},
+ {0xB0281CD10BCA7B89ull, 0xF49873B45C0F7274ull},
+ {0x67EEBD6D71E57B06ull, 0x9421CB1DB54EEDDFull},
+ {0x00DAB867E37EDA65ull, 0x6477E454191E213Full},
+ {0x9AF9C4817C24C82Eull, 0xAE3A73522F311EEBull},
+ {0xD8A334E30D23C6E6ull, 0xAF57EF86CCCF12FFull},
+ {0x0353A48FC9E139DDull, 0x27D5626170A7DD0Full},
+ {0x0DA12E888EB61876ull, 0x67B17DF10CB365CDull},
+ {0x967CD764883A5E85ull, 0x570D7C9A774A6AB4ull},
+ {0xA8DF13980C81E533ull, 0x9C33FE4797F87F1Aull},
+ {0xCABB59F53AE75FF2ull, 0x6D25512E77172E7Aull},
+ {0xB24E7F0C7DA62BE7ull, 0x2442F94890F57D89ull},
+ {0x7DCBA0A5B9689BBDull, 0x700FC8D13DA4CC60ull},
+ {0x1E8E014B97A9F828ull, 0xF858EFCA33E8A502ull},
+ {0x4DAF4E31F34D10C7ull, 0x47E382D0A5A8C613ull},
+ {0x577CAB4EF626BB28ull, 0xF6ED27E594C5795Full},
+ {0x989188C958586C96ull, 0x8B3A2CB0D5B48FD9ull},
+ {0x13CC58F5A076C088ull, 0x932A0FD21D4B422Cull},
+ {0xD067380DAD885647ull, 0xC1020E396B31BB4Aull},
+ {0x47D05A73072758D0ull, 0x5CF6075A0AEB5D78ull},
+ {0x54441D7AE94E2D4Eull, 0x3B4F67953ABD3EA4ull},
+ {0xEDD4250C3733EEBCull, 0x26E365AA1167C723ull},
+ {0x92D02D2A641DA598ull, 0x3DAF5EB24A0C2A94ull},
+ {0xAE6CF7FE2D76CA56ull, 0xC7918532A42D2F5Dull},
+ {0xAD24762A08D96F1Bull, 0x729083EC59FA8DF7ull}};
+
+const HHResult256 kExpected256[kMaxSize + 1] = {
+ {0xC6DC0C823434863Full, 0x6A42CCB644CBFAD9ull, 0x18DEF6A60EA5D873ull,
+ 0x3596F663D00D1225ull},
+ {0x00518B3D2BD22424ull, 0xE5791619BF612E97ull, 0xF4DAF07017FAF99Dull,
+ 0xE36AE62C5509B5D6ull},
+ {0x81021CC5067D8526ull, 0xBEEFC1BC87A6911Aull, 0xE2AEC605F80657FEull,
+ 0x3C6576B5DF982327ull},
+ {0x118D72C0B5DB2C70ull, 0x0BE2E64BF538CA74ull, 0x667B33FE41DDAA74ull,
+ 0xB6199539303E13E1ull},
+ {0x4AC9B8B2E4FD873Bull, 0xDE0FE265A45FFC97ull, 0x1FC1476F896ADA3Bull,
+ 0x7680B4AE30B371E7ull},
+ {0x518ABC6B5E88214Full, 0xFD62A05B2B06026Bull, 0x9C978E8B38DBE795ull,
+ 0x41412401886FF054ull},
+ {0x2DEDEF0832BEA7D9ull, 0x44EFE0AEAB7944FCull, 0x09AA7C9374A1E980ull,
+ 0x714DB8B507C507FBull},
+ {0x6FA2135DE3D3D3AAull, 0xC0EEA9A890E36156ull, 0xFAC1DB8C817DB095ull,
+ 0x7B42789096836327ull},
+ {0x27257C518B1FFC5Cull, 0x26CC8E669DA1AB0Full, 0xCD7B17C661A0A680ull,
+ 0x31D0A7EC0AA3B9BFull},
+ {0xB91869900A1AF26Cull, 0x95B0D74B7FF20B43ull, 0x2A6CABF6F931B575ull,
+ 0x69734DC9E66A1965ull},
+ {0xDD7DA31F5C4DD30Full, 0x08940D249A0A7B69ull, 0xAE7D3AD1C5EA81F2ull,
+ 0x96701DB5C6602B21ull},
+ {0x2E4A230847E64687ull, 0xF96176C38E48B038ull, 0x9ED0B88A3026E1BCull,
+ 0x9AAB5DCA46FCFE19ull},
+ {0x3E5CF04BFBAC2642ull, 0x591A3581001709DFull, 0xA0288F5FA63C10A2ull,
+ 0x85B94D3641A2C108ull},
+ {0x454A95FAD8901350ull, 0x5546E8E75D2AC833ull, 0xCF5FF2ACB4B5F2C1ull,
+ 0x14F314318028D62Eull},
+ {0x0DED251FB81F34A9ull, 0xC42111DB31618AA6ull, 0xC1C3352B70B00C5Dull,
+ 0xDC8947DBC398F0C2ull},
+ {0xC591A100AB4E9E72ull, 0x4CCFD2A7B0D8D911ull, 0x6FEDFDDE1BA3F770ull,
+ 0x03E5C5A2F6E708A1ull},
+ {0x537C42CC5E7B448Aull, 0xA7343E04249B2231ull, 0x2CB51D697EFE9B6Dull,
+ 0x589D83141A699A97ull},
+ {0x3F7E6EA60343B870ull, 0x4E27E907E296D4D7ull, 0x87525BF1AABBF794ull,
+ 0x6B03C4DC206EC628ull},
+ {0x741BA4D7A856E03Cull, 0x3798422CB64C9AFAull, 0xB1D89C9720D33FDDull,
+ 0x08DE607FC4E3B5C3ull},
+ {0x77D77342C85BA466ull, 0xA01C603C58F6D97Eull, 0x342AF0A7309EA4EAull,
+ 0x9C958EB3F6A64B94ull},
+ {0x9EDCADDD1FFC763Full, 0xBD9BAA6E9BE936EFull, 0xAAB0F78F1A4A94F7ull,
+ 0xE71D9CA601DA4C02ull},
+ {0xE3AA0D0A422BF888ull, 0x07734C8173411035ull, 0x8A085019DE545AF6ull,
+ 0xBC3C520B1221A779ull},
+ {0x16170C02C5E5439Dull, 0x45C6004513BFC174ull, 0x35CF3AD65D225EC8ull,
+ 0xE10BAA702D37C90Eull},
+ {0x6BD63B47EA43ABC6ull, 0xCC08BE8A651E24C0ull, 0xB564F0FC6FF8998Aull,
+ 0x3EE409A34232E589ull},
+ {0xD6CEE5574355BB81ull, 0x8E31FF40B271A16Dull, 0xC3ECEDBEEACCCAE9ull,
+ 0x19386CD3A23B92E9ull},
+ {0x32475E05D248DBB1ull, 0xF2396A122830E72Cull, 0xB88395678C0DB899ull,
+ 0x8BD410A22A247066ull},
+ {0x0BFA3B3C4775EB43ull, 0x496596C36FB2A200ull, 0xA00F533EF150D7DDull,
+ 0xB5D70BBCABB572C4ull},
+ {0x932B0ED33ED691B1ull, 0xB58394EDCEA3C53Dull, 0xB935E0786B132755ull,
+ 0x3E0998322B3F74BAull},
+ {0xE21F2CE1BDD156A7ull, 0x764518A56E1363B5ull, 0x461251D3EC39B93Full,
+ 0x33C1FE46C9664CC4ull},
+ {0x8ABD3F6184C9CD7Dull, 0x8195816637017FC0ull, 0x284B3E93524765DEull,
+ 0x56147BDBA9362D0Eull},
+ {0x1F050672342807B6ull, 0x9B0AD1091A83910Dull, 0xF23AD4A58C3B1E21ull,
+ 0xCC986EC0BEA16781ull},
+ {0x053164DEF96B10CEull, 0x1D5ADA15E36D8F6Cull, 0x06FB43534C0472EFull,
+ 0x021C0ED1FDEA0948ull},
+ {0xF62BA4C5A665E602ull, 0x490D89FD89430C56ull, 0x18F423BE8A9B7E3Cull,
+ 0x769E5DDA4DCAC619ull},
+ {0xDABD25FAF07A6684ull, 0xACA85CD21536B927ull, 0xAC05E050B4E3D3D1ull,
+ 0xBE427B2475CCD981ull},
+ {0x89A2B35A34F89F8Cull, 0x1A0E51B2875D34E6ull, 0xBA573CF45E123919ull,
+ 0x1C50815B08F1138Aull},
+ {0x3390CCBE60F2AFF7ull, 0xD9E2D245643E79C2ull, 0x1104A78F85D3CDF5ull,
+ 0x7E55F38F9C53A58Full},
+ {0xC189AE1A9D456C0Eull, 0x06AA4C3D4204A40Full, 0x4B383405A9D451A9ull,
+ 0x7EA34CBCAEF0C31Eull},
+ {0xB45FA7CC19AE4DDFull, 0x306C418E9BA67420ull, 0xDF16D80D4D48C096ull,
+ 0xD3169E50BC8D75CCull},
+ {0x5894367013710C89ull, 0xD39EE6D584E76AF3ull, 0x5C55A414BCDDE505ull,
+ 0x8FA97D561CB174BFull},
+ {0x87355749D59F39DDull, 0x26B8B311E72C50F4ull, 0x1911A8CBCE53E37Bull,
+ 0x5C256452C39B95F6ull},
+ {0x8B9E87C9ABC82821ull, 0x12A5FC06B69CDC2Dull, 0xF95104FF805E5E1Dull,
+ 0xE5D4D2257AD5592Eull},
+ {0x5A89242B02E1E048ull, 0x771602AAD1880A7Eull, 0x0F34507608387843ull,
+ 0x7AFB45F3EA4F0F24ull},
+ {0x3BE3800150FDDE00ull, 0x7871908FF91AD81Aull, 0xA00E07F351BB15C1ull,
+ 0x429658E7FD10D11Aull},
+ {0x2B2B1A6CD1BA454Cull, 0xF19E8CA5C022308Aull, 0xAEFA0EB6F7C3CF74ull,
+ 0x21F4330A5258E7C7ull},
+ {0xD1C806622910A9BEull, 0xFE224EF598F541B1ull, 0xB95A435AEC4DD849ull,
+ 0xD942A277AB57E68Eull},
+ {0x16BF7116E8D2B328ull, 0xB37DC98EA931FC13ull, 0x18E8859A592C8C11ull,
+ 0x11590F16C4C61716ull},
+ {0xD046122D4C7B24AEull, 0xBD0899DFD7345611ull, 0x91AAECB50DE6DFF9ull,
+ 0x6EDC4896BAA90FFAull},
+ {0x2FE97B8135EA956Dull, 0xFBA50900FB4EF23Cull, 0x0BC907363F7EA368ull,
+ 0xA5C982D3094BCEE2ull},
+ {0x247BFB5BA3A0F245ull, 0x6ACBDD4AFFDB03EBull, 0xA4237427D373B619ull,
+ 0xFA9C041D302B728Cull},
+ {0xF93109909D6B80EFull, 0xD1321A6BEE302794ull, 0xD63E1E7985C458D3ull,
+ 0x644CD44F6C6FDE95ull},
+ {0xD0522C663FBE65B0ull, 0x78F366F302EA33F5ull, 0xB9ED66D1CB87C891ull,
+ 0x0CEB2298BA9D1C1Aull},
+ {0x60D60E9B569264E8ull, 0xE34447A5741417EAull, 0x04522108BDF3AFC3ull,
+ 0x90F4FE2D585B25FAull},
+ {0xAF411662AAB81B12ull, 0x3AD58EBBA1BA2F39ull, 0x73E0E8EB5879E37Dull,
+ 0xCE0E8F8F613D3FC5ull},
+ {0xCA756CB9E1FDF1C6ull, 0x89731D81712D34BDull, 0xBF520B2D830959C2ull,
+ 0xD35ED12BB24CE9EFull},
+ {0x5FB2B65ABF038045ull, 0x3F2D32F8532E14D6ull, 0x06443CC95CDD58C8ull,
+ 0x30FC6FBE8CCE8EB8ull},
+ {0x94A9774F02848D73ull, 0x83F9AFC4C0B48768ull, 0xDB7BF5FBD9B25A26ull,
+ 0x7F7D50266FFA639Bull},
+ {0x352A775C646259DDull, 0xB2B532B472539832ull, 0x9981AE050A2FB38Cull,
+ 0xE13641E804F6DC00ull},
+ {0x080E005A04E73352ull, 0x0314F6EA196A210Cull, 0x29EA80869CE307A4ull,
+ 0x4FABEB9ADE04BE00ull},
+ {0x5674A4A533335ADFull, 0x3C7C0650FF6C585Bull, 0x384E4F8246446812ull,
+ 0xAE2DADA5E0EB6D81ull},
+ {0xB6CE794A89B0A1F7ull, 0x0DC2B87EC9473CDDull, 0x349A006CA2899C88ull,
+ 0x4B411CB7DF6BF33Cull},
+ {0xD79BB5606CE6BDAFull, 0x4040EA447818A5C1ull, 0x53D58C5710475284ull,
+ 0x3DA8730E092608BAull},
+ {0x5900A2DAA12E085Cull, 0x80D490C510C493DDull, 0x4BDF17B0247C8D1Bull,
+ 0xA8649490D6CFCE67ull},
+ {0xFBDAB07B10180D47ull, 0xED6C196BDC43E292ull, 0xE7D494077FA2791Dull,
+ 0xC7108D4FD01BBF85ull},
+ {0x4365D6236E6AE467ull, 0xB3D540909D4308A5ull, 0xE38207ABD4588D68ull,
+ 0xBBD42849A8C92313ull},
+ {0x064DB5FE415126F5ull, 0x248AF8FB29A9C595ull, 0x508633A742B3FFF7ull,
+ 0x24CFDCA800C34770ull}};
+
+void RunTests() {
+ // TODO(janwas): detect number of cores.
+ ThreadPool pool(4);
+
+ TargetBits tested = ~0U;
+ tested &= VerifyImplementations(kExpected64);
+ tested &= VerifyImplementations(kExpected128);
+ tested &= VerifyImplementations(kExpected256);
+ // Any failure causes immediate exit, so apparently all succeeded.
+ HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) {
+ printf("%10s: OK\n", TargetName(target));
+ });
+
+ tested = ~0U;
+ tested &= VerifyCat<HHResult64>(&pool);
+ tested &= VerifyCat<HHResult128>(&pool);
+ tested &= VerifyCat<HHResult256>(&pool);
+ HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) {
+ printf("%10sCat: OK\n", TargetName(target));
+ });
+}
+
+#ifdef HH_GOOGLETEST
+TEST(HighwayhashTest, OutputMatchesExpectations) { RunTests(); }
+#endif
+
+} // namespace
+} // namespace highwayhash
+
+#ifndef HH_GOOGLETEST
+int main(int argc, char* argv[]) {
+ highwayhash::RunTests();
+ return 0;
+}
+#endif
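The invariant that VerifyCat exercises above — appending fragments and finalizing must equal hashing the flattened bytes — can also be stated as a standalone check. A sketch for a single compiled target; CatMatchesFlat is an illustrative name, not part of the test:

#include <stddef.h>
#include "highwayhash/highwayhash.h"

using namespace highwayhash;

// Returns true iff Cat-hashing two fragments matches hashing them flat.
bool CatMatchesFlat(const char* data, const size_t size, const size_t split) {
  const HHKey key = {1, 2, 3, 4};  // illustrative key

  HHResult64 flat;
  HHStateT<HH_TARGET> state(key);
  HighwayHashT(&state, data, size, &flat);

  HHResult64 concatenated;
  HighwayHashCatT<HH_TARGET> cat(key);
  cat.Append(data, split);
  cat.Append(data + split, size - split);
  cat.Finalize(&concatenated);

  return flat == concatenated;
}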
diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_test_avx2.cc b/contrib/libs/highwayhash/highwayhash/highwayhash_test_avx2.cc
index f1efe0b5f0..6e12132e8c 100644
--- a/contrib/libs/highwayhash/highwayhash/highwayhash_test_avx2.cc
+++ b/contrib/libs/highwayhash/highwayhash/highwayhash_test_avx2.cc
@@ -1,19 +1,19 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// WARNING: this is a "restricted" source file; avoid including any headers
-// unless they are also restricted. See arch_specific.h for details.
-
-#define HH_TARGET_NAME AVX2
-#include "highwayhash/highwayhash_test_target.cc"
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME AVX2
+#include "highwayhash/highwayhash_test_target.cc"
diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_test_portable.cc b/contrib/libs/highwayhash/highwayhash/highwayhash_test_portable.cc
index 04930a7e12..e5bee564a7 100644
--- a/contrib/libs/highwayhash/highwayhash/highwayhash_test_portable.cc
+++ b/contrib/libs/highwayhash/highwayhash/highwayhash_test_portable.cc
@@ -1,19 +1,19 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// WARNING: this is a "restricted" source file; avoid including any headers
-// unless they are also restricted. See arch_specific.h for details.
-
-#define HH_TARGET_NAME Portable
-#include "highwayhash/highwayhash_test_target.cc"
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME Portable
+#include "highwayhash/highwayhash_test_target.cc"
diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_test_sse41.cc b/contrib/libs/highwayhash/highwayhash/highwayhash_test_sse41.cc
index 2d6e83d66f..1ae43bcca9 100644
--- a/contrib/libs/highwayhash/highwayhash/highwayhash_test_sse41.cc
+++ b/contrib/libs/highwayhash/highwayhash/highwayhash_test_sse41.cc
@@ -1,19 +1,19 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// WARNING: this is a "restricted" source file; avoid including any headers
-// unless they are also restricted. See arch_specific.h for details.
-
-#define HH_TARGET_NAME SSE41
-#include "highwayhash/highwayhash_test_target.cc"
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME SSE41
+#include "highwayhash/highwayhash_test_target.cc"
diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.cc b/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.cc
index 701c14b927..b00704b83c 100644
--- a/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.cc
+++ b/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.cc
@@ -1,211 +1,211 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// WARNING: this is a "restricted" source file; avoid including any headers
-// unless they are also restricted. See arch_specific.h for details.
-
-#include "highwayhash/highwayhash_test_target.h"
-
-#include "highwayhash/highwayhash.h"
-
-#ifndef HH_DISABLE_TARGET_SPECIFIC
-namespace highwayhash {
-namespace {
-
-void NotifyIfUnequal(const size_t size, const HHResult64& expected,
- const HHResult64& actual, const HHNotify notify) {
- if (expected != actual) {
- (*notify)(TargetName(HH_TARGET), size);
- }
-}
-
-// Overload for HHResult128 or HHResult256 (arrays).
-template <size_t kNumLanes>
-void NotifyIfUnequal(const size_t size, const uint64_t (&expected)[kNumLanes],
- const uint64_t (&actual)[kNumLanes],
- const HHNotify notify) {
- for (size_t i = 0; i < kNumLanes; ++i) {
- if (expected[i] != actual[i]) {
- (*notify)(TargetName(HH_TARGET), size);
- return;
- }
- }
-}
-
-// Shared logic for all HighwayHashTest::operator() overloads.
-template <typename Result>
-void TestHighwayHash(HHStateT<HH_TARGET>* HH_RESTRICT state,
- const char* HH_RESTRICT bytes, const size_t size,
- const Result* expected, const HHNotify notify) {
- Result actual;
- HighwayHashT(state, bytes, size, &actual);
- NotifyIfUnequal(size, *expected, actual, notify);
-}
-
-// Shared logic for all HighwayHashCatTest::operator() overloads.
-template <typename Result>
-void TestHighwayHashCat(const HHKey& key, const char* HH_RESTRICT bytes,
- const size_t size, const Result* expected,
- const HHNotify notify) {
- // Slightly faster to compute the expected prefix hashes only once.
- // Use new instead of vector to avoid headers with inline functions.
- Result* results = new Result[size + 1];
- for (size_t i = 0; i <= size; ++i) {
- HHStateT<HH_TARGET> state_flat(key);
- HighwayHashT(&state_flat, bytes, i, &results[i]);
- }
-
- // Splitting into three fragments/Append should cover all codepaths.
- const size_t max_fragment_size = size / 3;
- for (size_t size1 = 0; size1 < max_fragment_size; ++size1) {
- for (size_t size2 = 0; size2 < max_fragment_size; ++size2) {
- for (size_t size3 = 0; size3 < max_fragment_size; ++size3) {
- HighwayHashCatT<HH_TARGET> cat(key);
- const char* pos = bytes;
- cat.Append(pos, size1);
- pos += size1;
- cat.Append(pos, size2);
- pos += size2;
- cat.Append(pos, size3);
- pos += size3;
-
- Result result_cat;
- cat.Finalize(&result_cat);
-
- const size_t total_size = pos - bytes;
- NotifyIfUnequal(total_size, results[total_size], result_cat, notify);
- }
- }
- }
-
- delete[] results;
-}
-
-} // namespace
-
-template <TargetBits Target>
-void HighwayHashTest<Target>::operator()(const HHKey& key,
- const char* HH_RESTRICT bytes,
- const size_t size,
- const HHResult64* expected,
- const HHNotify notify) const {
- HHStateT<Target> state(key);
- TestHighwayHash(&state, bytes, size, expected, notify);
-}
-
-template <TargetBits Target>
-void HighwayHashTest<Target>::operator()(const HHKey& key,
- const char* HH_RESTRICT bytes,
- const size_t size,
- const HHResult128* expected,
- const HHNotify notify) const {
- HHStateT<Target> state(key);
- TestHighwayHash(&state, bytes, size, expected, notify);
-}
-
-template <TargetBits Target>
-void HighwayHashTest<Target>::operator()(const HHKey& key,
- const char* HH_RESTRICT bytes,
- const size_t size,
- const HHResult256* expected,
- const HHNotify notify) const {
- HHStateT<Target> state(key);
- TestHighwayHash(&state, bytes, size, expected, notify);
-}
-
-template <TargetBits Target>
-void HighwayHashCatTest<Target>::operator()(const HHKey& key,
- const char* HH_RESTRICT bytes,
- const uint64_t size,
- const HHResult64* expected,
- const HHNotify notify) const {
- TestHighwayHashCat(key, bytes, size, expected, notify);
-}
-
-template <TargetBits Target>
-void HighwayHashCatTest<Target>::operator()(const HHKey& key,
- const char* HH_RESTRICT bytes,
- const uint64_t size,
- const HHResult128* expected,
- const HHNotify notify) const {
- TestHighwayHashCat(key, bytes, size, expected, notify);
-}
-
-template <TargetBits Target>
-void HighwayHashCatTest<Target>::operator()(const HHKey& key,
- const char* HH_RESTRICT bytes,
- const uint64_t size,
- const HHResult256* expected,
- const HHNotify notify) const {
- TestHighwayHashCat(key, bytes, size, expected, notify);
-}
-
-// Instantiate for the current target.
-template struct HighwayHashTest<HH_TARGET>;
-template struct HighwayHashCatTest<HH_TARGET>;
-
-//-----------------------------------------------------------------------------
-// benchmark
-
-namespace {
-
-template <TargetBits Target>
-uint64_t RunHighway(const size_t size) {
- static const HHKey key HH_ALIGNAS(32) = {0, 1, 2, 3};
- char in[kMaxBenchmarkInputSize];
- in[0] = static_cast<char>(size & 0xFF);
- HHResult64 result;
- HHStateT<Target> state(key);
- HighwayHashT(&state, in, size, &result);
- return result;
-}
-
-template <TargetBits Target>
-uint64_t RunHighwayCat(const size_t size) {
- static const HHKey key HH_ALIGNAS(32) = {0, 1, 2, 3};
- HH_ALIGNAS(64) HighwayHashCatT<Target> cat(key);
- char in[kMaxBenchmarkInputSize];
- in[0] = static_cast<char>(size & 0xFF);
- const size_t half_size = size / 2;
- cat.Append(in, half_size);
- cat.Append(in + half_size, size - half_size);
- HHResult64 result;
- cat.Finalize(&result);
- return result;
-}
-
-} // namespace
-
-template <TargetBits Target>
-void HighwayHashBenchmark<Target>::operator()(DurationsForInputs* input_map,
- NotifyBenchmark notify,
- void* context) const {
- MeasureDurations(&RunHighway<Target>, input_map);
- notify("HighwayHash", TargetName(Target), input_map, context);
-}
-
-template <TargetBits Target>
-void HighwayHashCatBenchmark<Target>::operator()(DurationsForInputs* input_map,
- NotifyBenchmark notify,
- void* context) const {
- MeasureDurations(&RunHighwayCat<Target>, input_map);
- notify("HighwayHashCat", TargetName(Target), input_map, context);
-}
-
-// Instantiate for the current target.
-template struct HighwayHashBenchmark<HH_TARGET>;
-template struct HighwayHashCatBenchmark<HH_TARGET>;
-
-} // namespace highwayhash
-#endif // HH_DISABLE_TARGET_SPECIFIC
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#include "highwayhash/highwayhash_test_target.h"
+
+#include "highwayhash/highwayhash.h"
+
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+namespace highwayhash {
+namespace {
+
+void NotifyIfUnequal(const size_t size, const HHResult64& expected,
+ const HHResult64& actual, const HHNotify notify) {
+ if (expected != actual) {
+ (*notify)(TargetName(HH_TARGET), size);
+ }
+}
+
+// Overload for HHResult128 or HHResult256 (arrays).
+template <size_t kNumLanes>
+void NotifyIfUnequal(const size_t size, const uint64_t (&expected)[kNumLanes],
+ const uint64_t (&actual)[kNumLanes],
+ const HHNotify notify) {
+ for (size_t i = 0; i < kNumLanes; ++i) {
+ if (expected[i] != actual[i]) {
+ (*notify)(TargetName(HH_TARGET), size);
+ return;
+ }
+ }
+}
+
+// Shared logic for all HighwayHashTest::operator() overloads.
+template <typename Result>
+void TestHighwayHash(HHStateT<HH_TARGET>* HH_RESTRICT state,
+ const char* HH_RESTRICT bytes, const size_t size,
+ const Result* expected, const HHNotify notify) {
+ Result actual;
+ HighwayHashT(state, bytes, size, &actual);
+ NotifyIfUnequal(size, *expected, actual, notify);
+}
+
+// Shared logic for all HighwayHashCatTest::operator() overloads.
+template <typename Result>
+void TestHighwayHashCat(const HHKey& key, const char* HH_RESTRICT bytes,
+ const size_t size, const Result* expected,
+ const HHNotify notify) {
+ // Slightly faster to compute the expected prefix hashes only once.
+ // Use new instead of vector to avoid headers with inline functions.
+ Result* results = new Result[size + 1];
+ for (size_t i = 0; i <= size; ++i) {
+ HHStateT<HH_TARGET> state_flat(key);
+ HighwayHashT(&state_flat, bytes, i, &results[i]);
+ }
+
+ // Splitting into three fragments/Append should cover all codepaths.
+ const size_t max_fragment_size = size / 3;
+ for (size_t size1 = 0; size1 < max_fragment_size; ++size1) {
+ for (size_t size2 = 0; size2 < max_fragment_size; ++size2) {
+ for (size_t size3 = 0; size3 < max_fragment_size; ++size3) {
+ HighwayHashCatT<HH_TARGET> cat(key);
+ const char* pos = bytes;
+ cat.Append(pos, size1);
+ pos += size1;
+ cat.Append(pos, size2);
+ pos += size2;
+ cat.Append(pos, size3);
+ pos += size3;
+
+ Result result_cat;
+ cat.Finalize(&result_cat);
+
+ const size_t total_size = pos - bytes;
+ NotifyIfUnequal(total_size, results[total_size], result_cat, notify);
+ }
+ }
+ }
+
+ delete[] results;
+}
+
+} // namespace
+
+template <TargetBits Target>
+void HighwayHashTest<Target>::operator()(const HHKey& key,
+ const char* HH_RESTRICT bytes,
+ const size_t size,
+ const HHResult64* expected,
+ const HHNotify notify) const {
+ HHStateT<Target> state(key);
+ TestHighwayHash(&state, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashTest<Target>::operator()(const HHKey& key,
+ const char* HH_RESTRICT bytes,
+ const size_t size,
+ const HHResult128* expected,
+ const HHNotify notify) const {
+ HHStateT<Target> state(key);
+ TestHighwayHash(&state, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashTest<Target>::operator()(const HHKey& key,
+ const char* HH_RESTRICT bytes,
+ const size_t size,
+ const HHResult256* expected,
+ const HHNotify notify) const {
+ HHStateT<Target> state(key);
+ TestHighwayHash(&state, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashCatTest<Target>::operator()(const HHKey& key,
+ const char* HH_RESTRICT bytes,
+ const uint64_t size,
+ const HHResult64* expected,
+ const HHNotify notify) const {
+ TestHighwayHashCat(key, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashCatTest<Target>::operator()(const HHKey& key,
+ const char* HH_RESTRICT bytes,
+ const uint64_t size,
+ const HHResult128* expected,
+ const HHNotify notify) const {
+ TestHighwayHashCat(key, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashCatTest<Target>::operator()(const HHKey& key,
+ const char* HH_RESTRICT bytes,
+ const uint64_t size,
+ const HHResult256* expected,
+ const HHNotify notify) const {
+ TestHighwayHashCat(key, bytes, size, expected, notify);
+}
+
+// Instantiate for the current target.
+template struct HighwayHashTest<HH_TARGET>;
+template struct HighwayHashCatTest<HH_TARGET>;
+
+//-----------------------------------------------------------------------------
+// benchmark
+
+namespace {
+
+template <TargetBits Target>
+uint64_t RunHighway(const size_t size) {
+ static const HHKey key HH_ALIGNAS(32) = {0, 1, 2, 3};
+ char in[kMaxBenchmarkInputSize];
+ in[0] = static_cast<char>(size & 0xFF);
+ HHResult64 result;
+ HHStateT<Target> state(key);
+ HighwayHashT(&state, in, size, &result);
+ return result;
+}
+
+template <TargetBits Target>
+uint64_t RunHighwayCat(const size_t size) {
+ static const HHKey key HH_ALIGNAS(32) = {0, 1, 2, 3};
+ HH_ALIGNAS(64) HighwayHashCatT<Target> cat(key);
+ char in[kMaxBenchmarkInputSize];
+ in[0] = static_cast<char>(size & 0xFF);
+ const size_t half_size = size / 2;
+ cat.Append(in, half_size);
+ cat.Append(in + half_size, size - half_size);
+ HHResult64 result;
+ cat.Finalize(&result);
+ return result;
+}
+
+} // namespace
+
+template <TargetBits Target>
+void HighwayHashBenchmark<Target>::operator()(DurationsForInputs* input_map,
+ NotifyBenchmark notify,
+ void* context) const {
+ MeasureDurations(&RunHighway<Target>, input_map);
+ notify("HighwayHash", TargetName(Target), input_map, context);
+}
+
+template <TargetBits Target>
+void HighwayHashCatBenchmark<Target>::operator()(DurationsForInputs* input_map,
+ NotifyBenchmark notify,
+ void* context) const {
+ MeasureDurations(&RunHighwayCat<Target>, input_map);
+ notify("HighwayHashCat", TargetName(Target), input_map, context);
+}
+
+// Instantiate for the current target.
+template struct HighwayHashBenchmark<HH_TARGET>;
+template struct HighwayHashCatBenchmark<HH_TARGET>;
+
+} // namespace highwayhash
+#endif // HH_DISABLE_TARGET_SPECIFIC
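
The test logic above rests on a single invariant: appending fragments through HighwayHashCatT must yield exactly the hash that HighwayHashT computes over the concatenated bytes. A minimal sketch of that invariant, using only APIs visible in this diff (CheckCatMatchesFlat is an illustrative name, not part of the library):

#include <assert.h>
#include <stddef.h>

#include "highwayhash/highwayhash.h"

namespace highwayhash {

void CheckCatMatchesFlat(const char* bytes, const size_t size) {
  static const HHKey key HH_ALIGNAS(32) = {1, 2, 3, 4};

  // One-shot hash of the whole buffer.
  HHResult64 flat;
  HHStateT<HH_TARGET> state(key);
  HighwayHashT(&state, bytes, size, &flat);

  // Streaming hash over two fragments; the split point is arbitrary.
  HHResult64 streamed;
  HighwayHashCatT<HH_TARGET> cat(key);
  const size_t half = size / 2;
  cat.Append(bytes, half);
  cat.Append(bytes + half, size - half);
  cat.Finalize(&streamed);

  assert(flat == streamed);
}

}  // namespace highwayhash
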
diff --git a/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.h b/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.h
index b89695d346..88cca8c168 100644
--- a/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.h
+++ b/contrib/libs/highwayhash/highwayhash/highwayhash_test_target.h
@@ -1,89 +1,89 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
-#define HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
-
-// Tests called by InstructionSets::RunAll, so we can verify all
-// implementations supported by the current CPU.
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include <stddef.h>
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-#include "highwayhash/hh_types.h"
-#include "highwayhash/nanobenchmark.h"
-
-namespace highwayhash {
-
-// Verifies the hash result matches "expected" and calls "notify" if not.
-template <TargetBits Target>
-struct HighwayHashTest {
- void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
- const size_t size, const HHResult64* expected,
- const HHNotify notify) const;
- void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
- const size_t size, const HHResult128* expected,
- const HHNotify notify) const;
- void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
- const size_t size, const HHResult256* expected,
- const HHNotify notify) const;
-};
-
-// For every possible partition of "bytes" into zero to three fragments,
-// verifies HighwayHashCat returns the same result as HighwayHashT of the
-// concatenated fragments, and calls "notify" if not. The value of "expected"
-// is ignored; it is only used for overloading.
-template <TargetBits Target>
-struct HighwayHashCatTest {
- void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
- const uint64_t size, const HHResult64* expected,
- const HHNotify notify) const;
- void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
- const uint64_t size, const HHResult128* expected,
- const HHNotify notify) const;
- void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
- const uint64_t size, const HHResult256* expected,
- const HHNotify notify) const;
-};
-
-// Called by benchmark with prefix, target_name, input_map, context.
-// This function must set input_map->num_items to 0.
-using NotifyBenchmark = void (*)(const char*, const char*, DurationsForInputs*,
- void*);
-
-constexpr size_t kMaxBenchmarkInputSize = 1024;
-
-// Calls "notify" with benchmark results for the input sizes specified by
-// "input_map" (<= kMaxBenchmarkInputSize) plus a "context" parameter.
-template <TargetBits Target>
-struct HighwayHashBenchmark {
- void operator()(DurationsForInputs* input_map, NotifyBenchmark notify,
- void* context) const;
-};
-
-template <TargetBits Target>
-struct HighwayHashCatBenchmark {
- void operator()(DurationsForInputs* input_map, NotifyBenchmark notify,
- void* context) const;
-};
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
+#define HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
+
+// Tests called by InstructionSets::RunAll, so we can verify all
+// implementations supported by the current CPU.
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stddef.h>
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/nanobenchmark.h"
+
+namespace highwayhash {
+
+// Verifies the hash result matches "expected" and calls "notify" if not.
+template <TargetBits Target>
+struct HighwayHashTest {
+ void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+ const size_t size, const HHResult64* expected,
+ const HHNotify notify) const;
+ void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+ const size_t size, const HHResult128* expected,
+ const HHNotify notify) const;
+ void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+ const size_t size, const HHResult256* expected,
+ const HHNotify notify) const;
+};
+
+// For every possible partition of "bytes" into zero to three fragments,
+// verifies HighwayHashCat returns the same result as HighwayHashT of the
+// concatenated fragments, and calls "notify" if not. The value of "expected"
+// is ignored; it is only used for overloading.
+template <TargetBits Target>
+struct HighwayHashCatTest {
+ void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+ const uint64_t size, const HHResult64* expected,
+ const HHNotify notify) const;
+ void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+ const uint64_t size, const HHResult128* expected,
+ const HHNotify notify) const;
+ void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+ const uint64_t size, const HHResult256* expected,
+ const HHNotify notify) const;
+};
+
+// Called by benchmark with prefix, target_name, input_map, context.
+// This function must set input_map->num_items to 0.
+using NotifyBenchmark = void (*)(const char*, const char*, DurationsForInputs*,
+ void*);
+
+constexpr size_t kMaxBenchmarkInputSize = 1024;
+
+// Calls "notify" with benchmark results for the input sizes specified by
+// "input_map" (<= kMaxBenchmarkInputSize) plus a "context" parameter.
+template <TargetBits Target>
+struct HighwayHashBenchmark {
+ void operator()(DurationsForInputs* input_map, NotifyBenchmark notify,
+ void* context) const;
+};
+
+template <TargetBits Target>
+struct HighwayHashCatBenchmark {
+ void operator()(DurationsForInputs* input_map, NotifyBenchmark notify,
+ void* context) const;
+};
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
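
The NotifyBenchmark contract above requires resetting input_map->num_items to 0 because the same DurationsForInputs buffer is reused for each target. A hypothetical callback satisfying that contract (PrintResults is illustrative; PrintMedianAndVariability is the Item member defined in nanobenchmark.cc later in this diff):

#include <stddef.h>
#include <stdio.h>

#include "highwayhash/nanobenchmark.h"

namespace highwayhash {

void PrintResults(const char* caption, const char* target_name,
                  DurationsForInputs* input_map, void* /*context*/) {
  printf("%s %s\n", caption, target_name);
  for (size_t i = 0; i < input_map->num_items; ++i) {
    input_map->items[i].PrintMedianAndVariability();
  }
  input_map->num_items = 0;  // Required so the next target starts fresh.
}

}  // namespace highwayhash

The signature matches NotifyBenchmark, so &PrintResults can be passed directly to HighwayHashBenchmark<Target>::operator().
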
diff --git a/contrib/libs/highwayhash/highwayhash/iaca.h b/contrib/libs/highwayhash/highwayhash/iaca.h
index 80e1013ae0..3a075544d4 100644
--- a/contrib/libs/highwayhash/highwayhash/iaca.h
+++ b/contrib/libs/highwayhash/highwayhash/iaca.h
@@ -1,63 +1,63 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_IACA_H_
-#define HIGHWAYHASH_IACA_H_
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include "highwayhash/compiler_specific.h"
-
-// IACA (Intel's Code Analyzer, go/intel-iaca) analyzes instruction latencies,
-// but only for code between special markers. These functions embed such markers
-// in an executable, but only for reading via IACA - they deliberately trigger
-// a crash if executed to ensure they are removed in normal builds.
-
-// Default off; callers must `#define HH_ENABLE_IACA 1` before including this.
-#ifndef HH_ENABLE_IACA
-#define HH_ENABLE_IACA 0
-#endif
-
-namespace highwayhash {
-
-#if HH_ENABLE_IACA && (HH_GCC_VERSION || HH_CLANG_VERSION)
-
-// Call before the region of interest. Fences hopefully prevent reordering.
-static HH_INLINE void BeginIACA() {
- HH_COMPILER_FENCE;
- asm volatile(
- ".byte 0x0F, 0x0B\n\t" // UD2
- "movl $111, %ebx\n\t"
- ".byte 0x64, 0x67, 0x90\n\t");
- HH_COMPILER_FENCE;
-}
-
-// Call after the region of interest. Fences hopefully prevent reordering.
-static HH_INLINE void EndIACA() {
- HH_COMPILER_FENCE;
- asm volatile(
- "movl $222, %ebx\n\t"
- ".byte 0x64, 0x67, 0x90\n\t"
- ".byte 0x0F, 0x0B\n\t"); // UD2
- HH_COMPILER_FENCE;
-}
-
-#endif
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_IACA_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_IACA_H_
+#define HIGHWAYHASH_IACA_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/compiler_specific.h"
+
+// IACA (Intel's Code Analyzer, go/intel-iaca) analyzes instruction latencies,
+// but only for code between special markers. These functions embed such markers
+// in an executable, but only for reading via IACA - they deliberately trigger
+// a crash if executed to ensure they are removed in normal builds.
+
+// Default off; callers must `#define HH_ENABLE_IACA 1` before including this.
+#ifndef HH_ENABLE_IACA
+#define HH_ENABLE_IACA 0
+#endif
+
+namespace highwayhash {
+
+#if HH_ENABLE_IACA && (HH_GCC_VERSION || HH_CLANG_VERSION)
+
+// Call before the region of interest. Fences hopefully prevent reordering.
+static HH_INLINE void BeginIACA() {
+ HH_COMPILER_FENCE;
+ asm volatile(
+ ".byte 0x0F, 0x0B\n\t" // UD2
+ "movl $111, %ebx\n\t"
+ ".byte 0x64, 0x67, 0x90\n\t");
+ HH_COMPILER_FENCE;
+}
+
+// Call after the region of interest. Fences hopefully prevent reordering.
+static HH_INLINE void EndIACA() {
+ HH_COMPILER_FENCE;
+ asm volatile(
+ "movl $222, %ebx\n\t"
+ ".byte 0x64, 0x67, 0x90\n\t"
+ ".byte 0x0F, 0x0B\n\t"); // UD2
+ HH_COMPILER_FENCE;
+}
+
+#endif
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_IACA_H_
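
For reference, a sketch of how the markers are meant to be used under GCC or Clang: enable them before including the header, bracket the region of interest, then point IACA at the compiled binary instead of executing it (the embedded UD2 instructions would trap). SumForAnalysis is a made-up example function:

#include <stdint.h>

#define HH_ENABLE_IACA 1
#include "highwayhash/iaca.h"

namespace highwayhash {

uint64_t SumForAnalysis(const uint64_t* values, int count) {
  BeginIACA();  // IACA begins analyzing here.
  uint64_t sum = 0;
  for (int i = 0; i < count; ++i) {
    sum += values[i];
  }
  EndIACA();  // IACA stops analyzing here.
  return sum;
}

}  // namespace highwayhash
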
diff --git a/contrib/libs/highwayhash/highwayhash/instruction_sets.cc b/contrib/libs/highwayhash/highwayhash/instruction_sets.cc
index a02e1f81d9..5760cd6303 100644
--- a/contrib/libs/highwayhash/highwayhash/instruction_sets.cc
+++ b/contrib/libs/highwayhash/highwayhash/instruction_sets.cc
@@ -1,141 +1,141 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "highwayhash/instruction_sets.h"
-#include "highwayhash/arch_specific.h"
-
-// Currently there are only specialized targets for X64; other architectures
-// only use HH_TARGET_Portable, in which case Supported() just returns that.
-#if HH_ARCH_X64
-
-#include <atomic>
-
-namespace highwayhash {
-
-namespace {
-
-bool IsBitSet(const uint32_t reg, const int index) {
- return (reg & (1U << index)) != 0;
-}
-
-// Returns the lower 32 bits of extended control register 0.
-// Requires CPU support for "OSXSAVE" (see below).
-uint32_t ReadXCR0() {
-#if HH_MSC_VERSION
- return static_cast<uint32_t>(_xgetbv(0));
-#else
- uint32_t xcr0, xcr0_high;
- const uint32_t index = 0;
- asm volatile(".byte 0x0F, 0x01, 0xD0"
- : "=a"(xcr0), "=d"(xcr0_high)
- : "c"(index));
- return xcr0;
-#endif
-}
-
-// 0 iff not yet initialized by Supported().
-// Not function-local => no compiler-generated locking.
-std::atomic<TargetBits> supported_{0};
-
-// Bits indicating which instruction set extensions are supported.
-enum {
- kBitSSE = 1 << 0,
- kBitSSE2 = 1 << 1,
- kBitSSE3 = 1 << 2,
- kBitSSSE3 = 1 << 3,
- kBitSSE41 = 1 << 4,
- kBitSSE42 = 1 << 5,
- kBitAVX = 1 << 6,
- kBitAVX2 = 1 << 7,
- kBitFMA = 1 << 8,
- kBitLZCNT = 1 << 9,
- kBitBMI = 1 << 10,
- kBitBMI2 = 1 << 11,
-
- kGroupAVX2 = kBitAVX | kBitAVX2 | kBitFMA | kBitLZCNT | kBitBMI | kBitBMI2,
- kGroupSSE41 = kBitSSE | kBitSSE2 | kBitSSE3 | kBitSSSE3 | kBitSSE41
-};
-
-} // namespace
-
-TargetBits InstructionSets::Supported() {
- TargetBits supported = supported_.load(std::memory_order_acquire);
- // Already initialized, return that.
- if (HH_LIKELY(supported)) {
- return supported;
- }
-
- uint32_t flags = 0;
- uint32_t abcd[4];
-
- Cpuid(0, 0, abcd);
- const uint32_t max_level = abcd[0];
-
- // Standard feature flags
- Cpuid(1, 0, abcd);
- flags |= IsBitSet(abcd[3], 25) ? kBitSSE : 0;
- flags |= IsBitSet(abcd[3], 26) ? kBitSSE2 : 0;
- flags |= IsBitSet(abcd[2], 0) ? kBitSSE3 : 0;
- flags |= IsBitSet(abcd[2], 9) ? kBitSSSE3 : 0;
- flags |= IsBitSet(abcd[2], 19) ? kBitSSE41 : 0;
- flags |= IsBitSet(abcd[2], 20) ? kBitSSE42 : 0;
- flags |= IsBitSet(abcd[2], 12) ? kBitFMA : 0;
- flags |= IsBitSet(abcd[2], 28) ? kBitAVX : 0;
- const bool has_osxsave = IsBitSet(abcd[2], 27);
-
- // Extended feature flags
- Cpuid(0x80000001U, 0, abcd);
- flags |= IsBitSet(abcd[2], 5) ? kBitLZCNT : 0;
-
- // Extended features
- if (max_level >= 7) {
- Cpuid(7, 0, abcd);
- flags |= IsBitSet(abcd[1], 3) ? kBitBMI : 0;
- flags |= IsBitSet(abcd[1], 5) ? kBitAVX2 : 0;
- flags |= IsBitSet(abcd[1], 8) ? kBitBMI2 : 0;
- }
-
- // Verify OS support for XSAVE, without which XMM/YMM registers are not
- // preserved across context switches and are not safe to use.
- if (has_osxsave) {
- const uint32_t xcr0 = ReadXCR0();
- // XMM
- if ((xcr0 & 2) == 0) {
- flags &= ~(kBitSSE | kBitSSE2 | kBitSSE3 | kBitSSSE3 | kBitSSE41 |
- kBitSSE42 | kBitAVX | kBitAVX2 | kBitFMA);
- }
- // YMM
- if ((xcr0 & 4) == 0) {
- flags &= ~(kBitAVX | kBitAVX2);
- }
- }
-
- // Also indicates "supported" has been initialized.
- supported = HH_TARGET_Portable;
-
- // Set target bit(s) if all of the group's flags are set.
- if ((flags & kGroupAVX2) == kGroupAVX2) {
- supported |= HH_TARGET_AVX2;
- }
- if ((flags & kGroupSSE41) == kGroupSSE41) {
- supported |= HH_TARGET_SSE41;
- }
-
- supported_.store(supported, std::memory_order_release);
- return supported;
-}
-
-} // namespace highwayhash
-
-#endif // HH_ARCH_X64
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "highwayhash/instruction_sets.h"
+#include "highwayhash/arch_specific.h"
+
+// Currently there are only specialized targets for X64; other architectures
+// only use HH_TARGET_Portable, in which case Supported() just returns that.
+#if HH_ARCH_X64
+
+#include <atomic>
+
+namespace highwayhash {
+
+namespace {
+
+bool IsBitSet(const uint32_t reg, const int index) {
+ return (reg & (1U << index)) != 0;
+}
+
+// Returns the lower 32 bits of extended control register 0.
+// Requires CPU support for "OSXSAVE" (see below).
+uint32_t ReadXCR0() {
+#if HH_MSC_VERSION
+ return static_cast<uint32_t>(_xgetbv(0));
+#else
+ uint32_t xcr0, xcr0_high;
+ const uint32_t index = 0;
+ asm volatile(".byte 0x0F, 0x01, 0xD0"
+ : "=a"(xcr0), "=d"(xcr0_high)
+ : "c"(index));
+ return xcr0;
+#endif
+}
+
+// 0 iff not yet initialized by Supported().
+// Not function-local => no compiler-generated locking.
+std::atomic<TargetBits> supported_{0};
+
+// Bits indicating which instruction set extensions are supported.
+enum {
+ kBitSSE = 1 << 0,
+ kBitSSE2 = 1 << 1,
+ kBitSSE3 = 1 << 2,
+ kBitSSSE3 = 1 << 3,
+ kBitSSE41 = 1 << 4,
+ kBitSSE42 = 1 << 5,
+ kBitAVX = 1 << 6,
+ kBitAVX2 = 1 << 7,
+ kBitFMA = 1 << 8,
+ kBitLZCNT = 1 << 9,
+ kBitBMI = 1 << 10,
+ kBitBMI2 = 1 << 11,
+
+ kGroupAVX2 = kBitAVX | kBitAVX2 | kBitFMA | kBitLZCNT | kBitBMI | kBitBMI2,
+ kGroupSSE41 = kBitSSE | kBitSSE2 | kBitSSE3 | kBitSSSE3 | kBitSSE41
+};
+
+} // namespace
+
+TargetBits InstructionSets::Supported() {
+ TargetBits supported = supported_.load(std::memory_order_acquire);
+ // Already initialized, return that.
+ if (HH_LIKELY(supported)) {
+ return supported;
+ }
+
+ uint32_t flags = 0;
+ uint32_t abcd[4];
+
+ Cpuid(0, 0, abcd);
+ const uint32_t max_level = abcd[0];
+
+ // Standard feature flags
+ Cpuid(1, 0, abcd);
+ flags |= IsBitSet(abcd[3], 25) ? kBitSSE : 0;
+ flags |= IsBitSet(abcd[3], 26) ? kBitSSE2 : 0;
+ flags |= IsBitSet(abcd[2], 0) ? kBitSSE3 : 0;
+ flags |= IsBitSet(abcd[2], 9) ? kBitSSSE3 : 0;
+ flags |= IsBitSet(abcd[2], 19) ? kBitSSE41 : 0;
+ flags |= IsBitSet(abcd[2], 20) ? kBitSSE42 : 0;
+ flags |= IsBitSet(abcd[2], 12) ? kBitFMA : 0;
+ flags |= IsBitSet(abcd[2], 28) ? kBitAVX : 0;
+ const bool has_osxsave = IsBitSet(abcd[2], 27);
+
+ // Extended feature flags
+ Cpuid(0x80000001U, 0, abcd);
+ flags |= IsBitSet(abcd[2], 5) ? kBitLZCNT : 0;
+
+ // Extended features
+ if (max_level >= 7) {
+ Cpuid(7, 0, abcd);
+ flags |= IsBitSet(abcd[1], 3) ? kBitBMI : 0;
+ flags |= IsBitSet(abcd[1], 5) ? kBitAVX2 : 0;
+ flags |= IsBitSet(abcd[1], 8) ? kBitBMI2 : 0;
+ }
+
+ // Verify OS support for XSAVE, without which XMM/YMM registers are not
+ // preserved across context switches and are not safe to use.
+ if (has_osxsave) {
+ const uint32_t xcr0 = ReadXCR0();
+ // XMM
+ if ((xcr0 & 2) == 0) {
+ flags &= ~(kBitSSE | kBitSSE2 | kBitSSE3 | kBitSSSE3 | kBitSSE41 |
+ kBitSSE42 | kBitAVX | kBitAVX2 | kBitFMA);
+ }
+ // YMM
+ if ((xcr0 & 4) == 0) {
+ flags &= ~(kBitAVX | kBitAVX2);
+ }
+ }
+
+ // Also indicates "supported" has been initialized.
+ supported = HH_TARGET_Portable;
+
+ // Set target bit(s) if all of the group's flags are set.
+ if ((flags & kGroupAVX2) == kGroupAVX2) {
+ supported |= HH_TARGET_AVX2;
+ }
+ if ((flags & kGroupSSE41) == kGroupSSE41) {
+ supported |= HH_TARGET_SSE41;
+ }
+
+ supported_.store(supported, std::memory_order_release);
+ return supported;
+}
+
+} // namespace highwayhash
+
+#endif // HH_ARCH_X64
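
Because Supported() returns a bit array, callers can test each HH_TARGET_* bit independently; the portable bit is always present. A small usage sketch (PrintSupportedTargets is illustrative):

#include <stdio.h>

#include "highwayhash/instruction_sets.h"

namespace highwayhash {

void PrintSupportedTargets() {
  const TargetBits bits = InstructionSets::Supported();
  printf("AVX2:  %s\n", (bits & HH_TARGET_AVX2) ? "yes" : "no");
  printf("SSE41: %s\n", (bits & HH_TARGET_SSE41) ? "yes" : "no");
  // HH_TARGET_Portable is guaranteed, so a portable fallback always exists.
}

}  // namespace highwayhash
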
diff --git a/contrib/libs/highwayhash/highwayhash/instruction_sets.h b/contrib/libs/highwayhash/highwayhash/instruction_sets.h
index 88bc1bc374..5d2251b654 100644
--- a/contrib/libs/highwayhash/highwayhash/instruction_sets.h
+++ b/contrib/libs/highwayhash/highwayhash/instruction_sets.h
@@ -1,88 +1,88 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_INSTRUCTION_SETS_H_
-#define HIGHWAYHASH_INSTRUCTION_SETS_H_
-
-// Calls the best specialization of a template supported by the current CPU.
-//
-// Usage: for each dispatch site, declare a Functor template with a 'Target'
-// argument, add a source file defining its operator() and instantiating
-// Functor<HH_TARGET>, add a cc_library_for_targets rule for that source file,
-// and call InstructionSets::Run<Functor>(/*args*/).
-
-#include <utility> // std::forward
-
-#include "highwayhash/arch_specific.h" // HH_TARGET_*
-#include "highwayhash/compiler_specific.h"
-
-namespace highwayhash {
-
-// Detects TargetBits and calls specializations of a user-defined functor.
-class InstructionSets {
- public:
-// Returns bit array of HH_TARGET_* supported by the current CPU.
-// The HH_TARGET_Portable bit is guaranteed to be set.
-#if HH_ARCH_X64
- static TargetBits Supported();
-#else
- static HH_INLINE TargetBits Supported() { return HH_TARGET_Portable; }
-#endif
-
- // Chooses the best available "Target" for the current CPU, runs the
- // corresponding Func<Target>::operator()(args) and returns that Target
- // (a single bit). The overhead of dispatching is low, about 4 cycles, but
- // this should only be called infrequently (e.g. hoist it out of loops).
- template <template <TargetBits> class Func, typename... Args>
- static HH_INLINE TargetBits Run(Args&&... args) {
-#if HH_ARCH_X64
- const TargetBits supported = Supported();
- if (supported & HH_TARGET_AVX2) {
- Func<HH_TARGET_AVX2>()(std::forward<Args>(args)...);
- return HH_TARGET_AVX2;
- }
- if (supported & HH_TARGET_SSE41) {
- Func<HH_TARGET_SSE41>()(std::forward<Args>(args)...);
- return HH_TARGET_SSE41;
- }
-#endif // HH_ARCH_X64
-
- Func<HH_TARGET_Portable>()(std::forward<Args>(args)...);
- return HH_TARGET_Portable;
- }
-
- // Calls Func<Target>::operator()(args) for all Target supported by the
- // current CPU, and returns their HH_TARGET_* bits.
- template <template <TargetBits> class Func, typename... Args>
- static HH_INLINE TargetBits RunAll(Args&&... args) {
-#if HH_ARCH_X64
- const TargetBits supported = Supported();
- if (supported & HH_TARGET_AVX2) {
- Func<HH_TARGET_AVX2>()(std::forward<Args>(args)...);
- }
- if (supported & HH_TARGET_SSE41) {
- Func<HH_TARGET_SSE41>()(std::forward<Args>(args)...);
- }
-#else
- const TargetBits supported = HH_TARGET_Portable;
-#endif // HH_ARCH_X64
-
- Func<HH_TARGET_Portable>()(std::forward<Args>(args)...);
- return supported; // i.e. all that were run
- }
-};
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_INSTRUCTION_SETS_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_INSTRUCTION_SETS_H_
+#define HIGHWAYHASH_INSTRUCTION_SETS_H_
+
+// Calls the best specialization of a template supported by the current CPU.
+//
+// Usage: for each dispatch site, declare a Functor template with a 'Target'
+// argument, add a source file defining its operator() and instantiating
+// Functor<HH_TARGET>, add a cc_library_for_targets rule for that source file,
+// and call InstructionSets::Run<Functor>(/*args*/).
+
+#include <utility> // std::forward
+
+#include "highwayhash/arch_specific.h" // HH_TARGET_*
+#include "highwayhash/compiler_specific.h"
+
+namespace highwayhash {
+
+// Detects TargetBits and calls specializations of a user-defined functor.
+class InstructionSets {
+ public:
+// Returns bit array of HH_TARGET_* supported by the current CPU.
+// The HH_TARGET_Portable bit is guaranteed to be set.
+#if HH_ARCH_X64
+ static TargetBits Supported();
+#else
+ static HH_INLINE TargetBits Supported() { return HH_TARGET_Portable; }
+#endif
+
+ // Chooses the best available "Target" for the current CPU, runs the
+ // corresponding Func<Target>::operator()(args) and returns that Target
+ // (a single bit). The overhead of dispatching is low, about 4 cycles, but
+ // this should only be called infrequently (e.g. hoist it out of loops).
+ template <template <TargetBits> class Func, typename... Args>
+ static HH_INLINE TargetBits Run(Args&&... args) {
+#if HH_ARCH_X64
+ const TargetBits supported = Supported();
+ if (supported & HH_TARGET_AVX2) {
+ Func<HH_TARGET_AVX2>()(std::forward<Args>(args)...);
+ return HH_TARGET_AVX2;
+ }
+ if (supported & HH_TARGET_SSE41) {
+ Func<HH_TARGET_SSE41>()(std::forward<Args>(args)...);
+ return HH_TARGET_SSE41;
+ }
+#endif // HH_ARCH_X64
+
+ Func<HH_TARGET_Portable>()(std::forward<Args>(args)...);
+ return HH_TARGET_Portable;
+ }
+
+ // Calls Func<Target>::operator()(args) for all Target supported by the
+ // current CPU, and returns their HH_TARGET_* bits.
+ template <template <TargetBits> class Func, typename... Args>
+ static HH_INLINE TargetBits RunAll(Args&&... args) {
+#if HH_ARCH_X64
+ const TargetBits supported = Supported();
+ if (supported & HH_TARGET_AVX2) {
+ Func<HH_TARGET_AVX2>()(std::forward<Args>(args)...);
+ }
+ if (supported & HH_TARGET_SSE41) {
+ Func<HH_TARGET_SSE41>()(std::forward<Args>(args)...);
+ }
+#else
+ const TargetBits supported = HH_TARGET_Portable;
+#endif // HH_ARCH_X64
+
+ Func<HH_TARGET_Portable>()(std::forward<Args>(args)...);
+ return supported; // i.e. all that were run
+ }
+};
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_INSTRUCTION_SETS_H_
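
A sketch of the dispatch pattern described in the usage comment above. PrintTargetName is a hypothetical functor; real SIMD code would also need per-target translation units compiled with matching flags, which this toy example sidesteps:

#include <stdio.h>

#include "highwayhash/arch_specific.h"
#include "highwayhash/instruction_sets.h"

namespace highwayhash {

template <TargetBits Target>
struct PrintTargetName {
  void operator()() const {
    printf("Chosen target: %s\n", TargetName(Target));
  }
};

void Dispatch() {
  // Runs the best specialization for this CPU and returns its single bit.
  const TargetBits chosen = InstructionSets::Run<PrintTargetName>();
  (void)chosen;
}

}  // namespace highwayhash
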
diff --git a/contrib/libs/highwayhash/highwayhash/load3.h b/contrib/libs/highwayhash/highwayhash/load3.h
index e226b19520..0bf0da9c4d 100644
--- a/contrib/libs/highwayhash/highwayhash/load3.h
+++ b/contrib/libs/highwayhash/highwayhash/load3.h
@@ -1,144 +1,144 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_HH_LOAD3_H_
-#define HIGHWAYHASH_HH_LOAD3_H_
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-#include "highwayhash/endianess.h"
-
-namespace highwayhash {
-// To prevent ODR violations when including this from multiple translation
-// units (TU) that are compiled with different flags, the contents must reside
-// in a namespace whose name is unique to the TU. NOTE: this behavior is
-// incompatible with precompiled modules and requires textual inclusion instead.
-namespace HH_TARGET_NAME {
-
-// Loads 0 to 3 bytes from a given location using one of several policies.
-// These are potentially faster than 8-bit loads, but require certain additional
-// promises by the caller: that 'out of bounds' memory accesses are allowed,
-// and/or that the bytes may be permuted or duplicated.
-class Load3 {
- public:
- // In increasing order of complexity:
- struct AllowReadBeforeAndReturn {};
- struct AllowReadBefore {};
- struct AllowUnordered {};
- struct AllowNone {};
-
- // Up to 4 preceding bytes may be read and returned along with the 0..3
- // valid bytes. The valid bytes are in little-endian order, except that the
- // preceding bytes occupy the least-significant bytes.
- HH_INLINE uint32_t operator()(AllowReadBeforeAndReturn, const char* from,
- const size_t size_mod4) {
- // It's safe to read before "from", so we can load 32 bits, which is faster
- // than individual byte loads. We assume little-endian byte order, so
- // big-endian platforms will need to swap. Type punning can generate
- // incorrect code if compiled with strict aliasing; the only safe
- // alternatives are memcpy and reading through char*. We must avoid memcpy
- // because string.h must not be included per the warning above. On GCC and
- // Clang, we can use a builtin instead.
- uint32_t last4;
- Copy(from + size_mod4 - 4, 4, reinterpret_cast<char*>(&last4));
- return host_from_le32(last4);
- }
-
- // As above, but preceding bytes are removed and upper byte(s) are zero.
- HH_INLINE uint64_t operator()(AllowReadBefore, const char* from,
- const size_t size_mod4) {
- // Shift 0..3 valid bytes into LSB as if loaded in little-endian order.
- // 64-bit type enables 32-bit shift when size_mod4 == 0.
- uint64_t last3 = operator()(AllowReadBeforeAndReturn(), from, size_mod4);
- last3 >>= 32 - (size_mod4 * 8);
- return last3;
- }
-
- // The bytes need not be loaded in little-endian order. This particular order
- // (and the duplication of some bytes depending on "size_mod4") was chosen for
- // computational convenience and can no longer be changed because it is part
- // of the HighwayHash length padding definition.
- HH_INLINE uint64_t operator()(AllowUnordered, const char* from,
- const size_t size_mod4) {
- uint64_t last3 = 0;
- // Not allowed to read any bytes; early-out is faster than reading from a
- // constant array of zeros.
- if (size_mod4 == 0) {
- return last3;
- }
-
- // These indices are chosen as an easy-to-compute sequence containing the
- // same elements as [0, size), but repeated and/or reordered. This enables
- // unconditional loads, which outperform conditional 8 or 16+8 bit loads.
- const uint64_t idx0 = 0;
- const uint64_t idx1 = size_mod4 >> 1;
- const uint64_t idx2 = size_mod4 - 1;
- // Store into least significant bytes (avoids one shift).
- last3 = static_cast<uint64_t>(from[idx0]);
- last3 += static_cast<uint64_t>(from[idx1]) << 8;
- last3 += static_cast<uint64_t>(from[idx2]) << 16;
- return last3;
- }
-
- // Must read exactly [0, size) bytes in little-endian order.
- HH_INLINE uint64_t operator()(AllowNone, const char* from,
- const size_t size_mod4) {
- // We need to load in little-endian order without accessing anything outside
- // [from, from + size_mod4). Unrolling is faster than looping backwards.
- uint64_t last3 = 0;
- if (size_mod4 >= 1) {
- last3 += U64FromChar(from[0]);
- }
- if (size_mod4 >= 2) {
- last3 += U64FromChar(from[1]) << 8;
- }
- if (size_mod4 == 3) {
- last3 += U64FromChar(from[2]) << 16;
- }
- return last3;
- }
-
- private:
- static HH_INLINE uint32_t U32FromChar(const char c) {
- return static_cast<uint32_t>(static_cast<unsigned char>(c));
- }
-
- static HH_INLINE uint64_t U64FromChar(const char c) {
- return static_cast<uint64_t>(static_cast<unsigned char>(c));
- }
-
- static HH_INLINE void Copy(const char* HH_RESTRICT from, const size_t size,
- char* HH_RESTRICT to) {
-#if HH_MSC_VERSION
- for (size_t i = 0; i < size; ++i) {
- to[i] = from[i];
- }
-#else
- __builtin_memcpy(to, from, size);
-#endif
- }
-};
-
-} // namespace HH_TARGET_NAME
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_LOAD3_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_LOAD3_H_
+#define HIGHWAYHASH_HH_LOAD3_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/endianess.h"
+
+namespace highwayhash {
+// To prevent ODR violations when including this from multiple translation
+// units (TU) that are compiled with different flags, the contents must reside
+// in a namespace whose name is unique to the TU. NOTE: this behavior is
+// incompatible with precompiled modules and requires textual inclusion instead.
+namespace HH_TARGET_NAME {
+
+// Loads 0 to 3 bytes from a given location using one of several policies.
+// These are potentially faster than 8-bit loads, but require certain additional
+// promises by the caller: that 'out of bounds' memory accesses are allowed,
+// and/or that the bytes may be permuted or duplicated.
+class Load3 {
+ public:
+ // In increasing order of complexity:
+ struct AllowReadBeforeAndReturn {};
+ struct AllowReadBefore {};
+ struct AllowUnordered {};
+ struct AllowNone {};
+
+ // Up to 4 preceding bytes may be read and returned along with the 0..3
+ // valid bytes. The valid bytes are in little-endian order, except that the
+ // preceding bytes occupy the least-significant bytes.
+ HH_INLINE uint32_t operator()(AllowReadBeforeAndReturn, const char* from,
+ const size_t size_mod4) {
+ // It's safe to read before "from", so we can load 32 bits, which is faster
+ // than individual byte loads. We assume little-endian byte order, so
+ // big-endian platforms will need to swap. Type punning can generate
+ // incorrect code if compiled with strict aliasing; the only safe
+ // alternatives are memcpy and reading through char*. We must avoid memcpy
+ // because string.h must not be included per the warning above. On GCC and
+ // Clang, we can use a builtin instead.
+ uint32_t last4;
+ Copy(from + size_mod4 - 4, 4, reinterpret_cast<char*>(&last4));
+ return host_from_le32(last4);
+ }
+
+ // As above, but preceding bytes are removed and upper byte(s) are zero.
+ HH_INLINE uint64_t operator()(AllowReadBefore, const char* from,
+ const size_t size_mod4) {
+ // Shift 0..3 valid bytes into LSB as if loaded in little-endian order.
+ // 64-bit type enables 32-bit shift when size_mod4 == 0.
+ uint64_t last3 = operator()(AllowReadBeforeAndReturn(), from, size_mod4);
+ last3 >>= 32 - (size_mod4 * 8);
+ return last3;
+ }
+
+ // The bytes need not be loaded in little-endian order. This particular order
+ // (and the duplication of some bytes depending on "size_mod4") was chosen for
+ // computational convenience and can no longer be changed because it is part
+ // of the HighwayHash length padding definition.
+ HH_INLINE uint64_t operator()(AllowUnordered, const char* from,
+ const size_t size_mod4) {
+ uint64_t last3 = 0;
+ // Not allowed to read any bytes; early-out is faster than reading from a
+ // constant array of zeros.
+ if (size_mod4 == 0) {
+ return last3;
+ }
+
+ // These indices are chosen as an easy-to-compute sequence containing the
+ // same elements as [0, size), but repeated and/or reordered. This enables
+ // unconditional loads, which outperform conditional 8 or 16+8 bit loads.
+ const uint64_t idx0 = 0;
+ const uint64_t idx1 = size_mod4 >> 1;
+ const uint64_t idx2 = size_mod4 - 1;
+ // Store into least significant bytes (avoids one shift).
+ last3 = static_cast<uint64_t>(from[idx0]);
+ last3 += static_cast<uint64_t>(from[idx1]) << 8;
+ last3 += static_cast<uint64_t>(from[idx2]) << 16;
+ return last3;
+ }
+
+ // Must read exactly [0, size) bytes in little-endian order.
+ HH_INLINE uint64_t operator()(AllowNone, const char* from,
+ const size_t size_mod4) {
+ // We need to load in little-endian order without accessing anything outside
+ // [from, from + size_mod4). Unrolling is faster than looping backwards.
+ uint64_t last3 = 0;
+ if (size_mod4 >= 1) {
+ last3 += U64FromChar(from[0]);
+ }
+ if (size_mod4 >= 2) {
+ last3 += U64FromChar(from[1]) << 8;
+ }
+ if (size_mod4 == 3) {
+ last3 += U64FromChar(from[2]) << 16;
+ }
+ return last3;
+ }
+
+ private:
+ static HH_INLINE uint32_t U32FromChar(const char c) {
+ return static_cast<uint32_t>(static_cast<unsigned char>(c));
+ }
+
+ static HH_INLINE uint64_t U64FromChar(const char c) {
+ return static_cast<uint64_t>(static_cast<unsigned char>(c));
+ }
+
+ static HH_INLINE void Copy(const char* HH_RESTRICT from, const size_t size,
+ char* HH_RESTRICT to) {
+#if HH_MSC_VERSION
+ for (size_t i = 0; i < size; ++i) {
+ to[i] = from[i];
+ }
+#else
+ __builtin_memcpy(to, from, size);
+#endif
+ }
+};
+
+} // namespace HH_TARGET_NAME
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_LOAD3_H_
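
A small sketch of the strictest policy: AllowNone reads exactly size_mod4 bytes and assembles them in little-endian order, so the bytes 0x01 0x02 0x03 produce 0x030201. Load3Example is illustrative only:

#include <assert.h>
#include <stdint.h>

#include "highwayhash/load3.h"

namespace highwayhash {
namespace HH_TARGET_NAME {

void Load3Example() {
  const char bytes[3] = {'\x01', '\x02', '\x03'};
  // Exactly three bytes are read; the result is little-endian.
  const uint64_t v = Load3()(Load3::AllowNone(), bytes, 3);
  assert(v == 0x030201u);
}

}  // namespace HH_TARGET_NAME
}  // namespace highwayhash
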
diff --git a/contrib/libs/highwayhash/highwayhash/nanobenchmark.cc b/contrib/libs/highwayhash/highwayhash/nanobenchmark.cc
index f0ba6ad35b..5929016f6f 100644
--- a/contrib/libs/highwayhash/highwayhash/nanobenchmark.cc
+++ b/contrib/libs/highwayhash/highwayhash/nanobenchmark.cc
@@ -1,437 +1,437 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "highwayhash/nanobenchmark.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdio>
-#include <map>
-#include <random>
-#include <vector>
-
-#include <stddef.h>
-
-#include "highwayhash/os_specific.h"
-#include "highwayhash/robust_statistics.h"
-#include "highwayhash/tsc_timer.h"
-
-namespace highwayhash {
-namespace {
-
-// Enables sanity checks that verify correct operation at the cost of
-// longer benchmark runs.
-#ifndef NANOBENCHMARK_ENABLE_CHECKS
-#define NANOBENCHMARK_ENABLE_CHECKS 0
-#endif
-
-#define NANOBENCHMARK_CHECK_ALWAYS(condition) \
- while (!(condition)) { \
- printf("Nanobenchmark check failed at line %d\n", __LINE__); \
- abort(); \
- }
-
-#if NANOBENCHMARK_ENABLE_CHECKS
-#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
-#else
-#define NANOBENCHMARK_CHECK(condition)
-#endif
-
-#if HH_MSC_VERSION
-
-// MSVC does not support inline assembly anymore (and never supported GCC's
-// RTL constraints used below).
-#pragma optimize("", off)
-// Self-assignment with #pragma optimize("off") might be expected to prevent
-// elision, but it does not with MSVC 2015.
-void UseCharPointer(volatile const char*) {}
-#pragma optimize("", on)
-
-template <class T>
-inline void PreventElision(T&& output) {
- UseCharPointer(reinterpret_cast<volatile const char*>(&output));
-}
-
-#else
-
-// Prevents the compiler from eliding the computations that led to "output".
-// Works by indicating to the compiler that "output" is being read and modified.
-// The +r constraint avoids unnecessary writes to memory, but only works for
-// FuncOutput.
-template <class T>
-inline void PreventElision(T&& output) {
- asm volatile("" : "+r"(output) : : "memory");
-}
-
-#endif
-
-HH_NOINLINE FuncOutput Func1(const FuncInput input) { return input + 1; }
-HH_NOINLINE FuncOutput Func2(const FuncInput input) { return input + 2; }
-
-// Cycles elapsed = difference between two cycle counts. Must be unsigned to
-// ensure wraparound on overflow.
-using Duration = uint32_t;
-
-// Even with high-priority pinned threads and frequency throttling disabled,
-// elapsed times are noisy due to interrupts or SMM operations. It might help
-// to detect such events via transactions and omit affected measurements.
-// Unfortunately, TSX is currently unavailable due to a bug. We achieve
-// repeatable results with a robust measure of the central tendency ("mode").
-
-// Returns time elapsed between timer Start/Stop.
-Duration EstimateResolutionOnCurrentCPU(const Func func) {
- // Even 128K samples are not enough to achieve repeatable results when
- // throttling is enabled; the caller must perform additional aggregation.
- const size_t kNumSamples = 512;
- Duration samples[kNumSamples];
- for (size_t i = 0; i < kNumSamples; ++i) {
- const volatile Duration t0 = Start<Duration>();
- PreventElision(func(i));
- const volatile Duration t1 = Stop<Duration>();
- NANOBENCHMARK_CHECK(t0 <= t1);
- samples[i] = t1 - t0;
- }
- CountingSort(samples, samples + kNumSamples);
- const Duration resolution = Mode(samples, kNumSamples);
- NANOBENCHMARK_CHECK(resolution != 0);
- return resolution;
-}
-
-// Returns mode of EstimateResolutionOnCurrentCPU across all CPUs. This
-// increases repeatability because some CPUs may be throttled or slowed down by
-// interrupts.
-Duration EstimateResolution(const Func func_to_measure) {
- Func func = (func_to_measure == &Func2) ? &Func1 : &Func2;
-
- const size_t kNumSamples = 512;
- std::vector<Duration> resolutions;
- resolutions.reserve(kNumSamples);
-
- const auto cpus = AvailableCPUs();
- const size_t repetitions_per_cpu = kNumSamples / cpus.size();
-
- auto affinity = GetThreadAffinity();
- for (const int cpu : cpus) {
- PinThreadToCPU(cpu);
- for (size_t i = 0; i < repetitions_per_cpu; ++i) {
- resolutions.push_back(EstimateResolutionOnCurrentCPU(func));
- }
- }
- SetThreadAffinity(affinity);
- free(affinity);
-
- Duration* const begin = resolutions.data();
- CountingSort(begin, begin + resolutions.size());
- const Duration resolution = Mode(begin, resolutions.size());
- printf("Resolution %lu\n", long(resolution));
- return resolution;
-}
-
-// Returns cycles elapsed when running an empty region, i.e. the timer
-// resolution/overhead, which will be deducted from other measurements and
-// also used by InitReplicas.
-Duration Resolution(const Func func) {
- // Initialization is expensive and should only happen once.
- static const Duration resolution = EstimateResolution(func);
- return resolution;
-}
-
-// Returns cycles elapsed when passing each of "inputs" (after in-place
-// shuffling) to "func", which must return something it has computed
-// so the compiler does not optimize it away.
-Duration CyclesElapsed(const Duration resolution, const Func func,
- std::vector<FuncInput>* inputs) {
- // This benchmark attempts to measure the performance of "func" when
- // called with realistic inputs, which we assume are randomly drawn
- // from the given "inputs" distribution, so we shuffle those values.
- std::random_shuffle(inputs->begin(), inputs->end());
-
- const Duration t0 = Start<Duration>();
- for (const FuncInput input : *inputs) {
- PreventElision(func(input));
- }
- const Duration t1 = Stop<Duration>();
- const Duration elapsed = t1 - t0;
- NANOBENCHMARK_CHECK(elapsed > resolution);
- return elapsed - resolution;
-}
-
-// Stores input values for a series of calls to the function to measure.
-// We assume inputs are drawn from a known discrete probability distribution,
-// modeled as a vector<FuncInput> v. The probability of a value X
-// in v is count(v.begin(), v.end(), X) / v.size().
-class Inputs {
- Inputs(const Inputs&) = delete;
- Inputs& operator=(const Inputs&) = delete;
-
- public:
- Inputs(const Duration resolution, const std::vector<FuncInput>& distribution,
- const Func func)
- : unique_(InitUnique(distribution)),
- replicas_(InitReplicas(distribution, resolution, func)),
- num_replicas_(replicas_.size() / distribution.size()) {
- printf("NumReplicas %zu\n", num_replicas_);
- }
-
- // Returns vector of the unique values from the input distribution.
- const std::vector<FuncInput>& Unique() const { return unique_; }
-
- // Returns how many instances of "distribution" are in "replicas_", i.e.
- // the number of occurrences of an input value that occurred only once
- // in the distribution. This is the divisor for computing the duration
- // of a single call.
- size_t NumReplicas() const { return num_replicas_; }
-
- // Returns the (replicated) input distribution. Modified by caller
- // (shuffled in-place) => not thread-safe.
- std::vector<FuncInput>& Replicas() { return replicas_; }
-
- // Returns a copy of Replicas() with NumReplicas() occurrences of "input"
- // removed. Used for the leave-one-out measurement.
- std::vector<FuncInput> Without(const FuncInput input_to_remove) const {
- // "input_to_remove" should be in the original distribution.
- NANOBENCHMARK_CHECK(std::find(unique_.begin(), unique_.end(),
- input_to_remove) != unique_.end());
-
- std::vector<FuncInput> copy = replicas_;
- auto pos = std::partition(copy.begin(), copy.end(),
- [input_to_remove](const FuncInput input) {
- return input_to_remove != input;
- });
- // Must occur at least num_replicas_ times.
- NANOBENCHMARK_CHECK(copy.end() - pos >= num_replicas_);
- // (Avoids unused-variable warning.)
- PreventElision(&*pos);
- copy.resize(copy.size() - num_replicas_);
- return copy;
- }
-
- private:
- // Returns a copy with any duplicate values removed. Initializing unique_
- // through this function allows it to be const.
- static std::vector<FuncInput> InitUnique(
- const std::vector<FuncInput>& distribution) {
- std::vector<FuncInput> unique = distribution;
- std::sort(unique.begin(), unique.end());
- unique.erase(std::unique(unique.begin(), unique.end()), unique.end());
- // Our leave-one-out measurement technique only makes sense when
- // there are multiple input values.
- NANOBENCHMARK_CHECK(unique.size() >= 2);
- return unique;
- }
-
- // Appends replicas of "distribution" until CyclesElapsed is large enough
- // relative to the timer resolution, then returns the replicated inputs.
- static std::vector<FuncInput> InitReplicas(
- const std::vector<FuncInput>& distribution, const Duration resolution,
- const Func func) {
- // We compute the difference in duration for inputs = Replicas() vs.
- // Without(). Dividing this by num_replicas must yield a value where the
- // quantization error (from the timer resolution) is sufficiently small.
- const uint64_t min_elapsed = distribution.size() * resolution * 400;
-
- std::vector<FuncInput> replicas;
- for (;;) {
- AppendReplica(distribution, &replicas);
-
-#if NANOBENCHMARK_ENABLE_CHECKS
- const uint64_t t0 = Start64();
-#endif
- const Duration elapsed = CyclesElapsed(resolution, func, &replicas);
-#if NANOBENCHMARK_ENABLE_CHECKS
- const uint64_t t1 = Stop64();
-#endif
- // Ensure the 32-bit timer didn't and won't overflow.
- NANOBENCHMARK_CHECK((t1 - t0) < (1ULL << 30));
-
- if (elapsed >= min_elapsed) {
- return replicas;
- }
- }
- }
-
- // Appends all values in "distribution" to "replicas".
- static void AppendReplica(const std::vector<FuncInput>& distribution,
- std::vector<FuncInput>* replicas) {
- replicas->reserve(replicas->size() + distribution.size());
- for (const FuncInput input : distribution) {
- replicas->push_back(input);
- }
- }
-
- const std::vector<FuncInput> unique_;
-
- // Modified by caller (shuffled in-place) => non-const.
- std::vector<FuncInput> replicas_;
-
- // Initialized from replicas_.
- const size_t num_replicas_;
-};
-
-// Holds samples of measured durations, and (robustly) reduces them to a
-// single result for each unique input value.
-class DurationSamples {
- public:
- DurationSamples(const std::vector<FuncInput>& unique_inputs,
- const size_t num_samples)
- : num_samples_(num_samples) {
- // Preallocate storage.
- for (const FuncInput input : unique_inputs) {
- samples_for_input_[input].reserve(num_samples);
- }
- }
-
- void Add(const FuncInput input, const Duration sample) {
- // "input" should be one of the values passed to the ctor.
- NANOBENCHMARK_CHECK(samples_for_input_.find(input) !=
- samples_for_input_.end());
-
- samples_for_input_[input].push_back(sample);
- }
-
- // Invokes "lambda" for each (input, duration) pair. The per-call duration
- // is the central tendency (the mode) of the samples.
- template <class Lambda>
- void Reduce(const Lambda& lambda) {
- for (auto& input_and_samples : samples_for_input_) {
- const FuncInput input = input_and_samples.first;
- std::vector<Duration>& samples = input_and_samples.second;
-
- NANOBENCHMARK_CHECK(samples.size() <= num_samples_);
- std::sort(samples.begin(), samples.end());
- const Duration duration = Mode(samples.data(), samples.size());
- lambda(input, duration);
- }
- }
-
- private:
- const size_t num_samples_;
- std::map<FuncInput, std::vector<Duration>> samples_for_input_;
-};
-
-// Gathers "num_samples" durations via repeated leave-one-out measurements.
-DurationSamples GatherDurationSamples(const Duration resolution, Inputs& inputs,
- const Func func,
- const size_t num_samples) {
- DurationSamples samples(inputs.Unique(), num_samples);
- for (size_t i = 0; i < num_samples; ++i) {
- // Total duration for all shuffled input values. This may change over time,
- // so recompute it for each sample.
- const Duration total = CyclesElapsed(resolution, func, &inputs.Replicas());
-
- for (const FuncInput input : inputs.Unique()) {
- // To isolate the durations of the calls with this input value,
- // we measure the duration without those values and subtract that
- // from the total, and later divide by NumReplicas.
- std::vector<FuncInput> without = inputs.Without(input);
- for (int rep = 0; rep < 3; ++rep) {
- const Duration elapsed = CyclesElapsed(resolution, func, &without);
- if (elapsed < total) {
- samples.Add(input, total - elapsed);
- break;
- }
- }
- }
- }
- return samples;
-}
-
-} // namespace
-
-DurationsForInputs::DurationsForInputs(const FuncInput* inputs,
- const size_t num_inputs,
- const size_t max_durations)
- : num_items(0),
- inputs_(inputs),
- num_inputs_(num_inputs),
- max_durations_(max_durations),
- all_durations_(new float[num_inputs * max_durations]) {
- NANOBENCHMARK_CHECK(num_inputs != 0);
- NANOBENCHMARK_CHECK(max_durations != 0);
-
- items = new Item[num_inputs];
- for (size_t i = 0; i < num_inputs_; ++i) {
- items[i].input = 0; // initialized later
- items[i].num_durations = 0;
- items[i].durations = all_durations_ + i * max_durations;
- }
-}
-
-DurationsForInputs::~DurationsForInputs() {
- delete[] all_durations_;
- delete[] items;
-}
-
-void DurationsForInputs::AddItem(const FuncInput input, const float sample) {
- for (size_t i = 0; i < num_items; ++i) {
- NANOBENCHMARK_CHECK(items[i].input != input);
- }
- Item& item = items[num_items];
- item.input = input;
- item.num_durations = 1;
- item.durations[0] = sample;
- ++num_items;
-}
-
-void DurationsForInputs::AddSample(const FuncInput input, const float sample) {
- for (size_t i = 0; i < num_items; ++i) {
- Item& item = items[i];
- if (item.input == input) {
- item.durations[item.num_durations] = sample;
- ++item.num_durations;
- return;
- }
- }
- NANOBENCHMARK_CHECK(!"Item not found");
-}
-
-void DurationsForInputs::Item::PrintMedianAndVariability() {
- // Copy so that Median can modify.
- std::vector<float> duration_vec(durations, durations + num_durations);
- const float median = Median(&duration_vec);
- const float variability = MedianAbsoluteDeviation(duration_vec, median);
- printf("%5zu: median=%5.1f cycles; median abs. deviation=%4.1f cycles\n",
- input, median, variability);
-}
-
-void MeasureDurations(const Func func, DurationsForInputs* input_map) {
- const Duration resolution = Resolution(func);
-
- // Adds enough 'replicas' of the distribution to measure "func" given
- // the timer resolution.
- const std::vector<FuncInput> distribution(
- input_map->inputs_, input_map->inputs_ + input_map->num_inputs_);
- Inputs inputs(resolution, distribution, func);
- const double per_call = 1.0 / static_cast<int>(inputs.NumReplicas());
-
- // First iteration: populate input_map items.
- auto samples = GatherDurationSamples(resolution, inputs, func, 512);
- samples.Reduce(
- [per_call, input_map](const FuncInput input, const Duration duration) {
- const float sample = static_cast<float>(duration * per_call);
- input_map->AddItem(input, sample);
- });
-
- // Subsequent iteration(s): append to input_map items' array.
- for (size_t rep = 1; rep < input_map->max_durations_; ++rep) {
- auto samples = GatherDurationSamples(resolution, inputs, func, 512);
- samples.Reduce(
- [per_call, input_map](const FuncInput input, const Duration duration) {
- const float sample = static_cast<float>(duration * per_call);
- input_map->AddSample(input, sample);
- });
- }
-}
-
-} // namespace highwayhash
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "highwayhash/nanobenchmark.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <map>
+#include <random>
+#include <vector>
+
+#include <stddef.h>
+
+#include "highwayhash/os_specific.h"
+#include "highwayhash/robust_statistics.h"
+#include "highwayhash/tsc_timer.h"
+
+namespace highwayhash {
+namespace {
+
+// Enables sanity checks that verify correct operation at the cost of
+// longer benchmark runs.
+#ifndef NANOBENCHMARK_ENABLE_CHECKS
+#define NANOBENCHMARK_ENABLE_CHECKS 0
+#endif
+
+#define NANOBENCHMARK_CHECK_ALWAYS(condition) \
+ while (!(condition)) { \
+ printf("Nanobenchmark check failed at line %d\n", __LINE__); \
+ abort(); \
+ }
+
+#if NANOBENCHMARK_ENABLE_CHECKS
+#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
+#else
+#define NANOBENCHMARK_CHECK(condition)
+#endif
+
+#if HH_MSC_VERSION
+
+// MSVC does not support inline assembly anymore (and never supported GCC's
+// RTL constraints used below).
+#pragma optimize("", off)
+// Self-assignment with #pragma optimize("off") might be expected to prevent
+// elision, but it does not with MSVC 2015.
+void UseCharPointer(volatile const char*) {}
+#pragma optimize("", on)
+
+template <class T>
+inline void PreventElision(T&& output) {
+ UseCharPointer(reinterpret_cast<volatile const char*>(&output));
+}
+
+#else
+
+// Prevents the compiler from eliding the computations that led to "output".
+// Works by indicating to the compiler that "output" is being read and modified.
+// The +r constraint avoids unnecessary writes to memory, but only works for
+// FuncOutput.
+template <class T>
+inline void PreventElision(T&& output) {
+ asm volatile("" : "+r"(output) : : "memory");
+}
+
+#endif
+
+HH_NOINLINE FuncOutput Func1(const FuncInput input) { return input + 1; }
+HH_NOINLINE FuncOutput Func2(const FuncInput input) { return input + 2; }
+
+// Cycles elapsed = difference between two cycle counts. Must be unsigned to
+// ensure wraparound on overflow.
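+// For example (illustrative): if the counter wraps between the two reads,
+// t0 = 0xFFFFFFF0u and t1 = 0x00000010u still give t1 - t0 == 0x20,
+// the true distance.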
+using Duration = uint32_t;
+
+// Even with high-priority pinned threads and frequency throttling disabled,
+// elapsed times are noisy due to interrupts or SMM operations. It might help
+// to detect such events via transactions and omit affected measurements.
+// Unfortunately, TSX is currently unavailable due to a bug. We achieve
+// repeatable results with a robust measure of the central tendency ("mode").
+
+// Returns the mode of the times elapsed between timer Start/Stop, i.e. an
+// estimate of the timer resolution on the current CPU.
+Duration EstimateResolutionOnCurrentCPU(const Func func) {
+ // Even 128K samples are not enough to achieve repeatable results when
+ // throttling is enabled; the caller must perform additional aggregation.
+ const size_t kNumSamples = 512;
+ Duration samples[kNumSamples];
+ for (size_t i = 0; i < kNumSamples; ++i) {
+ const volatile Duration t0 = Start<Duration>();
+ PreventElision(func(i));
+ const volatile Duration t1 = Stop<Duration>();
+ NANOBENCHMARK_CHECK(t0 <= t1);
+ samples[i] = t1 - t0;
+ }
+ CountingSort(samples, samples + kNumSamples);
+ const Duration resolution = Mode(samples, kNumSamples);
+ NANOBENCHMARK_CHECK(resolution != 0);
+ return resolution;
+}
+
+// Returns mode of EstimateResolutionOnCurrentCPU across all CPUs. This
+// increases repeatability because some CPUs may be throttled or slowed down by
+// interrupts.
+Duration EstimateResolution(const Func func_to_measure) {
+ Func func = (func_to_measure == &Func2) ? &Func1 : &Func2;
+
+ const size_t kNumSamples = 512;
+ std::vector<Duration> resolutions;
+ resolutions.reserve(kNumSamples);
+
+ const auto cpus = AvailableCPUs();
+ const size_t repetitions_per_cpu = kNumSamples / cpus.size();
+
+ auto affinity = GetThreadAffinity();
+ for (const int cpu : cpus) {
+ PinThreadToCPU(cpu);
+ for (size_t i = 0; i < repetitions_per_cpu; ++i) {
+ resolutions.push_back(EstimateResolutionOnCurrentCPU(func));
+ }
+ }
+ SetThreadAffinity(affinity);
+ free(affinity);
+
+ Duration* const begin = resolutions.data();
+ CountingSort(begin, begin + resolutions.size());
+ const Duration resolution = Mode(begin, resolutions.size());
+ printf("Resolution %lu\n", long(resolution));
+ return resolution;
+}
+
+// Returns cycles elapsed when running an empty region, i.e. the timer
+// resolution/overhead, which will be deducted from other measurements and
+// also used by InitReplicas.
+Duration Resolution(const Func func) {
+ // Initialization is expensive and should only happen once.
+ static const Duration resolution = EstimateResolution(func);
+ return resolution;
+}
+
+// Returns cycles elapsed when passing each of "inputs" (after in-place
+// shuffling) to "func", which must return something it has computed
+// so the compiler does not optimize it away.
+Duration CyclesElapsed(const Duration resolution, const Func func,
+ std::vector<FuncInput>* inputs) {
+ // This benchmark attempts to measure the performance of "func" when
+ // called with realistic inputs, which we assume are randomly drawn
+ // from the given "inputs" distribution, so we shuffle those values.
+ std::random_shuffle(inputs->begin(), inputs->end());
+
+ const Duration t0 = Start<Duration>();
+ for (const FuncInput input : *inputs) {
+ PreventElision(func(input));
+ }
+ const Duration t1 = Stop<Duration>();
+ const Duration elapsed = t1 - t0;
+ NANOBENCHMARK_CHECK(elapsed > resolution);
+ return elapsed - resolution;
+}
+
+// Stores input values for a series of calls to the function to measure.
+// We assume inputs are drawn from a known discrete probability distribution,
+// modeled as a vector<FuncInput> v. The probability of a value X
+// in v is count(v.begin(), v.end(), X) / v.size().
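+// For example (illustrative): v = {3, 3, 4, 4, 7, 7, 8, 8} assigns each
+// unique value probability 2/8, whereas v = {0, 0, 0, 1} assigns P(0) = 3/4.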
+class Inputs {
+ Inputs(const Inputs&) = delete;
+ Inputs& operator=(const Inputs&) = delete;
+
+ public:
+ Inputs(const Duration resolution, const std::vector<FuncInput>& distribution,
+ const Func func)
+ : unique_(InitUnique(distribution)),
+ replicas_(InitReplicas(distribution, resolution, func)),
+ num_replicas_(replicas_.size() / distribution.size()) {
+ printf("NumReplicas %zu\n", num_replicas_);
+ }
+
+ // Returns vector of the unique values from the input distribution.
+ const std::vector<FuncInput>& Unique() const { return unique_; }
+
+ // Returns how many instances of "distribution" are in "replicas_", i.e.
+ // the number of occurrences of an input value that occurred only once
+ // in the distribution. This is the divisor for computing the duration
+ // of a single call.
+ size_t NumReplicas() const { return num_replicas_; }
+
+ // Returns the (replicated) input distribution. Modified by caller
+ // (shuffled in-place) => not thread-safe.
+ std::vector<FuncInput>& Replicas() { return replicas_; }
+
+ // Returns a copy of Replicas() with NumReplicas() occurrences of "input"
+ // removed. Used for the leave-one-out measurement.
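+  // E.g. (illustrative): if replicas_ == {3, 4, 3, 4} and num_replicas_ == 2,
+  // Without(3) returns {4, 4} (element order is unspecified).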
+ std::vector<FuncInput> Without(const FuncInput input_to_remove) const {
+ // "input_to_remove" should be in the original distribution.
+ NANOBENCHMARK_CHECK(std::find(unique_.begin(), unique_.end(),
+ input_to_remove) != unique_.end());
+
+ std::vector<FuncInput> copy = replicas_;
+ auto pos = std::partition(copy.begin(), copy.end(),
+ [input_to_remove](const FuncInput input) {
+ return input_to_remove != input;
+ });
+ // Must occur at least num_replicas_ times.
+ NANOBENCHMARK_CHECK(copy.end() - pos >= num_replicas_);
+ // (Avoids unused-variable warning.)
+ PreventElision(&*pos);
+ copy.resize(copy.size() - num_replicas_);
+ return copy;
+ }
+
+ private:
+ // Returns a copy with any duplicate values removed. Initializing unique_
+ // through this function allows it to be const.
+ static std::vector<FuncInput> InitUnique(
+ const std::vector<FuncInput>& distribution) {
+ std::vector<FuncInput> unique = distribution;
+ std::sort(unique.begin(), unique.end());
+ unique.erase(std::unique(unique.begin(), unique.end()), unique.end());
+ // Our leave-one-out measurement technique only makes sense when
+ // there are multiple input values.
+ NANOBENCHMARK_CHECK(unique.size() >= 2);
+ return unique;
+ }
+
+ // Returns how many replicas of "distribution" are required before
+ // CyclesElapsed is large enough compared to the timer resolution.
+ static std::vector<FuncInput> InitReplicas(
+ const std::vector<FuncInput>& distribution, const Duration resolution,
+ const Func func) {
+ // We compute the difference in duration for inputs = Replicas() vs.
+ // Without(). Dividing this by num_replicas must yield a value where the
+ // quantization error (from the timer resolution) is sufficiently small.
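+    // E.g. (illustrative): 8 distribution values and a 40-cycle resolution
+    // require elapsed >= 8 * 40 * 400 = 128,000 cycles before returning.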
+ const uint64_t min_elapsed = distribution.size() * resolution * 400;
+
+ std::vector<FuncInput> replicas;
+ for (;;) {
+ AppendReplica(distribution, &replicas);
+
+#if NANOBENCHMARK_ENABLE_CHECKS
+ const uint64_t t0 = Start64();
+#endif
+ const Duration elapsed = CyclesElapsed(resolution, func, &replicas);
+#if NANOBENCHMARK_ENABLE_CHECKS
+ const uint64_t t1 = Stop64();
+#endif
+ // Ensure the 32-bit timer didn't and won't overflow.
+ NANOBENCHMARK_CHECK((t1 - t0) < (1ULL << 30));
+
+ if (elapsed >= min_elapsed) {
+ return replicas;
+ }
+ }
+ }
+
+ // Appends all values in "distribution" to "replicas".
+ static void AppendReplica(const std::vector<FuncInput>& distribution,
+ std::vector<FuncInput>* replicas) {
+ replicas->reserve(replicas->size() + distribution.size());
+ for (const FuncInput input : distribution) {
+ replicas->push_back(input);
+ }
+ }
+
+ const std::vector<FuncInput> unique_;
+
+ // Modified by caller (shuffled in-place) => non-const.
+ std::vector<FuncInput> replicas_;
+
+ // Initialized from replicas_.
+ const size_t num_replicas_;
+};
+
+// Holds samples of measured durations, and (robustly) reduces them to a
+// single result for each unique input value.
+class DurationSamples {
+ public:
+ DurationSamples(const std::vector<FuncInput>& unique_inputs,
+ const size_t num_samples)
+ : num_samples_(num_samples) {
+ // Preallocate storage.
+ for (const FuncInput input : unique_inputs) {
+ samples_for_input_[input].reserve(num_samples);
+ }
+ }
+
+ void Add(const FuncInput input, const Duration sample) {
+ // "input" should be one of the values passed to the ctor.
+ NANOBENCHMARK_CHECK(samples_for_input_.find(input) !=
+ samples_for_input_.end());
+
+ samples_for_input_[input].push_back(sample);
+ }
+
+ // Invokes "lambda" for each (input, duration) pair. The per-call duration
+ // is the central tendency (the mode) of the samples.
+ template <class Lambda>
+ void Reduce(const Lambda& lambda) {
+ for (auto& input_and_samples : samples_for_input_) {
+ const FuncInput input = input_and_samples.first;
+ std::vector<Duration>& samples = input_and_samples.second;
+
+ NANOBENCHMARK_CHECK(samples.size() <= num_samples_);
+ std::sort(samples.begin(), samples.end());
+ const Duration duration = Mode(samples.data(), samples.size());
+ lambda(input, duration);
+ }
+ }
+
+ private:
+ const size_t num_samples_;
+ std::map<FuncInput, std::vector<Duration>> samples_for_input_;
+};
+
+// Gathers "num_samples" durations via repeated leave-one-out measurements.
+DurationSamples GatherDurationSamples(const Duration resolution, Inputs& inputs,
+ const Func func,
+ const size_t num_samples) {
+ DurationSamples samples(inputs.Unique(), num_samples);
+ for (size_t i = 0; i < num_samples; ++i) {
+ // Total duration for all shuffled input values. This may change over time,
+ // so recompute it for each sample.
+ const Duration total = CyclesElapsed(resolution, func, &inputs.Replicas());
+
+ for (const FuncInput input : inputs.Unique()) {
+ // To isolate the durations of the calls with this input value,
+ // we measure the duration without those values and subtract that
+ // from the total, and later divide by NumReplicas.
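+      // E.g. (illustrative): if total == 10,000 cycles, the run without this
+      // input takes 9,200, and NumReplicas() == 4, the per-call estimate is
+      // (10,000 - 9,200) / 4 = 200 cycles.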
+ std::vector<FuncInput> without = inputs.Without(input);
+ for (int rep = 0; rep < 3; ++rep) {
+ const Duration elapsed = CyclesElapsed(resolution, func, &without);
+ if (elapsed < total) {
+ samples.Add(input, total - elapsed);
+ break;
+ }
+ }
+ }
+ }
+ return samples;
+}
+
+} // namespace
+
+DurationsForInputs::DurationsForInputs(const FuncInput* inputs,
+ const size_t num_inputs,
+ const size_t max_durations)
+ : num_items(0),
+ inputs_(inputs),
+ num_inputs_(num_inputs),
+ max_durations_(max_durations),
+ all_durations_(new float[num_inputs * max_durations]) {
+ NANOBENCHMARK_CHECK(num_inputs != 0);
+ NANOBENCHMARK_CHECK(max_durations != 0);
+
+ items = new Item[num_inputs];
+ for (size_t i = 0; i < num_inputs_; ++i) {
+ items[i].input = 0; // initialized later
+ items[i].num_durations = 0;
+ items[i].durations = all_durations_ + i * max_durations;
+ }
+}
+
+DurationsForInputs::~DurationsForInputs() {
+ delete[] all_durations_;
+ delete[] items;
+}
+
+void DurationsForInputs::AddItem(const FuncInput input, const float sample) {
+ for (size_t i = 0; i < num_items; ++i) {
+ NANOBENCHMARK_CHECK(items[i].input != input);
+ }
+ Item& item = items[num_items];
+ item.input = input;
+ item.num_durations = 1;
+ item.durations[0] = sample;
+ ++num_items;
+}
+
+void DurationsForInputs::AddSample(const FuncInput input, const float sample) {
+ for (size_t i = 0; i < num_items; ++i) {
+ Item& item = items[i];
+ if (item.input == input) {
+ item.durations[item.num_durations] = sample;
+ ++item.num_durations;
+ return;
+ }
+ }
+ NANOBENCHMARK_CHECK(!"Item not found");
+}
+
+void DurationsForInputs::Item::PrintMedianAndVariability() {
+ // Copy so that Median can modify.
+ std::vector<float> duration_vec(durations, durations + num_durations);
+ const float median = Median(&duration_vec);
+ const float variability = MedianAbsoluteDeviation(duration_vec, median);
+ printf("%5zu: median=%5.1f cycles; median abs. deviation=%4.1f cycles\n",
+ input, median, variability);
+}
+
+void MeasureDurations(const Func func, DurationsForInputs* input_map) {
+ const Duration resolution = Resolution(func);
+
+ // Adds enough 'replicas' of the distribution to measure "func" given
+ // the timer resolution.
+ const std::vector<FuncInput> distribution(
+ input_map->inputs_, input_map->inputs_ + input_map->num_inputs_);
+ Inputs inputs(resolution, distribution, func);
+ const double per_call = 1.0 / static_cast<int>(inputs.NumReplicas());
+
+ // First iteration: populate input_map items.
+ auto samples = GatherDurationSamples(resolution, inputs, func, 512);
+ samples.Reduce(
+ [per_call, input_map](const FuncInput input, const Duration duration) {
+ const float sample = static_cast<float>(duration * per_call);
+ input_map->AddItem(input, sample);
+ });
+
+ // Subsequent iteration(s): append to input_map items' array.
+ for (size_t rep = 1; rep < input_map->max_durations_; ++rep) {
+ auto samples = GatherDurationSamples(resolution, inputs, func, 512);
+ samples.Reduce(
+ [per_call, input_map](const FuncInput input, const Duration duration) {
+ const float sample = static_cast<float>(duration * per_call);
+ input_map->AddSample(input, sample);
+ });
+ }
+}
+
+} // namespace highwayhash
diff --git a/contrib/libs/highwayhash/highwayhash/nanobenchmark.h b/contrib/libs/highwayhash/highwayhash/nanobenchmark.h
index 1cf6426e0f..ba4ca5a9bb 100644
--- a/contrib/libs/highwayhash/highwayhash/nanobenchmark.h
+++ b/contrib/libs/highwayhash/highwayhash/nanobenchmark.h
@@ -1,158 +1,158 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_NANOBENCHMARK_H_
-#define HIGHWAYHASH_NANOBENCHMARK_H_
-
-// Benchmarks functions of a single integer argument with realistic branch
-// prediction hit rates. Uses a robust estimator to summarize the measurements.
-// Measurements are precise to about 0.2 cycles.
-//
-// Example:
-// #include "highwayhash/nanobenchmark.h"
-// using namespace highwayhash;
-//
-// uint64_t RegionToMeasure(size_t size) {
-// char from[8] = {static_cast<char>(size)};
-// char to[8];
-// memcpy(to, from, size);
-// return to[0];
-// }
-//
-// PinThreadToRandomCPU();
-//
-// static const size_t distribution[] = {3, 3, 4, 4, 7, 7, 8, 8};
-// DurationsForInputs input_map = MakeDurationsForInputs(distribution, 10);
-// MeasureDurations(&RegionToMeasure, &input_map);
-// for (size_t i = 0; i < input_map.num_items; ++i) {
-// input_map.items[i].PrintMedianAndVariability();
-// }
-//
-// Output:
-// 3: median= 25.2 cycles; median abs. deviation= 0.1 cycles
-// 4: median= 13.5 cycles; median abs. deviation= 0.1 cycles
-// 7: median= 13.5 cycles; median abs. deviation= 0.1 cycles
-// 8: median= 27.5 cycles; median abs. deviation= 0.2 cycles
-// (7 is presumably faster because it can use two unaligned 32-bit load/stores.)
-//
-// Background: Microbenchmarks such as http://github.com/google/benchmark
-// can measure elapsed times on the order of a microsecond. Shorter functions
-// are typically measured by repeating them thousands of times and dividing
-// the total elapsed time by this count. Unfortunately, repetition (especially
-// with the same input parameter!) influences the runtime. In time-critical
-// code, it is reasonable to expect warm instruction/data caches and TLBs,
-// but a perfect record of which branches will be taken is unrealistic.
-// Unless the application also repeatedly invokes the measured function with
-// the same parameter, the benchmark is measuring something very different -
-// a best-case result, almost as if the parameter were made a compile-time
-// constant. This may lead to erroneous conclusions about branch-heavy
-// algorithms outperforming branch-free alternatives.
-//
-// Our approach differs in three ways. Adding fences to the timer functions
-// reduces variability due to instruction reordering, improving the timer
-// resolution to about 10 nanoseconds. However, shorter functions must still
-// be invoked repeatedly. For more realistic branch prediction performance,
-// we vary the input parameter according to a user-specified distribution.
-// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
-// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
-// central tendency of the measurement samples with the "half sample mode",
-// which is more robust to outliers and skewed data than the mean or median.
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include <stddef.h>
-#include <stdint.h>
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-
-namespace highwayhash {
-
-// Argument to the function being measured (e.g. number of bytes to copy).
-using FuncInput = size_t;
-
-// "Proof of work" returned by the function to ensure it is not elided.
-using FuncOutput = uint64_t;
-
-// Function to measure (cannot use std::function in a restricted header).
-using Func = FuncOutput (*)(FuncInput);
-
-// Flat map of input -> durations[].
-class DurationsForInputs {
- public:
- struct Item {
- void PrintMedianAndVariability();
-
- FuncInput input; // read-only (set by AddItem).
- size_t num_durations; // written so far: [0, max_durations).
- float* durations; // max_durations entries; points into all_durations.
- };
-
- // "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
- // "func". The values are chosen to maximize coverage of "func". The pointer
- // must remain valid until after MeasureDurations. This represents a
- // distribution, so a value's frequency should reflect its probability in the
- // real application. Order does not matter; for example, a uniform
- // distribution over [0, 4) could be represented as {3,0,2,1}. Repeating each
- // value at least once ensures the leave-one-out distribution is closer to the
- // original distribution, leading to more realistic results.
- //
- // "max_durations" is the number of duration samples to measure for each
- // unique input value. Larger values decrease variability.
- //
- // Runtime is proportional to "num_inputs" * #unique * "max_durations".
- DurationsForInputs(const FuncInput* inputs, const size_t num_inputs,
- const size_t max_durations);
- ~DurationsForInputs();
-
- // Adds an item with the given "input" and "sample". Must only be called once
- // per unique "input" value.
- void AddItem(const FuncInput input, const float sample);
-
- // Adds "sample" to an already existing Item with the given "input".
- void AddSample(const FuncInput input, const float sample);
-
- // Allow direct inspection of items[0..num_items-1] because accessor or
- // ForeachItem functions are unsafe in a restricted header.
- Item* items; // owned by this class, do not allocate/free.
- size_t num_items; // safe to reset to zero.
-
- private:
- friend void MeasureDurations(Func, DurationsForInputs*);
-
- const FuncInput* const inputs_;
- const size_t num_inputs_;
- const size_t max_durations_;
- float* const all_durations_;
-};
-
-// Helper function to detect num_inputs from arrays.
-template <size_t N>
-static HH_INLINE DurationsForInputs MakeDurationsForInputs(
- const FuncInput (&inputs)[N], const size_t max_durations) {
- return DurationsForInputs(&inputs[0], N, max_durations);
-}
-
-// Returns precise measurements of the cycles elapsed when calling "func" with
-// each unique input value in "input_map", taking special care to maintain
-// realistic branch prediction hit rates.
-//
-// "func" returns a 'proof of work' to ensure its computations are not elided.
-void MeasureDurations(const Func func, DurationsForInputs* input_map);
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_NANOBENCHMARK_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_NANOBENCHMARK_H_
+#define HIGHWAYHASH_NANOBENCHMARK_H_
+
+// Benchmarks functions of a single integer argument with realistic branch
+// prediction hit rates. Uses a robust estimator to summarize the measurements.
+// Measurements are precise to about 0.2 cycles.
+//
+// Example:
+// #include "highwayhash/nanobenchmark.h"
+// using namespace highwayhash;
+//
+// uint64_t RegionToMeasure(size_t size) {
+// char from[8] = {static_cast<char>(size)};
+// char to[8];
+// memcpy(to, from, size);
+// return to[0];
+// }
+//
+// PinThreadToRandomCPU();
+//
+// static const size_t distribution[] = {3, 3, 4, 4, 7, 7, 8, 8};
+// DurationsForInputs input_map = MakeDurationsForInputs(distribution, 10);
+// MeasureDurations(&RegionToMeasure, &input_map);
+// for (size_t i = 0; i < input_map.num_items; ++i) {
+// input_map.items[i].PrintMedianAndVariability();
+// }
+//
+// Output:
+// 3: median= 25.2 cycles; median abs. deviation= 0.1 cycles
+// 4: median= 13.5 cycles; median abs. deviation= 0.1 cycles
+// 7: median= 13.5 cycles; median abs. deviation= 0.1 cycles
+// 8: median= 27.5 cycles; median abs. deviation= 0.2 cycles
+// (7 is presumably faster because it can use two unaligned 32-bit load/stores.)
+//
+// Background: Microbenchmarks such as http://github.com/google/benchmark
+// can measure elapsed times on the order of a microsecond. Shorter functions
+// are typically measured by repeating them thousands of times and dividing
+// the total elapsed time by this count. Unfortunately, repetition (especially
+// with the same input parameter!) influences the runtime. In time-critical
+// code, it is reasonable to expect warm instruction/data caches and TLBs,
+// but a perfect record of which branches will be taken is unrealistic.
+// Unless the application also repeatedly invokes the measured function with
+// the same parameter, the benchmark is measuring something very different -
+// a best-case result, almost as if the parameter were made a compile-time
+// constant. This may lead to erroneous conclusions about branch-heavy
+// algorithms outperforming branch-free alternatives.
+//
+// Our approach differs in three ways. Adding fences to the timer functions
+// reduces variability due to instruction reordering, improving the timer
+// resolution to about 10 nanoseconds. However, shorter functions must still
+// be invoked repeatedly. For more realistic branch prediction performance,
+// we vary the input parameter according to a user-specified distribution.
+// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
+// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
+// central tendency of the measurement samples with the "half sample mode",
+// which is more robust to outliers and skewed data than the mean or median.
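+//
+// For intuition, a minimal half-sample-mode sketch (name and details are
+// illustrative; the actual implementation is in robust_statistics.h):
+//   // "x" must be sorted ascending; returns an estimate of the mode.
+//   float HalfSampleMode(const float* x, size_t n) {
+//     while (n > 2) {
+//       const size_t half = (n + 1) / 2;  // keep ceil(n/2) samples
+//       size_t best = 0;                  // narrowest window of that size
+//       for (size_t i = 0; i + half <= n; ++i) {
+//         if (x[i + half - 1] - x[i] < x[best + half - 1] - x[best]) best = i;
+//       }
+//       x += best;  // recurse into the densest half
+//       n = half;
+//     }
+//     return (x[0] + x[n - 1]) / 2;
+//   }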
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stddef.h>
+#include <stdint.h>
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+
+namespace highwayhash {
+
+// Argument to the function being measured (e.g. number of bytes to copy).
+using FuncInput = size_t;
+
+// "Proof of work" returned by the function to ensure it is not elided.
+using FuncOutput = uint64_t;
+
+// Function to measure (cannot use std::function in a restricted header).
+using Func = FuncOutput (*)(FuncInput);
+
+// Flat map of input -> durations[].
+class DurationsForInputs {
+ public:
+ struct Item {
+ void PrintMedianAndVariability();
+
+ FuncInput input; // read-only (set by AddItem).
+ size_t num_durations; // written so far: [0, max_durations).
+ float* durations; // max_durations entries; points into all_durations.
+ };
+
+ // "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
+ // "func". The values are chosen to maximize coverage of "func". The pointer
+ // must remain valid until after MeasureDurations. This represents a
+ // distribution, so a value's frequency should reflect its probability in the
+ // real application. Order does not matter; for example, a uniform
+ // distribution over [0, 4) could be represented as {3,0,2,1}. Repeating each
+ // value at least once ensures the leave-one-out distribution is closer to the
+ // original distribution, leading to more realistic results.
+ //
+ // "max_durations" is the number of duration samples to measure for each
+ // unique input value. Larger values decrease variability.
+ //
+ // Runtime is proportional to "num_inputs" * #unique * "max_durations".
+ DurationsForInputs(const FuncInput* inputs, const size_t num_inputs,
+ const size_t max_durations);
+ ~DurationsForInputs();
+
+ // Adds an item with the given "input" and "sample". Must only be called once
+ // per unique "input" value.
+ void AddItem(const FuncInput input, const float sample);
+
+ // Adds "sample" to an already existing Item with the given "input".
+ void AddSample(const FuncInput input, const float sample);
+
+ // Allow direct inspection of items[0..num_items-1] because accessor or
+ // ForeachItem functions are unsafe in a restricted header.
+ Item* items; // owned by this class, do not allocate/free.
+ size_t num_items; // safe to reset to zero.
+
+ private:
+ friend void MeasureDurations(Func, DurationsForInputs*);
+
+ const FuncInput* const inputs_;
+ const size_t num_inputs_;
+ const size_t max_durations_;
+ float* const all_durations_;
+};
+
+// Helper function to detect num_inputs from arrays.
+template <size_t N>
+static HH_INLINE DurationsForInputs MakeDurationsForInputs(
+ const FuncInput (&inputs)[N], const size_t max_durations) {
+ return DurationsForInputs(&inputs[0], N, max_durations);
+}
+
+// Returns precise measurements of the cycles elapsed when calling "func" with
+// each unique input value in "input_map", taking special care to maintain
+// realistic branch prediction hit rates.
+//
+// "func" returns a 'proof of work' to ensure its computations are not elided.
+void MeasureDurations(const Func func, DurationsForInputs* input_map);
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_NANOBENCHMARK_H_
diff --git a/contrib/libs/highwayhash/highwayhash/nanobenchmark_example.cc b/contrib/libs/highwayhash/highwayhash/nanobenchmark_example.cc
index d95acf144a..f7b2269311 100644
--- a/contrib/libs/highwayhash/highwayhash/nanobenchmark_example.cc
+++ b/contrib/libs/highwayhash/highwayhash/nanobenchmark_example.cc
@@ -1,48 +1,48 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cstddef>
-#include <cstring>
-#include <vector>
-
-#include "highwayhash/nanobenchmark.h"
-#include "highwayhash/os_specific.h"
-
-namespace highwayhash {
-namespace {
-
-uint64_t RegionToMeasure(FuncInput size) {
- char from[8] = {static_cast<char>(size)};
- char to[8];
- memcpy(to, from, size);
- return to[0];
-}
-
-void TestMemcpy() {
- PinThreadToRandomCPU();
- static const size_t distribution[] = {3, 3, 4, 4, 7, 7, 8, 8};
- DurationsForInputs input_map = MakeDurationsForInputs(distribution, 10);
- MeasureDurations(&RegionToMeasure, &input_map);
- for (size_t i = 0; i < input_map.num_items; ++i) {
- input_map.items[i].PrintMedianAndVariability();
- }
-}
-
-} // namespace
-} // namespace highwayhash
-
-int main(int argc, char* argv[]) {
- highwayhash::TestMemcpy();
- return 0;
-}
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstring>
+#include <vector>
+
+#include "highwayhash/nanobenchmark.h"
+#include "highwayhash/os_specific.h"
+
+namespace highwayhash {
+namespace {
+
+uint64_t RegionToMeasure(FuncInput size) {
+ char from[8] = {static_cast<char>(size)};
+ char to[8];
+ memcpy(to, from, size);
+ return to[0];
+}
+
+void TestMemcpy() {
+ PinThreadToRandomCPU();
+ static const size_t distribution[] = {3, 3, 4, 4, 7, 7, 8, 8};
+ DurationsForInputs input_map = MakeDurationsForInputs(distribution, 10);
+ MeasureDurations(&RegionToMeasure, &input_map);
+ for (size_t i = 0; i < input_map.num_items; ++i) {
+ input_map.items[i].PrintMedianAndVariability();
+ }
+}
+
+} // namespace
+} // namespace highwayhash
+
+int main(int argc, char* argv[]) {
+ highwayhash::TestMemcpy();
+ return 0;
+}
diff --git a/contrib/libs/highwayhash/highwayhash/os_specific.cc b/contrib/libs/highwayhash/highwayhash/os_specific.cc
index c28b2c1ae3..5c877bc709 100644
--- a/contrib/libs/highwayhash/highwayhash/os_specific.cc
+++ b/contrib/libs/highwayhash/highwayhash/os_specific.cc
@@ -1,244 +1,244 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "highwayhash/os_specific.h"
-
-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <ctime>
-#include <random>
-
-#include "highwayhash/arch_specific.h"
-
-#if defined(_WIN32) || defined(_WIN64)
-#define OS_WIN 1
-#define NOMINMAX
-#include <windows.h>
-#else
-#define OS_WIN 0
-#endif
-
-#ifdef __linux__
-#define OS_LINUX 1
-#include <sched.h>
-#include <sys/time.h>
-#else
-#define OS_LINUX 0
-#endif
-
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "highwayhash/os_specific.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+#include <random>
+
+#include "highwayhash/arch_specific.h"
+
+#if defined(_WIN32) || defined(_WIN64)
+#define OS_WIN 1
+#define NOMINMAX
+#include <windows.h>
+#else
+#define OS_WIN 0
+#endif
+
+#ifdef __linux__
+#define OS_LINUX 1
+#include <sched.h>
+#include <sys/time.h>
+#else
+#define OS_LINUX 0
+#endif
+
#if defined(__MACH__) || defined(__APPLE__)
-#define OS_MAC 1
-#include <mach/mach.h>
-#include <mach/mach_time.h>
-#else
-#define OS_MAC 0
-#endif
-
-#ifdef __FreeBSD__
-#define OS_FREEBSD 1
-#include <sys/cpuset.h>
-#include <sys/param.h>
-#include <unistd.h>
-#else
-#define OS_FREEBSD 0
-#endif
-
-namespace highwayhash {
-
-#define CHECK(condition) \
- while (!(condition)) { \
- printf("os_specific CHECK failed at line %d\n", __LINE__); \
- abort(); \
- }
-
-double Now() {
-#if OS_WIN
- LARGE_INTEGER counter;
- (void)QueryPerformanceCounter(&counter);
- static const double rcp_freq = []() {
- LARGE_INTEGER freq;
- (void)QueryPerformanceFrequency(&freq);
- return 1.0 / freq.QuadPart;
- }();
- return counter.QuadPart * rcp_freq;
+#define OS_MAC 1
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#else
+#define OS_MAC 0
+#endif
+
+#ifdef __FreeBSD__
+#define OS_FREEBSD 1
+#include <sys/cpuset.h>
+#include <sys/param.h>
+#include <unistd.h>
+#else
+#define OS_FREEBSD 0
+#endif
+
+namespace highwayhash {
+
+#define CHECK(condition) \
+ while (!(condition)) { \
+ printf("os_specific CHECK failed at line %d\n", __LINE__); \
+ abort(); \
+ }
+
+double Now() {
+#if OS_WIN
+ LARGE_INTEGER counter;
+ (void)QueryPerformanceCounter(&counter);
+ static const double rcp_freq = []() {
+ LARGE_INTEGER freq;
+ (void)QueryPerformanceFrequency(&freq);
+ return 1.0 / freq.QuadPart;
+ }();
+ return counter.QuadPart * rcp_freq;
+#elif OS_MAC
+ const auto t = mach_absolute_time();
+  // On OSX/iOS, mach_absolute_time returns ticks in a CPU-dependent time
+  // base; query the time base information to convert ticks to nanoseconds.
+ // See https://developer.apple.com/library/mac/qa/qa1398/_index.html
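+  // (Illustrative: with a time base of e.g. numer == 125 and denom == 3,
+  // a tick count t corresponds to t * 125 / 3 nanoseconds.)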
+ static mach_timebase_info_data_t timebase;
+ if (timebase.denom == 0) {
+ (void)mach_timebase_info(&timebase);
+ }
+ return double(t) * timebase.numer / timebase.denom * 1E-9;
+#else
+ timespec t;
+ clock_gettime(CLOCK_REALTIME, &t);
+ return t.tv_sec + t.tv_nsec * 1E-9;
+#endif
+}
+
+void RaiseThreadPriority() {
+#if OS_WIN
+ BOOL ok = SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
+ CHECK(ok);
+ SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
+ CHECK(ok);
+#elif OS_LINUX
+ // omit: SCHED_RR and SCHED_FIFO with sched_priority max, max-1 and max/2
+ // lead to 2-3x runtime and higher variability!
+#elif OS_FREEBSD
#elif OS_MAC
- const auto t = mach_absolute_time();
-  // On OSX/iOS, mach_absolute_time returns ticks in a CPU-dependent time
-  // base; query the time base information to convert ticks to nanoseconds.
- // See https://developer.apple.com/library/mac/qa/qa1398/_index.html
- static mach_timebase_info_data_t timebase;
- if (timebase.denom == 0) {
- (void)mach_timebase_info(&timebase);
- }
- return double(t) * timebase.numer / timebase.denom * 1E-9;
-#else
- timespec t;
- clock_gettime(CLOCK_REALTIME, &t);
- return t.tv_sec + t.tv_nsec * 1E-9;
-#endif
-}
-
-void RaiseThreadPriority() {
-#if OS_WIN
- BOOL ok = SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
- CHECK(ok);
- SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
- CHECK(ok);
-#elif OS_LINUX
- // omit: SCHED_RR and SCHED_FIFO with sched_priority max, max-1 and max/2
- // lead to 2-3x runtime and higher variability!
-#elif OS_FREEBSD
+#else
+#error "port"
+#endif
+}
+
+struct ThreadAffinity {
+#if OS_WIN
+ DWORD_PTR mask;
+#elif OS_LINUX
+ cpu_set_t set;
+#elif OS_FREEBSD
+ cpuset_t set;
+#endif
+};
+
+ThreadAffinity* GetThreadAffinity() {
+ ThreadAffinity* affinity =
+ static_cast<ThreadAffinity*>(malloc(sizeof(ThreadAffinity)));
+#if OS_WIN
+ DWORD_PTR system_affinity;
+ const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &affinity->mask,
+ &system_affinity);
+ CHECK(ok);
+#elif OS_LINUX
+ const pid_t pid = 0; // current thread
+ const int err = sched_getaffinity(pid, sizeof(cpu_set_t), &affinity->set);
+ CHECK(err == 0);
+#elif OS_FREEBSD
+ const pid_t pid = getpid(); // current thread
+ const int err = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid,
+ sizeof(cpuset_t), &affinity->set);
+ CHECK(err == 0);
+#endif
+ return affinity;
+}
+
+namespace {
+
+ThreadAffinity* OriginalThreadAffinity() {
+ static ThreadAffinity* original = GetThreadAffinity();
+ return original;
+}
+
+} // namespace
+
+void SetThreadAffinity(ThreadAffinity* affinity) {
+ // Ensure original is initialized before changing.
+ const ThreadAffinity* const original = OriginalThreadAffinity();
+ CHECK(original != nullptr);
+
+#if OS_WIN
+ const HANDLE hThread = GetCurrentThread();
+ const DWORD_PTR prev = SetThreadAffinityMask(hThread, affinity->mask);
+ CHECK(prev != 0);
+#elif OS_LINUX
+ const pid_t pid = 0; // current thread
+ const int err = sched_setaffinity(pid, sizeof(cpu_set_t), &affinity->set);
+ CHECK(err == 0);
+#elif OS_FREEBSD
+ const pid_t pid = getpid(); // current thread
+ const int err = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid,
+ sizeof(cpuset_t), &affinity->set);
+ CHECK(err == 0);
#elif OS_MAC
-#else
-#error "port"
-#endif
-}
-
-struct ThreadAffinity {
-#if OS_WIN
- DWORD_PTR mask;
-#elif OS_LINUX
- cpu_set_t set;
-#elif OS_FREEBSD
- cpuset_t set;
-#endif
-};
-
-ThreadAffinity* GetThreadAffinity() {
- ThreadAffinity* affinity =
- static_cast<ThreadAffinity*>(malloc(sizeof(ThreadAffinity)));
-#if OS_WIN
- DWORD_PTR system_affinity;
- const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &affinity->mask,
- &system_affinity);
- CHECK(ok);
-#elif OS_LINUX
- const pid_t pid = 0; // current thread
- const int err = sched_getaffinity(pid, sizeof(cpu_set_t), &affinity->set);
- CHECK(err == 0);
-#elif OS_FREEBSD
- const pid_t pid = getpid(); // current thread
- const int err = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid,
- sizeof(cpuset_t), &affinity->set);
- CHECK(err == 0);
-#endif
- return affinity;
-}
-
-namespace {
-
-ThreadAffinity* OriginalThreadAffinity() {
- static ThreadAffinity* original = GetThreadAffinity();
- return original;
-}
-
-} // namespace
-
-void SetThreadAffinity(ThreadAffinity* affinity) {
- // Ensure original is initialized before changing.
- const ThreadAffinity* const original = OriginalThreadAffinity();
- CHECK(original != nullptr);
-
-#if OS_WIN
- const HANDLE hThread = GetCurrentThread();
- const DWORD_PTR prev = SetThreadAffinityMask(hThread, affinity->mask);
- CHECK(prev != 0);
-#elif OS_LINUX
- const pid_t pid = 0; // current thread
- const int err = sched_setaffinity(pid, sizeof(cpu_set_t), &affinity->set);
- CHECK(err == 0);
-#elif OS_FREEBSD
- const pid_t pid = getpid(); // current thread
- const int err = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid,
- sizeof(cpuset_t), &affinity->set);
- CHECK(err == 0);
+#else
+#error "port"
+#endif
+}
+
+std::vector<int> AvailableCPUs() {
+ std::vector<int> cpus;
+ cpus.reserve(64);
+ const ThreadAffinity* const affinity = OriginalThreadAffinity();
+#if OS_WIN
+ for (int cpu = 0; cpu < 64; ++cpu) {
+ if (affinity->mask & (1ULL << cpu)) {
+ cpus.push_back(cpu);
+ }
+ }
+#elif OS_LINUX
+ for (size_t cpu = 0; cpu < sizeof(cpu_set_t) * 8; ++cpu) {
+ if (CPU_ISSET(cpu, &affinity->set)) {
+ cpus.push_back(cpu);
+ }
+ }
+#elif OS_FREEBSD
+ for (size_t cpu = 0; cpu < sizeof(cpuset_t) * 8; ++cpu) {
+ if (CPU_ISSET(cpu, &affinity->set)) {
+ cpus.push_back(cpu);
+ }
+ }
#elif OS_MAC
-#else
-#error "port"
-#endif
-}
-
-std::vector<int> AvailableCPUs() {
- std::vector<int> cpus;
- cpus.reserve(64);
- const ThreadAffinity* const affinity = OriginalThreadAffinity();
-#if OS_WIN
- for (int cpu = 0; cpu < 64; ++cpu) {
- if (affinity->mask & (1ULL << cpu)) {
- cpus.push_back(cpu);
- }
- }
-#elif OS_LINUX
- for (size_t cpu = 0; cpu < sizeof(cpu_set_t) * 8; ++cpu) {
- if (CPU_ISSET(cpu, &affinity->set)) {
- cpus.push_back(cpu);
- }
- }
-#elif OS_FREEBSD
- for (size_t cpu = 0; cpu < sizeof(cpuset_t) * 8; ++cpu) {
- if (CPU_ISSET(cpu, &affinity->set)) {
- cpus.push_back(cpu);
- }
- }
+#else
+#error "port"
+#endif
+ return cpus;
+}
+
+void PinThreadToCPU(const int cpu) {
+ ThreadAffinity affinity;
+#if OS_WIN
+ affinity.mask = 1ULL << cpu;
+#elif OS_LINUX
+ CPU_ZERO(&affinity.set);
+ CPU_SET(cpu, &affinity.set);
+#elif OS_FREEBSD
+ CPU_ZERO(&affinity.set);
+ CPU_SET(cpu, &affinity.set);
#elif OS_MAC
-#else
-#error "port"
-#endif
- return cpus;
-}
-
-void PinThreadToCPU(const int cpu) {
- ThreadAffinity affinity;
-#if OS_WIN
- affinity.mask = 1ULL << cpu;
-#elif OS_LINUX
- CPU_ZERO(&affinity.set);
- CPU_SET(cpu, &affinity.set);
-#elif OS_FREEBSD
- CPU_ZERO(&affinity.set);
- CPU_SET(cpu, &affinity.set);
-#elif OS_MAC
-#else
-#error "port"
-#endif
- SetThreadAffinity(&affinity);
-}
-
-void PinThreadToRandomCPU() {
- std::vector<int> cpus = AvailableCPUs();
-
- // Remove first two CPUs because interrupts are often pinned to them.
- CHECK(cpus.size() > 2);
- cpus.erase(cpus.begin(), cpus.begin() + 2);
-
- // Random choice to prevent burning up the same core.
- std::random_device device;
- std::ranlux48 generator(device());
- std::shuffle(cpus.begin(), cpus.end(), generator);
- const int cpu = cpus.front();
-
- PinThreadToCPU(cpu);
-
-#if HH_ARCH_X64
- // After setting affinity, we should be running on the desired CPU.
- printf("Running on CPU #%d, APIC ID %02x\n", cpu, ApicId());
-#else
- printf("Running on CPU #%d\n", cpu);
-#endif
-}
-
-} // namespace highwayhash
+#else
+#error "port"
+#endif
+ SetThreadAffinity(&affinity);
+}
+
+void PinThreadToRandomCPU() {
+ std::vector<int> cpus = AvailableCPUs();
+
+ // Remove first two CPUs because interrupts are often pinned to them.
+ CHECK(cpus.size() > 2);
+ cpus.erase(cpus.begin(), cpus.begin() + 2);
+
+ // Random choice to prevent burning up the same core.
+ std::random_device device;
+ std::ranlux48 generator(device());
+ std::shuffle(cpus.begin(), cpus.end(), generator);
+ const int cpu = cpus.front();
+
+ PinThreadToCPU(cpu);
+
+#if HH_ARCH_X64
+ // After setting affinity, we should be running on the desired CPU.
+ printf("Running on CPU #%d, APIC ID %02x\n", cpu, ApicId());
+#else
+ printf("Running on CPU #%d\n", cpu);
+#endif
+}
+
+} // namespace highwayhash
diff --git a/contrib/libs/highwayhash/highwayhash/os_specific.h b/contrib/libs/highwayhash/highwayhash/os_specific.h
index 46f3c3e3ef..cefd3628e4 100644
--- a/contrib/libs/highwayhash/highwayhash/os_specific.h
+++ b/contrib/libs/highwayhash/highwayhash/os_specific.h
@@ -1,54 +1,54 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_OS_SPECIFIC_H_
-#define HIGHWAYHASH_OS_SPECIFIC_H_
-
-#include <vector>
-
-namespace highwayhash {
-
-// Returns current wall-clock time [seconds].
-double Now();
-
-// Sets this thread's priority to the maximum. This should not be called on
-// single-core systems and requires elevated permissions. A no-op on Linux,
-// where raising the priority increased runtime and variability (issue #19).
-void RaiseThreadPriority();
-
-// Returns CPU numbers in [0, N), where N is the number of bits in the
-// thread's initial affinity (unaffected by any SetThreadAffinity).
-std::vector<int> AvailableCPUs();
-
-// Opaque.
-struct ThreadAffinity;
-
-// Caller must free() the return value.
-ThreadAffinity* GetThreadAffinity();
-
-// Restores a previous affinity returned by GetThreadAffinity.
-void SetThreadAffinity(ThreadAffinity* affinity);
-
-// Ensures the thread is running on the specified cpu, and no others.
-// Useful for reducing nanobenchmark variability (fewer context switches).
-// Uses SetThreadAffinity.
-void PinThreadToCPU(const int cpu);
-
-// Random choice of CPU avoids overloading any one core.
-// Uses SetThreadAffinity.
-void PinThreadToRandomCPU();
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_OS_SPECIFIC_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_OS_SPECIFIC_H_
+#define HIGHWAYHASH_OS_SPECIFIC_H_
+
+#include <vector>
+
+namespace highwayhash {
+
+// Returns current wall-clock time [seconds].
+double Now();
+
+// Sets this thread's priority to the maximum. This should not be called on
+// single-core systems and requires elevated permissions. A no-op on Linux,
+// where raising the priority increased runtime and variability (issue #19).
+void RaiseThreadPriority();
+
+// Returns CPU numbers in [0, N), where N is the number of bits in the
+// thread's initial affinity (unaffected by any SetThreadAffinity).
+std::vector<int> AvailableCPUs();
+
+// Opaque.
+struct ThreadAffinity;
+
+// Caller must free() the return value.
+ThreadAffinity* GetThreadAffinity();
+
+// Restores a previous affinity returned by GetThreadAffinity.
+void SetThreadAffinity(ThreadAffinity* affinity);
+
+// Ensures the thread is running on the specified cpu, and no others.
+// Useful for reducing nanobenchmark variability (fewer context switches).
+// Uses SetThreadAffinity.
+void PinThreadToCPU(const int cpu);
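+
+// Usage sketch (illustrative): pin for a timed region, then restore and
+// release the saved affinity:
+//   ThreadAffinity* prev = GetThreadAffinity();
+//   PinThreadToCPU(AvailableCPUs().front());
+//   /* ...timed region... */
+//   SetThreadAffinity(prev);
+//   free(prev);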
+
+// Random choice of CPU avoids overloading any one core.
+// Uses SetThreadAffinity.
+void PinThreadToRandomCPU();
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_OS_SPECIFIC_H_
diff --git a/contrib/libs/highwayhash/highwayhash/profiler.h b/contrib/libs/highwayhash/highwayhash/profiler.h
index 09da7e71a5..9e8f5f6958 100644
--- a/contrib/libs/highwayhash/highwayhash/profiler.h
+++ b/contrib/libs/highwayhash/highwayhash/profiler.h
@@ -1,754 +1,754 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_PROFILER_H_
-#define HIGHWAYHASH_PROFILER_H_
-
-// High precision, low overhead time measurements. Returns exact call counts and
-// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes).
-//
-// Usage: add this header to BUILD srcs; instrument regions of interest:
-// { PROFILER_ZONE("name"); /*code*/ } or
-// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
-// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
-// print call counts and average durations [CPU cycles] to stdout, sorted in
-// descending order of total duration.
-
-// Configuration settings:
-
-// If zero, this file has no effect and no measurements will be recorded.
-#ifndef PROFILER_ENABLED
-#define PROFILER_ENABLED 1
-#endif
-
-// How many mebibytes to allocate (if PROFILER_ENABLED) per thread that
-// enters at least one zone. Once this buffer is full, the thread will analyze
-// and discard packets, thus temporarily adding some observer overhead.
-// Each zone occupies 16 bytes.
-#ifndef PROFILER_THREAD_STORAGE
-#define PROFILER_THREAD_STORAGE 200ULL
-#endif
-
-#if PROFILER_ENABLED
-
-#include <algorithm> // min/max
-#include <atomic>
-#include <cassert>
-#include <cstddef> // ptrdiff_t
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring> // memcpy
-#include <new>
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-
-// Non-portable aspects:
-// - SSE2 128-bit load/store (write-combining, UpdateOrAdd)
-// - RDTSCP timestamps (serializing, high-resolution)
-// - assumes string literals are stored within an 8 MiB range
-// - compiler-specific annotations (restrict, alignment, fences)
-#if HH_ARCH_X64
-#include <emmintrin.h>
-#if HH_MSC_VERSION
-#include <intrin.h>
-#else
-#include <x86intrin.h>
-#endif
-#endif
-
-#include "highwayhash/robust_statistics.h"
-#include "highwayhash/tsc_timer.h"
-
-#define PROFILER_CHECK(condition) \
- while (!(condition)) { \
- printf("Profiler check failed at line %d\n", __LINE__); \
- abort(); \
- }
-
-namespace highwayhash {
-
-// Upper bounds for various fixed-size data structures (guarded via assert):
-
-// How many threads can actually enter a zone (those that don't do not count).
-// Memory use is about kMaxThreads * PROFILER_THREAD_STORAGE MiB.
-// WARNING: a fiber library can spawn hundreds of threads.
-static constexpr size_t kMaxThreads = 128;
-
-// Maximum nesting of zones.
-static constexpr size_t kMaxDepth = 64;
-
-// Total number of zones.
-static constexpr size_t kMaxZones = 256;
-
-// Functions that depend on the cache line size.
-class CacheAligned {
- public:
- static constexpr size_t kPointerSize = sizeof(void*);
- static constexpr size_t kCacheLineSize = 64;
-
- static void* Allocate(const size_t bytes) {
- char* const allocated = static_cast<char*>(malloc(bytes + kCacheLineSize));
- if (allocated == nullptr) {
- return nullptr;
- }
- const uintptr_t misalignment =
- reinterpret_cast<uintptr_t>(allocated) & (kCacheLineSize - 1);
- // malloc is at least kPointerSize aligned, so we can store the "allocated"
- // pointer immediately before the aligned memory.
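-    // E.g. (illustrative): if malloc returns 0x1008, misalignment == 8,
-    // aligned == 0x1008 + 64 - 8 == 0x1040, and "allocated" is stored at
-    // 0x1038.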
- assert(misalignment % kPointerSize == 0);
- char* const aligned = allocated + kCacheLineSize - misalignment;
- memcpy(aligned - kPointerSize, &allocated, kPointerSize);
- return aligned;
- }
-
- // Template allows freeing pointer-to-const.
- template <typename T>
- static void Free(T* aligned_pointer) {
- if (aligned_pointer == nullptr) {
- return;
- }
- const char* const aligned = reinterpret_cast<const char*>(aligned_pointer);
- assert(reinterpret_cast<uintptr_t>(aligned) % kCacheLineSize == 0);
- char* allocated;
- memcpy(&allocated, aligned - kPointerSize, kPointerSize);
- assert(allocated <= aligned - kPointerSize);
- assert(allocated >= aligned - kCacheLineSize);
- free(allocated);
- }
-
-#if HH_ARCH_X64
- // Overwrites "to" without loading it into the cache (read-for-ownership).
- template <typename T>
- static void StreamCacheLine(const T* from_items, T* to_items) {
- const __m128i* const from = reinterpret_cast<const __m128i*>(from_items);
- __m128i* const to = reinterpret_cast<__m128i*>(to_items);
- HH_COMPILER_FENCE;
- const __m128i v0 = _mm_load_si128(from + 0);
- const __m128i v1 = _mm_load_si128(from + 1);
- const __m128i v2 = _mm_load_si128(from + 2);
- const __m128i v3 = _mm_load_si128(from + 3);
- // Fences prevent the compiler from reordering loads/stores, which may
- // interfere with write-combining.
- HH_COMPILER_FENCE;
- _mm_stream_si128(to + 0, v0);
- _mm_stream_si128(to + 1, v1);
- _mm_stream_si128(to + 2, v2);
- _mm_stream_si128(to + 3, v3);
- HH_COMPILER_FENCE;
- }
-#endif
-};
-
-// Represents zone entry/exit events. Stores a full-resolution timestamp plus
-// an offset (representing zone name or identifying exit packets). POD.
-class Packet {
- public:
- // If offsets do not fit, UpdateOrAdd will overrun our heap allocation
- // (governed by kMaxZones). We have seen multi-megabyte offsets.
- static constexpr size_t kOffsetBits = 25;
- static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1);
-
- // We need full-resolution timestamps; at an effective rate of 4 GHz,
- // this permits 1 minute zone durations (for longer durations, split into
- // multiple zones). Wraparound is handled by masking.
- static constexpr size_t kTimestampBits = 64 - kOffsetBits;
- static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1;
-
- static Packet Make(const size_t biased_offset, const uint64_t timestamp) {
- assert(biased_offset < (1ULL << kOffsetBits));
-
- Packet packet;
- packet.bits_ =
- (biased_offset << kTimestampBits) + (timestamp & kTimestampMask);
- return packet;
- }
-
- uint64_t Timestamp() const { return bits_ & kTimestampMask; }
-
- size_t BiasedOffset() const { return (bits_ >> kTimestampBits); }
-
- private:
- uint64_t bits_;
-};
-static_assert(sizeof(Packet) == 8, "Wrong Packet size");
-
-// Returns the address of a string literal. Assuming zone names are also
-// literals and stored nearby, we can represent them as offsets, which are
-// faster to compute than hashes or even a static index.
-//
-// This function must not be static - each call (even from other translation
-// units) must return the same value.
-inline const char* StringOrigin() {
- // Chosen such that no zone name is a prefix nor suffix of this string
- // to ensure they aren't merged (offset 0 identifies zone-exit packets).
- static const char* string_origin = "__#__";
- return string_origin - Packet::kOffsetBias;
-}
-
-// Representation of an active zone, stored in a stack. Used to deduct
-// child duration from the parent's self time. POD.
-struct Node {
- Packet packet;
- uint64_t child_total;
-};
-
-// Holds statistics for all zones with the same name. POD.
-struct Accumulator {
- static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits;
-
- uint64_t BiasedOffset() const { return num_calls >> kNumCallBits; }
- uint64_t NumCalls() const { return num_calls & ((1ULL << kNumCallBits) - 1); }
-
- // UpdateOrAdd relies upon this layout.
- uint64_t num_calls = 0; // upper bits = biased_offset.
- uint64_t total_duration = 0;
-};
-#if HH_ARCH_X64
-static_assert(sizeof(Accumulator) == sizeof(__m128i), "Wrong Accumulator size");
-#endif
-
-template <typename T>
-inline T ClampedSubtract(const T minuend, const T subtrahend) {
- if (subtrahend > minuend) {
- return 0;
- }
- return minuend - subtrahend;
-}
-
-// Per-thread call graph (stack) and Accumulator for each zone.
-class Results {
- public:
- Results() {
- // Zero-initialize first accumulator to avoid a check for num_zones_ == 0.
- memset(zones_, 0, sizeof(Accumulator));
- }
-
- // Used for computing overhead when this thread encounters its first Zone.
- // This has no observable effect apart from increasing "analyze_elapsed_".
- uint64_t ZoneDuration(const Packet* packets) {
- PROFILER_CHECK(depth_ == 0);
- PROFILER_CHECK(num_zones_ == 0);
- AnalyzePackets(packets, 2);
- const uint64_t duration = zones_[0].total_duration;
- zones_[0].num_calls = 0;
- zones_[0].total_duration = 0;
- PROFILER_CHECK(depth_ == 0);
- num_zones_ = 0;
- return duration;
- }
-
- void SetSelfOverhead(const uint64_t self_overhead) {
- self_overhead_ = self_overhead;
- }
-
- void SetChildOverhead(const uint64_t child_overhead) {
- child_overhead_ = child_overhead;
- }
-
- // Draw all required information from the packets, which can be discarded
- // afterwards. Called whenever this thread's storage is full.
- void AnalyzePackets(const Packet* packets, const size_t num_packets) {
- const uint64_t t0 = Start<uint64_t>();
-
- for (size_t i = 0; i < num_packets; ++i) {
- const Packet p = packets[i];
- // Entering a zone
- if (p.BiasedOffset() != Packet::kOffsetBias) {
- assert(depth_ < kMaxDepth);
- nodes_[depth_].packet = p;
- nodes_[depth_].child_total = 0;
- ++depth_;
- continue;
- }
-
- assert(depth_ != 0);
- const Node& node = nodes_[depth_ - 1];
- // Masking correctly handles unsigned wraparound.
- const uint64_t duration =
- (p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask;
- const uint64_t self_duration = ClampedSubtract(
- duration, self_overhead_ + child_overhead_ + node.child_total);
-
- UpdateOrAdd(node.packet.BiasedOffset(), self_duration);
- --depth_;
-
- // Deduct this nested node's time from its parent's self_duration.
- if (depth_ != 0) {
- nodes_[depth_ - 1].child_total += duration + child_overhead_;
- }
- }
-
- const uint64_t t1 = Stop<uint64_t>();
- analyze_elapsed_ += t1 - t0;
- }
-
- // Incorporates results from another thread. Call after all threads have
- // exited any zones.
- void Assimilate(const Results& other) {
- const uint64_t t0 = Start<uint64_t>();
- assert(depth_ == 0);
- assert(other.depth_ == 0);
-
- for (size_t i = 0; i < other.num_zones_; ++i) {
- const Accumulator& zone = other.zones_[i];
- UpdateOrAdd(zone.BiasedOffset(), zone.total_duration);
- }
- const uint64_t t1 = Stop<uint64_t>();
- analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
- }
-
- // Single-threaded.
- void Print() {
- const uint64_t t0 = Start<uint64_t>();
- MergeDuplicates();
-
- // Sort by decreasing total (self) cost.
- std::sort(zones_, zones_ + num_zones_,
- [](const Accumulator& r1, const Accumulator& r2) {
- return r1.total_duration > r2.total_duration;
- });
-
- const char* string_origin = StringOrigin();
- for (size_t i = 0; i < num_zones_; ++i) {
- const Accumulator& r = zones_[i];
- const uint64_t num_calls = r.NumCalls();
- printf("%40s: %10zu x %15zu = %15zu\n", string_origin + r.BiasedOffset(),
- num_calls, r.total_duration / num_calls, r.total_duration);
- }
-
- const uint64_t t1 = Stop<uint64_t>();
- analyze_elapsed_ += t1 - t0;
- printf("Total clocks during analysis: %zu\n", analyze_elapsed_);
- }
-
- private:
-#if HH_ARCH_X64
- static bool SameOffset(const __m128i& zone, const size_t biased_offset) {
- const uint64_t num_calls = _mm_cvtsi128_si64(zone);
- return (num_calls >> Accumulator::kNumCallBits) == biased_offset;
- }
-#endif
-
- // Updates an existing Accumulator (uniquely identified by biased_offset) or
- // adds one if this is the first time this thread analyzed that zone.
- // Uses a self-organizing list data structure, which avoids dynamic memory
- // allocations and is far faster than unordered_map. Loads, updates and
- // stores the entire Accumulator with vector instructions.
- void UpdateOrAdd(const size_t biased_offset, const uint64_t duration) {
- assert(biased_offset < (1ULL << Packet::kOffsetBits));
-
-#if HH_ARCH_X64
- const __m128i one_64 = _mm_set1_epi64x(1);
- const __m128i duration_64 = _mm_cvtsi64_si128(duration);
- const __m128i add_duration_call = _mm_unpacklo_epi64(one_64, duration_64);
-
- __m128i* const HH_RESTRICT zones = reinterpret_cast<__m128i*>(zones_);
-
- // Special case for first zone: (maybe) update, without swapping.
- __m128i prev = _mm_load_si128(zones);
- if (SameOffset(prev, biased_offset)) {
- prev = _mm_add_epi64(prev, add_duration_call);
- assert(SameOffset(prev, biased_offset));
- _mm_store_si128(zones, prev);
- return;
- }
-
- // Look for a zone with the same offset.
- for (size_t i = 1; i < num_zones_; ++i) {
- __m128i zone = _mm_load_si128(zones + i);
- if (SameOffset(zone, biased_offset)) {
- zone = _mm_add_epi64(zone, add_duration_call);
- assert(SameOffset(zone, biased_offset));
- // Swap with predecessor (more conservative than move to front,
- // but at least as successful).
- _mm_store_si128(zones + i - 1, zone);
- _mm_store_si128(zones + i, prev);
- return;
- }
- prev = zone;
- }
-
- // Not found; create a new Accumulator.
- const __m128i biased_offset_64 = _mm_slli_epi64(
- _mm_cvtsi64_si128(biased_offset), Accumulator::kNumCallBits);
- const __m128i zone = _mm_add_epi64(biased_offset_64, add_duration_call);
- assert(SameOffset(zone, biased_offset));
-
- assert(num_zones_ < kMaxZones);
- _mm_store_si128(zones + num_zones_, zone);
- ++num_zones_;
-#else
- // Special case for first zone: (maybe) update, without swapping.
- if (zones_[0].BiasedOffset() == biased_offset) {
- zones_[0].total_duration += duration;
- zones_[0].num_calls += 1;
- assert(zones_[0].BiasedOffset() == biased_offset);
- return;
- }
-
- // Look for a zone with the same offset.
- for (size_t i = 1; i < num_zones_; ++i) {
- if (zones_[i].BiasedOffset() == biased_offset) {
- zones_[i].total_duration += duration;
- zones_[i].num_calls += 1;
- assert(zones_[i].BiasedOffset() == biased_offset);
- // Swap with predecessor (more conservative than move to front,
- // but at least as successful).
- const Accumulator prev = zones_[i - 1];
- zones_[i - 1] = zones_[i];
- zones_[i] = prev;
- return;
- }
- }
-
- // Not found; create a new Accumulator.
- assert(num_zones_ < kMaxZones);
- Accumulator* HH_RESTRICT zone = zones_ + num_zones_;
- zone->num_calls = (biased_offset << Accumulator::kNumCallBits) + 1;
- zone->total_duration = duration;
- assert(zone->BiasedOffset() == biased_offset);
- ++num_zones_;
-#endif
- }
-
- // Each instantiation of a function template seems to get its own copy of
- // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
- // acceptable because we only expect a few dozen zones.
- void MergeDuplicates() {
- const char* string_origin = StringOrigin();
- for (size_t i = 0; i < num_zones_; ++i) {
- const size_t biased_offset = zones_[i].BiasedOffset();
- const char* name = string_origin + biased_offset;
- // Separate num_calls from biased_offset so we can add them together.
- uint64_t num_calls = zones_[i].NumCalls();
-
- // Add any subsequent duplicates to num_calls and total_duration.
- for (size_t j = i + 1; j < num_zones_;) {
- if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) {
- num_calls += zones_[j].NumCalls();
- zones_[i].total_duration += zones_[j].total_duration;
- // Fill hole with last item.
- zones_[j] = zones_[--num_zones_];
- } else { // Name differed, try next Accumulator.
- ++j;
- }
- }
-
- assert(num_calls < (1ULL << Accumulator::kNumCallBits));
-
- // Re-pack regardless of whether any duplicates were found.
- zones_[i].num_calls =
- (biased_offset << Accumulator::kNumCallBits) + num_calls;
- }
- }
-
- uint64_t analyze_elapsed_ = 0;
- uint64_t self_overhead_ = 0;
- uint64_t child_overhead_ = 0;
-
- size_t depth_ = 0; // Number of active zones.
- size_t num_zones_ = 0; // Number of retired zones.
-
- HH_ALIGNAS(64) Node nodes_[kMaxDepth]; // Stack
- HH_ALIGNAS(64) Accumulator zones_[kMaxZones]; // Self-organizing list
-};
-
-// Per-thread packet storage, allocated via CacheAligned.
-class ThreadSpecific {
- static constexpr size_t kBufferCapacity =
- CacheAligned::kCacheLineSize / sizeof(Packet);
-
- public:
- // "name" is used to sanity-check offsets fit in kOffsetBits.
- explicit ThreadSpecific(const char* name)
- : packets_(static_cast<Packet*>(
- CacheAligned::Allocate(PROFILER_THREAD_STORAGE << 20))),
- num_packets_(0),
- max_packets_(PROFILER_THREAD_STORAGE << 17),
- string_origin_(StringOrigin()) {
- // Even in optimized builds (with NDEBUG), verify that this zone's name
- // offset fits within the allotted space. If not, UpdateOrAdd is likely to
- // overrun zones_[]. We also assert(), but users often do not run debug
- // builds. Checking here on the cold path (only reached once per thread)
- // is cheap, but it only covers one zone.
- const size_t biased_offset = name - string_origin_;
- PROFILER_CHECK(biased_offset <= (1ULL << Packet::kOffsetBits));
- }
-
- ~ThreadSpecific() { CacheAligned::Free(packets_); }
-
- // Depends on Zone => defined below.
- void ComputeOverhead();
-
- void WriteEntry(const char* name, const uint64_t timestamp) {
- const size_t biased_offset = name - string_origin_;
- Write(Packet::Make(biased_offset, timestamp));
- }
-
- void WriteExit(const uint64_t timestamp) {
- const size_t biased_offset = Packet::kOffsetBias;
- Write(Packet::Make(biased_offset, timestamp));
- }
-
- void AnalyzeRemainingPackets() {
-#if HH_ARCH_X64
- // Ensures prior weakly-ordered streaming stores are globally visible.
- _mm_sfence();
-
- // Storage full => empty it.
- if (num_packets_ + buffer_size_ > max_packets_) {
- results_.AnalyzePackets(packets_, num_packets_);
- num_packets_ = 0;
- }
- memcpy(packets_ + num_packets_, buffer_, buffer_size_ * sizeof(Packet));
- num_packets_ += buffer_size_;
-#endif
-
- results_.AnalyzePackets(packets_, num_packets_);
- num_packets_ = 0;
- }
-
- Results& GetResults() { return results_; }
-
- private:
- // Write packet to buffer/storage, emptying them as needed.
- void Write(const Packet packet) {
-#if HH_ARCH_X64
- // Buffer full => copy to storage.
- if (buffer_size_ == kBufferCapacity) {
- // Storage full => empty it.
- if (num_packets_ + kBufferCapacity > max_packets_) {
- results_.AnalyzePackets(packets_, num_packets_);
- num_packets_ = 0;
- }
- // This buffering halves observer overhead and decreases the overall
- // runtime by about 3%.
- CacheAligned::StreamCacheLine(buffer_, packets_ + num_packets_);
- num_packets_ += kBufferCapacity;
- buffer_size_ = 0;
- }
- buffer_[buffer_size_] = packet;
- ++buffer_size_;
-#else
- // Write directly to storage.
- if (num_packets_ >= max_packets_) {
- results_.AnalyzePackets(packets_, num_packets_);
- num_packets_ = 0;
- }
- packets_[num_packets_] = packet;
- ++num_packets_;
-#endif
- }
-
- // Write-combining buffer to avoid cache pollution. Must be the first
- // non-static member to ensure cache-line alignment.
-#if HH_ARCH_X64
- Packet buffer_[kBufferCapacity];
- size_t buffer_size_ = 0;
-#endif
-
- // Contiguous storage for zone enter/exit packets.
- Packet* const HH_RESTRICT packets_;
- size_t num_packets_;
- const size_t max_packets_;
- // Cached here because we already read this cache line on zone entry/exit.
- const char* HH_RESTRICT string_origin_;
- Results results_;
-};
-
-class ThreadList {
- public:
- // Thread-safe.
- void Add(ThreadSpecific* const ts) {
- const uint32_t index = num_threads_.fetch_add(1);
- PROFILER_CHECK(index < kMaxThreads);
- threads_[index] = ts;
- }
-
- // Single-threaded.
- void PrintResults() {
- const uint32_t num_threads = num_threads_.load();
- for (uint32_t i = 0; i < num_threads; ++i) {
- threads_[i]->AnalyzeRemainingPackets();
- }
-
- // Combine all threads into a single Result.
- for (uint32_t i = 1; i < num_threads; ++i) {
- threads_[0]->GetResults().Assimilate(threads_[i]->GetResults());
- }
-
- if (num_threads != 0) {
- threads_[0]->GetResults().Print();
- }
- }
-
- private:
- // Owning pointers.
- HH_ALIGNAS(64) ThreadSpecific* threads_[kMaxThreads];
- std::atomic<uint32_t> num_threads_{0};
-};
-
-// RAII zone enter/exit recorder constructed by the ZONE macro; also
-// responsible for initializing ThreadSpecific.
-class Zone {
- public:
- // "name" must be a string literal (see StringOrigin).
- HH_NOINLINE explicit Zone(const char* name) {
- HH_COMPILER_FENCE;
- ThreadSpecific* HH_RESTRICT thread_specific = StaticThreadSpecific();
- if (HH_UNLIKELY(thread_specific == nullptr)) {
- void* mem = CacheAligned::Allocate(sizeof(ThreadSpecific));
- thread_specific = new (mem) ThreadSpecific(name);
- // Must happen before ComputeOverhead, which re-enters this ctor.
- Threads().Add(thread_specific);
- StaticThreadSpecific() = thread_specific;
- thread_specific->ComputeOverhead();
- }
-
- // (Capture timestamp ASAP, not inside WriteEntry.)
- HH_COMPILER_FENCE;
- const uint64_t timestamp = Start<uint64_t>();
- thread_specific->WriteEntry(name, timestamp);
- }
-
- HH_NOINLINE ~Zone() {
- HH_COMPILER_FENCE;
- const uint64_t timestamp = Stop<uint64_t>();
- StaticThreadSpecific()->WriteExit(timestamp);
- HH_COMPILER_FENCE;
- }
-
- // Call exactly once after all threads have exited all zones.
- static void PrintResults() { Threads().PrintResults(); }
-
- private:
- // Returns reference to the thread's ThreadSpecific pointer (initially null).
- // Function-local static avoids needing a separate definition.
- static ThreadSpecific*& StaticThreadSpecific() {
- static thread_local ThreadSpecific* thread_specific;
- return thread_specific;
- }
-
- // Returns the singleton ThreadList. Non time-critical.
- static ThreadList& Threads() {
- static ThreadList threads_;
- return threads_;
- }
-};
-
-// Creates a zone starting from here until the end of the current scope.
-// Timestamps will be recorded when entering and exiting the zone.
-// "name" must be a string literal, which is ensured by merging with "".
-#define PROFILER_ZONE(name) \
- HH_COMPILER_FENCE; \
- const Zone zone("" name); \
- HH_COMPILER_FENCE
-
-// Creates a zone for an entire function (when placed at its beginning).
-// Shorter/more convenient than ZONE.
-#define PROFILER_FUNC \
- HH_COMPILER_FENCE; \
- const Zone zone(__func__); \
- HH_COMPILER_FENCE
-
-#define PROFILER_PRINT_RESULTS Zone::PrintResults
-
-inline void ThreadSpecific::ComputeOverhead() {
- // Delay after capturing timestamps before/after the actual zone runs. Even
- // with frequency throttling disabled, this has a multimodal distribution,
- // including 32, 34, 48, 52, 59, 62.
- uint64_t self_overhead;
- {
- const size_t kNumSamples = 32;
- uint32_t samples[kNumSamples];
- for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
- const size_t kNumDurations = 1024;
- uint32_t durations[kNumDurations];
-
- for (size_t idx_duration = 0; idx_duration < kNumDurations;
- ++idx_duration) {
- { PROFILER_ZONE("Dummy Zone (never shown)"); }
-#if HH_ARCH_X64
- const uint64_t duration = results_.ZoneDuration(buffer_);
- buffer_size_ = 0;
-#else
- const uint64_t duration = results_.ZoneDuration(packets_);
- num_packets_ = 0;
-#endif
- durations[idx_duration] = static_cast<uint32_t>(duration);
- PROFILER_CHECK(num_packets_ == 0);
- }
- CountingSort(durations, durations + kNumDurations);
- samples[idx_sample] = Mode(durations, kNumDurations);
- }
- // Median.
- CountingSort(samples, samples + kNumSamples);
- self_overhead = samples[kNumSamples / 2];
- printf("Overhead: %zu\n", self_overhead);
- results_.SetSelfOverhead(self_overhead);
- }
-
- // Delay before capturing start timestamp / after end timestamp.
- const size_t kNumSamples = 32;
- uint32_t samples[kNumSamples];
- for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
- const size_t kNumDurations = 16;
- uint32_t durations[kNumDurations];
- for (size_t idx_duration = 0; idx_duration < kNumDurations;
- ++idx_duration) {
- const size_t kReps = 10000;
- // Analysis time should not be included => must fit within buffer.
- PROFILER_CHECK(kReps * 2 < max_packets_);
-#if HH_ARCH_X64
- _mm_mfence();
-#endif
- const uint64_t t0 = Start<uint64_t>();
- for (size_t i = 0; i < kReps; ++i) {
- PROFILER_ZONE("Dummy");
- }
-#if HH_ARCH_X64
- _mm_sfence();
-#endif
- const uint64_t t1 = Stop<uint64_t>();
-#if HH_ARCH_X64
- PROFILER_CHECK(num_packets_ + buffer_size_ == kReps * 2);
- buffer_size_ = 0;
-#else
- PROFILER_CHECK(num_packets_ == kReps * 2);
-#endif
- num_packets_ = 0;
- const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
- durations[idx_duration] =
- static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
- }
- CountingSort(durations, durations + kNumDurations);
- samples[idx_sample] = Mode(durations, kNumDurations);
- }
- CountingSort(samples, samples + kNumSamples);
- const uint64_t child_overhead = samples[9 * kNumSamples / 10];
- printf("Child overhead: %zu\n", child_overhead);
- results_.SetChildOverhead(child_overhead);
-}
-
-} // namespace highwayhash
-
-#else // !PROFILER_ENABLED
-#define PROFILER_ZONE(name)
-#define PROFILER_FUNC
-#define PROFILER_PRINT_RESULTS()
-#endif
-
-#endif // HIGHWAYHASH_PROFILER_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_PROFILER_H_
+#define HIGHWAYHASH_PROFILER_H_
+
+// High precision, low overhead time measurements. Returns exact call counts and
+// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes).
+//
+// Usage: add this header to BUILD srcs; instrument regions of interest:
+// { PROFILER_ZONE("name"); /*code*/ } or
+// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
+// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
+// print call counts and average durations [CPU cycles] to stdout, sorted in
+// descending order of total duration.
+
+// Configuration settings:
+
+// If zero, this file has no effect and no measurements will be recorded.
+#ifndef PROFILER_ENABLED
+#define PROFILER_ENABLED 1
+#endif
+
+// How many mebibytes to allocate (if PROFILER_ENABLED) per thread that
+// enters at least one zone. Once this buffer is full, the thread will analyze
+// and discard packets, thus temporarily adding some observer overhead.
+// Each zone occupies 16 bytes.
+#ifndef PROFILER_THREAD_STORAGE
+#define PROFILER_THREAD_STORAGE 200ULL
+#endif
+
+#if PROFILER_ENABLED
+
+#include <algorithm> // min/max
+#include <atomic>
+#include <cassert>
+#include <cstddef> // ptrdiff_t
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring> // memcpy
+#include <new>
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+
+// Non-portable aspects:
+// - SSE2 128-bit load/store (write-combining, UpdateOrAdd)
+// - RDTSCP timestamps (serializing, high-resolution)
+// - assumes string literals are stored within an 8 MiB range
+// - compiler-specific annotations (restrict, alignment, fences)
+#if HH_ARCH_X64
+#include <emmintrin.h>
+#if HH_MSC_VERSION
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#endif
+
+#include "highwayhash/robust_statistics.h"
+#include "highwayhash/tsc_timer.h"
+
+#define PROFILER_CHECK(condition) \
+ while (!(condition)) { \
+ printf("Profiler check failed at line %d\n", __LINE__); \
+ abort(); \
+ }
+
+namespace highwayhash {
+
+// Upper bounds for various fixed-size data structures (guarded via assert):
+
+// How many threads can actually enter a zone (those that don't do not count).
+// Memory use is about kMaxThreads * PROFILER_THREAD_STORAGE MiB.
+// WARNING: a fiber library can spawn hundreds of threads.
+static constexpr size_t kMaxThreads = 128;
+
+// Maximum nesting of zones.
+static constexpr size_t kMaxDepth = 64;
+
+// Total number of zones.
+static constexpr size_t kMaxZones = 256;
+
+// Functions that depend on the cache line size.
+class CacheAligned {
+ public:
+ static constexpr size_t kPointerSize = sizeof(void*);
+ static constexpr size_t kCacheLineSize = 64;
+
+ static void* Allocate(const size_t bytes) {
+ char* const allocated = static_cast<char*>(malloc(bytes + kCacheLineSize));
+ if (allocated == nullptr) {
+ return nullptr;
+ }
+ const uintptr_t misalignment =
+ reinterpret_cast<uintptr_t>(allocated) & (kCacheLineSize - 1);
+ // malloc is at least kPointerSize aligned, so we can store the "allocated"
+ // pointer immediately before the aligned memory.
+ assert(misalignment % kPointerSize == 0);
+ char* const aligned = allocated + kCacheLineSize - misalignment;
+ memcpy(aligned - kPointerSize, &allocated, kPointerSize);
+ return aligned;
+ }
+
+ // Template allows freeing pointer-to-const.
+ template <typename T>
+ static void Free(T* aligned_pointer) {
+ if (aligned_pointer == nullptr) {
+ return;
+ }
+ const char* const aligned = reinterpret_cast<const char*>(aligned_pointer);
+ assert(reinterpret_cast<uintptr_t>(aligned) % kCacheLineSize == 0);
+ char* allocated;
+ memcpy(&allocated, aligned - kPointerSize, kPointerSize);
+ assert(allocated <= aligned - kPointerSize);
+ assert(allocated >= aligned - kCacheLineSize);
+ free(allocated);
+ }
+
+#if HH_ARCH_X64
+ // Overwrites "to" without loading it into the cache (read-for-ownership).
+ template <typename T>
+ static void StreamCacheLine(const T* from_items, T* to_items) {
+ const __m128i* const from = reinterpret_cast<const __m128i*>(from_items);
+ __m128i* const to = reinterpret_cast<__m128i*>(to_items);
+ HH_COMPILER_FENCE;
+ const __m128i v0 = _mm_load_si128(from + 0);
+ const __m128i v1 = _mm_load_si128(from + 1);
+ const __m128i v2 = _mm_load_si128(from + 2);
+ const __m128i v3 = _mm_load_si128(from + 3);
+ // Fences prevent the compiler from reordering loads/stores, which may
+ // interfere with write-combining.
+ HH_COMPILER_FENCE;
+ _mm_stream_si128(to + 0, v0);
+ _mm_stream_si128(to + 1, v1);
+ _mm_stream_si128(to + 2, v2);
+ _mm_stream_si128(to + 3, v3);
+ HH_COMPILER_FENCE;
+ }
+#endif
+};
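+
+// Editorial usage sketch (not part of the original header; kCount and
+// "values" are hypothetical names): Allocate() over-allocates by one cache
+// line, returns an aligned pointer, and stashes the raw malloc pointer just
+// below it so Free() can recover it.
+//
+//   constexpr size_t kCount = 1024;
+//   uint64_t* values = static_cast<uint64_t*>(
+//       CacheAligned::Allocate(kCount * sizeof(uint64_t)));
+//   assert(reinterpret_cast<uintptr_t>(values) %
+//          CacheAligned::kCacheLineSize == 0);
+//   CacheAligned::Free(values);  // reads back and frees the raw pointer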
+
+// Represents zone entry/exit events. Stores a full-resolution timestamp plus
+// an offset (representing zone name or identifying exit packets). POD.
+class Packet {
+ public:
+ // If offsets do not fit, UpdateOrAdd will overrun our heap allocation
+ // (governed by kMaxZones). We have seen multi-megabyte offsets.
+ static constexpr size_t kOffsetBits = 25;
+ static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1);
+
+ // We need full-resolution timestamps; at an effective rate of 4 GHz,
+ // this permits 1 minute zone durations (for longer durations, split into
+ // multiple zones). Wraparound is handled by masking.
+ static constexpr size_t kTimestampBits = 64 - kOffsetBits;
+ static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1;
+
+ static Packet Make(const size_t biased_offset, const uint64_t timestamp) {
+ assert(biased_offset < (1ULL << kOffsetBits));
+
+ Packet packet;
+ packet.bits_ =
+ (biased_offset << kTimestampBits) + (timestamp & kTimestampMask);
+ return packet;
+ }
+
+ uint64_t Timestamp() const { return bits_ & kTimestampMask; }
+
+ size_t BiasedOffset() const { return (bits_ >> kTimestampBits); }
+
+ private:
+ uint64_t bits_;
+};
+static_assert(sizeof(Packet) == 8, "Wrong Packet size");
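+
+// Editorial worked example of the layout above (values are illustrative):
+// the biased offset occupies the upper kOffsetBits = 25 bits and the
+// timestamp the lower kTimestampBits = 39 bits, so packing and unpacking
+// round-trip as long as the timestamp is reduced modulo 2^39.
+//
+//   const Packet p = Packet::Make(3, 0x123456789AULL);
+//   assert(p.BiasedOffset() == 3);
+//   assert(p.Timestamp() == (0x123456789AULL & Packet::kTimestampMask));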
+
+// Returns the address of a string literal. Assuming zone names are also
+// literals and stored nearby, we can represent them as offsets, which are
+// faster to compute than hashes or even a static index.
+//
+// This function must not be static - each call (even from other translation
+// units) must return the same value.
+inline const char* StringOrigin() {
+ // Chosen such that no zone name is a prefix nor suffix of this string
+ // to ensure they aren't merged (offset 0 identifies zone-exit packets).
+ static const char* string_origin = "__#__";
+ return string_origin - Packet::kOffsetBias;
+}
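+
+// Editorial sketch of the offset scheme ("MyZone" is a hypothetical name):
+// a zone name is encoded as its distance from StringOrigin(), which is
+// already shifted down by kOffsetBias so that literals at nearby-but-lower
+// addresses still yield a positive offset. Decoding is a single addition.
+//
+//   const char* name = "MyZone";  // must be a string literal
+//   const size_t biased_offset = name - StringOrigin();
+//   const char* recovered = StringOrigin() + biased_offset;  // == name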
+
+// Representation of an active zone, stored in a stack. Used to deduct
+// child duration from the parent's self time. POD.
+struct Node {
+ Packet packet;
+ uint64_t child_total;
+};
+
+// Holds statistics for all zones with the same name. POD.
+struct Accumulator {
+ static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits;
+
+ uint64_t BiasedOffset() const { return num_calls >> kNumCallBits; }
+ uint64_t NumCalls() const { return num_calls & ((1ULL << kNumCallBits) - 1); }
+
+ // UpdateOrAdd relies upon this layout.
+ uint64_t num_calls = 0; // upper bits = biased_offset.
+ uint64_t total_duration = 0;
+};
+#if HH_ARCH_X64
+static_assert(sizeof(Accumulator) == sizeof(__m128i), "Wrong Accumulator size");
+#endif
+
+template <typename T>
+inline T ClampedSubtract(const T minuend, const T subtrahend) {
+ if (subtrahend > minuend) {
+ return 0;
+ }
+ return minuend - subtrahend;
+}
+
+// Per-thread call graph (stack) and Accumulator for each zone.
+class Results {
+ public:
+ Results() {
+ // Zero-initialize first accumulator to avoid a check for num_zones_ == 0.
+ memset(zones_, 0, sizeof(Accumulator));
+ }
+
+ // Used for computing overhead when this thread encounters its first Zone.
+ // This has no observable effect apart from increasing "analyze_elapsed_".
+ uint64_t ZoneDuration(const Packet* packets) {
+ PROFILER_CHECK(depth_ == 0);
+ PROFILER_CHECK(num_zones_ == 0);
+ AnalyzePackets(packets, 2);
+ const uint64_t duration = zones_[0].total_duration;
+ zones_[0].num_calls = 0;
+ zones_[0].total_duration = 0;
+ PROFILER_CHECK(depth_ == 0);
+ num_zones_ = 0;
+ return duration;
+ }
+
+ void SetSelfOverhead(const uint64_t self_overhead) {
+ self_overhead_ = self_overhead;
+ }
+
+ void SetChildOverhead(const uint64_t child_overhead) {
+ child_overhead_ = child_overhead;
+ }
+
+ // Draw all required information from the packets, which can be discarded
+ // afterwards. Called whenever this thread's storage is full.
+ void AnalyzePackets(const Packet* packets, const size_t num_packets) {
+ const uint64_t t0 = Start<uint64_t>();
+
+ for (size_t i = 0; i < num_packets; ++i) {
+ const Packet p = packets[i];
+ // Entering a zone
+ if (p.BiasedOffset() != Packet::kOffsetBias) {
+ assert(depth_ < kMaxDepth);
+ nodes_[depth_].packet = p;
+ nodes_[depth_].child_total = 0;
+ ++depth_;
+ continue;
+ }
+
+ assert(depth_ != 0);
+ const Node& node = nodes_[depth_ - 1];
+ // Masking correctly handles unsigned wraparound.
+ const uint64_t duration =
+ (p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask;
+ const uint64_t self_duration = ClampedSubtract(
+ duration, self_overhead_ + child_overhead_ + node.child_total);
+
+ UpdateOrAdd(node.packet.BiasedOffset(), self_duration);
+ --depth_;
+
+ // Deduct this nested node's time from its parent's self_duration.
+ if (depth_ != 0) {
+ nodes_[depth_ - 1].child_total += duration + child_overhead_;
+ }
+ }
+
+ const uint64_t t1 = Stop<uint64_t>();
+ analyze_elapsed_ += t1 - t0;
+ }
+
+  // Incorporates results from another thread. Call after all threads have
+  // exited all zones.
+ void Assimilate(const Results& other) {
+ const uint64_t t0 = Start<uint64_t>();
+ assert(depth_ == 0);
+ assert(other.depth_ == 0);
+
+ for (size_t i = 0; i < other.num_zones_; ++i) {
+ const Accumulator& zone = other.zones_[i];
+ UpdateOrAdd(zone.BiasedOffset(), zone.total_duration);
+ }
+ const uint64_t t1 = Stop<uint64_t>();
+ analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
+ }
+
+ // Single-threaded.
+ void Print() {
+ const uint64_t t0 = Start<uint64_t>();
+ MergeDuplicates();
+
+ // Sort by decreasing total (self) cost.
+ std::sort(zones_, zones_ + num_zones_,
+ [](const Accumulator& r1, const Accumulator& r2) {
+ return r1.total_duration > r2.total_duration;
+ });
+
+ const char* string_origin = StringOrigin();
+ for (size_t i = 0; i < num_zones_; ++i) {
+ const Accumulator& r = zones_[i];
+ const uint64_t num_calls = r.NumCalls();
+ printf("%40s: %10zu x %15zu = %15zu\n", string_origin + r.BiasedOffset(),
+ num_calls, r.total_duration / num_calls, r.total_duration);
+ }
+
+ const uint64_t t1 = Stop<uint64_t>();
+ analyze_elapsed_ += t1 - t0;
+ printf("Total clocks during analysis: %zu\n", analyze_elapsed_);
+ }
+
+ private:
+#if HH_ARCH_X64
+ static bool SameOffset(const __m128i& zone, const size_t biased_offset) {
+ const uint64_t num_calls = _mm_cvtsi128_si64(zone);
+ return (num_calls >> Accumulator::kNumCallBits) == biased_offset;
+ }
+#endif
+
+ // Updates an existing Accumulator (uniquely identified by biased_offset) or
+ // adds one if this is the first time this thread analyzed that zone.
+ // Uses a self-organizing list data structure, which avoids dynamic memory
+ // allocations and is far faster than unordered_map. Loads, updates and
+ // stores the entire Accumulator with vector instructions.
+ void UpdateOrAdd(const size_t biased_offset, const uint64_t duration) {
+ assert(biased_offset < (1ULL << Packet::kOffsetBits));
+
+#if HH_ARCH_X64
+ const __m128i one_64 = _mm_set1_epi64x(1);
+ const __m128i duration_64 = _mm_cvtsi64_si128(duration);
+ const __m128i add_duration_call = _mm_unpacklo_epi64(one_64, duration_64);
+
+ __m128i* const HH_RESTRICT zones = reinterpret_cast<__m128i*>(zones_);
+
+ // Special case for first zone: (maybe) update, without swapping.
+ __m128i prev = _mm_load_si128(zones);
+ if (SameOffset(prev, biased_offset)) {
+ prev = _mm_add_epi64(prev, add_duration_call);
+ assert(SameOffset(prev, biased_offset));
+ _mm_store_si128(zones, prev);
+ return;
+ }
+
+ // Look for a zone with the same offset.
+ for (size_t i = 1; i < num_zones_; ++i) {
+ __m128i zone = _mm_load_si128(zones + i);
+ if (SameOffset(zone, biased_offset)) {
+ zone = _mm_add_epi64(zone, add_duration_call);
+ assert(SameOffset(zone, biased_offset));
+ // Swap with predecessor (more conservative than move to front,
+ // but at least as successful).
+ _mm_store_si128(zones + i - 1, zone);
+ _mm_store_si128(zones + i, prev);
+ return;
+ }
+ prev = zone;
+ }
+
+ // Not found; create a new Accumulator.
+ const __m128i biased_offset_64 = _mm_slli_epi64(
+ _mm_cvtsi64_si128(biased_offset), Accumulator::kNumCallBits);
+ const __m128i zone = _mm_add_epi64(biased_offset_64, add_duration_call);
+ assert(SameOffset(zone, biased_offset));
+
+ assert(num_zones_ < kMaxZones);
+ _mm_store_si128(zones + num_zones_, zone);
+ ++num_zones_;
+#else
+ // Special case for first zone: (maybe) update, without swapping.
+ if (zones_[0].BiasedOffset() == biased_offset) {
+ zones_[0].total_duration += duration;
+ zones_[0].num_calls += 1;
+ assert(zones_[0].BiasedOffset() == biased_offset);
+ return;
+ }
+
+ // Look for a zone with the same offset.
+ for (size_t i = 1; i < num_zones_; ++i) {
+ if (zones_[i].BiasedOffset() == biased_offset) {
+ zones_[i].total_duration += duration;
+ zones_[i].num_calls += 1;
+ assert(zones_[i].BiasedOffset() == biased_offset);
+ // Swap with predecessor (more conservative than move to front,
+ // but at least as successful).
+ const Accumulator prev = zones_[i - 1];
+ zones_[i - 1] = zones_[i];
+ zones_[i] = prev;
+ return;
+ }
+ }
+
+ // Not found; create a new Accumulator.
+ assert(num_zones_ < kMaxZones);
+ Accumulator* HH_RESTRICT zone = zones_ + num_zones_;
+ zone->num_calls = (biased_offset << Accumulator::kNumCallBits) + 1;
+ zone->total_duration = duration;
+ assert(zone->BiasedOffset() == biased_offset);
+ ++num_zones_;
+#endif
+ }
+
+ // Each instantiation of a function template seems to get its own copy of
+ // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
+ // acceptable because we only expect a few dozen zones.
+ void MergeDuplicates() {
+ const char* string_origin = StringOrigin();
+ for (size_t i = 0; i < num_zones_; ++i) {
+ const size_t biased_offset = zones_[i].BiasedOffset();
+ const char* name = string_origin + biased_offset;
+ // Separate num_calls from biased_offset so we can add them together.
+ uint64_t num_calls = zones_[i].NumCalls();
+
+ // Add any subsequent duplicates to num_calls and total_duration.
+ for (size_t j = i + 1; j < num_zones_;) {
+ if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) {
+ num_calls += zones_[j].NumCalls();
+ zones_[i].total_duration += zones_[j].total_duration;
+ // Fill hole with last item.
+ zones_[j] = zones_[--num_zones_];
+ } else { // Name differed, try next Accumulator.
+ ++j;
+ }
+ }
+
+ assert(num_calls < (1ULL << Accumulator::kNumCallBits));
+
+ // Re-pack regardless of whether any duplicates were found.
+ zones_[i].num_calls =
+ (biased_offset << Accumulator::kNumCallBits) + num_calls;
+ }
+ }
+
+ uint64_t analyze_elapsed_ = 0;
+ uint64_t self_overhead_ = 0;
+ uint64_t child_overhead_ = 0;
+
+ size_t depth_ = 0; // Number of active zones.
+ size_t num_zones_ = 0; // Number of retired zones.
+
+ HH_ALIGNAS(64) Node nodes_[kMaxDepth]; // Stack
+ HH_ALIGNAS(64) Accumulator zones_[kMaxZones]; // Self-organizing list
+};
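+
+// Editorial note on the masked subtraction in AnalyzePackets (values are
+// illustrative): truncated timestamps wrap modulo 2^kTimestampBits, and
+// masking the difference yields the correct duration across the wrap.
+//
+//   const uint64_t enter = Packet::kTimestampMask - 5;  // just before wrap
+//   const uint64_t exit = 10;                           // just after wrap
+//   const uint64_t duration = (exit - enter) & Packet::kTimestampMask;
+//   assert(duration == 16);  // 6 cycles up to the wrap, 10 after it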
+
+// Per-thread packet storage, allocated via CacheAligned.
+class ThreadSpecific {
+ static constexpr size_t kBufferCapacity =
+ CacheAligned::kCacheLineSize / sizeof(Packet);
+
+ public:
+ // "name" is used to sanity-check offsets fit in kOffsetBits.
+ explicit ThreadSpecific(const char* name)
+ : packets_(static_cast<Packet*>(
+ CacheAligned::Allocate(PROFILER_THREAD_STORAGE << 20))),
+ num_packets_(0),
+ max_packets_(PROFILER_THREAD_STORAGE << 17),
+ string_origin_(StringOrigin()) {
+ // Even in optimized builds (with NDEBUG), verify that this zone's name
+ // offset fits within the allotted space. If not, UpdateOrAdd is likely to
+ // overrun zones_[]. We also assert(), but users often do not run debug
+ // builds. Checking here on the cold path (only reached once per thread)
+ // is cheap, but it only covers one zone.
+ const size_t biased_offset = name - string_origin_;
+    PROFILER_CHECK(biased_offset < (1ULL << Packet::kOffsetBits));
+ }
+
+ ~ThreadSpecific() { CacheAligned::Free(packets_); }
+
+ // Depends on Zone => defined below.
+ void ComputeOverhead();
+
+ void WriteEntry(const char* name, const uint64_t timestamp) {
+ const size_t biased_offset = name - string_origin_;
+ Write(Packet::Make(biased_offset, timestamp));
+ }
+
+ void WriteExit(const uint64_t timestamp) {
+ const size_t biased_offset = Packet::kOffsetBias;
+ Write(Packet::Make(biased_offset, timestamp));
+ }
+
+ void AnalyzeRemainingPackets() {
+#if HH_ARCH_X64
+ // Ensures prior weakly-ordered streaming stores are globally visible.
+ _mm_sfence();
+
+ // Storage full => empty it.
+ if (num_packets_ + buffer_size_ > max_packets_) {
+ results_.AnalyzePackets(packets_, num_packets_);
+ num_packets_ = 0;
+ }
+ memcpy(packets_ + num_packets_, buffer_, buffer_size_ * sizeof(Packet));
+ num_packets_ += buffer_size_;
+#endif
+
+ results_.AnalyzePackets(packets_, num_packets_);
+ num_packets_ = 0;
+ }
+
+ Results& GetResults() { return results_; }
+
+ private:
+ // Write packet to buffer/storage, emptying them as needed.
+ void Write(const Packet packet) {
+#if HH_ARCH_X64
+ // Buffer full => copy to storage.
+ if (buffer_size_ == kBufferCapacity) {
+ // Storage full => empty it.
+ if (num_packets_ + kBufferCapacity > max_packets_) {
+ results_.AnalyzePackets(packets_, num_packets_);
+ num_packets_ = 0;
+ }
+ // This buffering halves observer overhead and decreases the overall
+ // runtime by about 3%.
+ CacheAligned::StreamCacheLine(buffer_, packets_ + num_packets_);
+ num_packets_ += kBufferCapacity;
+ buffer_size_ = 0;
+ }
+ buffer_[buffer_size_] = packet;
+ ++buffer_size_;
+#else
+ // Write directly to storage.
+ if (num_packets_ >= max_packets_) {
+ results_.AnalyzePackets(packets_, num_packets_);
+ num_packets_ = 0;
+ }
+ packets_[num_packets_] = packet;
+ ++num_packets_;
+#endif
+ }
+
+ // Write-combining buffer to avoid cache pollution. Must be the first
+ // non-static member to ensure cache-line alignment.
+#if HH_ARCH_X64
+ Packet buffer_[kBufferCapacity];
+ size_t buffer_size_ = 0;
+#endif
+
+ // Contiguous storage for zone enter/exit packets.
+ Packet* const HH_RESTRICT packets_;
+ size_t num_packets_;
+ const size_t max_packets_;
+ // Cached here because we already read this cache line on zone entry/exit.
+ const char* HH_RESTRICT string_origin_;
+ Results results_;
+};
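+
+// Editorial note on the sizing above: PROFILER_THREAD_STORAGE is in MiB, so
+// the allocation is (PROFILER_THREAD_STORAGE << 20) bytes; with 8-byte
+// packets that is (PROFILER_THREAD_STORAGE << 17) packets, which is what
+// max_packets_ stores. At the default of 200 MiB:
+//
+//   constexpr uint64_t kBytes = 200ULL << 20;       // 209,715,200 bytes
+//   constexpr uint64_t kPackets = kBytes / sizeof(Packet);
+//   static_assert(kPackets == (200ULL << 17), "");  // ~26.2M packets
+//
+// i.e. roughly 13 million zone entry/exit pairs before in-place analysis.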
+
+class ThreadList {
+ public:
+ // Thread-safe.
+ void Add(ThreadSpecific* const ts) {
+ const uint32_t index = num_threads_.fetch_add(1);
+ PROFILER_CHECK(index < kMaxThreads);
+ threads_[index] = ts;
+ }
+
+ // Single-threaded.
+ void PrintResults() {
+ const uint32_t num_threads = num_threads_.load();
+ for (uint32_t i = 0; i < num_threads; ++i) {
+ threads_[i]->AnalyzeRemainingPackets();
+ }
+
+ // Combine all threads into a single Result.
+ for (uint32_t i = 1; i < num_threads; ++i) {
+ threads_[0]->GetResults().Assimilate(threads_[i]->GetResults());
+ }
+
+ if (num_threads != 0) {
+ threads_[0]->GetResults().Print();
+ }
+ }
+
+ private:
+ // Owning pointers.
+ HH_ALIGNAS(64) ThreadSpecific* threads_[kMaxThreads];
+ std::atomic<uint32_t> num_threads_{0};
+};
+
+// RAII zone enter/exit recorder constructed by the PROFILER_ZONE macro; also
+// responsible for initializing ThreadSpecific.
+class Zone {
+ public:
+ // "name" must be a string literal (see StringOrigin).
+ HH_NOINLINE explicit Zone(const char* name) {
+ HH_COMPILER_FENCE;
+ ThreadSpecific* HH_RESTRICT thread_specific = StaticThreadSpecific();
+ if (HH_UNLIKELY(thread_specific == nullptr)) {
+ void* mem = CacheAligned::Allocate(sizeof(ThreadSpecific));
+ thread_specific = new (mem) ThreadSpecific(name);
+ // Must happen before ComputeOverhead, which re-enters this ctor.
+ Threads().Add(thread_specific);
+ StaticThreadSpecific() = thread_specific;
+ thread_specific->ComputeOverhead();
+ }
+
+ // (Capture timestamp ASAP, not inside WriteEntry.)
+ HH_COMPILER_FENCE;
+ const uint64_t timestamp = Start<uint64_t>();
+ thread_specific->WriteEntry(name, timestamp);
+ }
+
+ HH_NOINLINE ~Zone() {
+ HH_COMPILER_FENCE;
+ const uint64_t timestamp = Stop<uint64_t>();
+ StaticThreadSpecific()->WriteExit(timestamp);
+ HH_COMPILER_FENCE;
+ }
+
+ // Call exactly once after all threads have exited all zones.
+ static void PrintResults() { Threads().PrintResults(); }
+
+ private:
+ // Returns reference to the thread's ThreadSpecific pointer (initially null).
+ // Function-local static avoids needing a separate definition.
+ static ThreadSpecific*& StaticThreadSpecific() {
+ static thread_local ThreadSpecific* thread_specific;
+ return thread_specific;
+ }
+
+ // Returns the singleton ThreadList. Non time-critical.
+ static ThreadList& Threads() {
+ static ThreadList threads_;
+ return threads_;
+ }
+};
+
+// Creates a zone starting from here until the end of the current scope.
+// Timestamps will be recorded when entering and exiting the zone.
+// "name" must be a string literal, which is ensured by merging with "".
+#define PROFILER_ZONE(name) \
+ HH_COMPILER_FENCE; \
+ const Zone zone("" name); \
+ HH_COMPILER_FENCE
+
+// Creates a zone for an entire function (when placed at its beginning).
+// Shorter/more convenient than PROFILER_ZONE.
+#define PROFILER_FUNC \
+ HH_COMPILER_FENCE; \
+ const Zone zone(__func__); \
+ HH_COMPILER_FENCE
+
+#define PROFILER_PRINT_RESULTS Zone::PrintResults
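+
+// Editorial usage sketch (Compress and "Entropy" are hypothetical names):
+//
+//   void Compress() {
+//     PROFILER_FUNC;               // zone named "Compress"
+//     {
+//       PROFILER_ZONE("Entropy");  // nested zone; name must be a literal
+//       // ... code to measure ...
+//     }
+//   }
+//   // Once every thread has left its zones: PROFILER_PRINT_RESULTS();
+//
+// The "" name concatenation in PROFILER_ZONE rejects non-literal arguments
+// at compile time, which the StringOrigin() offset scheme requires.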
+
+inline void ThreadSpecific::ComputeOverhead() {
+ // Delay after capturing timestamps before/after the actual zone runs. Even
+ // with frequency throttling disabled, this has a multimodal distribution,
+ // including 32, 34, 48, 52, 59, 62.
+ uint64_t self_overhead;
+ {
+ const size_t kNumSamples = 32;
+ uint32_t samples[kNumSamples];
+ for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
+ const size_t kNumDurations = 1024;
+ uint32_t durations[kNumDurations];
+
+ for (size_t idx_duration = 0; idx_duration < kNumDurations;
+ ++idx_duration) {
+ { PROFILER_ZONE("Dummy Zone (never shown)"); }
+#if HH_ARCH_X64
+ const uint64_t duration = results_.ZoneDuration(buffer_);
+ buffer_size_ = 0;
+#else
+ const uint64_t duration = results_.ZoneDuration(packets_);
+ num_packets_ = 0;
+#endif
+ durations[idx_duration] = static_cast<uint32_t>(duration);
+ PROFILER_CHECK(num_packets_ == 0);
+ }
+ CountingSort(durations, durations + kNumDurations);
+ samples[idx_sample] = Mode(durations, kNumDurations);
+ }
+ // Median.
+ CountingSort(samples, samples + kNumSamples);
+ self_overhead = samples[kNumSamples / 2];
+ printf("Overhead: %zu\n", self_overhead);
+ results_.SetSelfOverhead(self_overhead);
+ }
+
+ // Delay before capturing start timestamp / after end timestamp.
+ const size_t kNumSamples = 32;
+ uint32_t samples[kNumSamples];
+ for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
+ const size_t kNumDurations = 16;
+ uint32_t durations[kNumDurations];
+ for (size_t idx_duration = 0; idx_duration < kNumDurations;
+ ++idx_duration) {
+ const size_t kReps = 10000;
+ // Analysis time should not be included => must fit within buffer.
+ PROFILER_CHECK(kReps * 2 < max_packets_);
+#if HH_ARCH_X64
+ _mm_mfence();
+#endif
+ const uint64_t t0 = Start<uint64_t>();
+ for (size_t i = 0; i < kReps; ++i) {
+ PROFILER_ZONE("Dummy");
+ }
+#if HH_ARCH_X64
+ _mm_sfence();
+#endif
+ const uint64_t t1 = Stop<uint64_t>();
+#if HH_ARCH_X64
+ PROFILER_CHECK(num_packets_ + buffer_size_ == kReps * 2);
+ buffer_size_ = 0;
+#else
+ PROFILER_CHECK(num_packets_ == kReps * 2);
+#endif
+ num_packets_ = 0;
+ const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
+ durations[idx_duration] =
+ static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
+ }
+ CountingSort(durations, durations + kNumDurations);
+ samples[idx_sample] = Mode(durations, kNumDurations);
+ }
+ CountingSort(samples, samples + kNumSamples);
+ const uint64_t child_overhead = samples[9 * kNumSamples / 10];
+ printf("Child overhead: %zu\n", child_overhead);
+ results_.SetChildOverhead(child_overhead);
+}
+
+} // namespace highwayhash
+
+#else // !PROFILER_ENABLED
+#define PROFILER_ZONE(name)
+#define PROFILER_FUNC
+#define PROFILER_PRINT_RESULTS()
+#endif
+
+#endif // HIGHWAYHASH_PROFILER_H_
diff --git a/contrib/libs/highwayhash/highwayhash/profiler_example.cc b/contrib/libs/highwayhash/highwayhash/profiler_example.cc
index 999cc4581f..9d97066ec9 100644
--- a/contrib/libs/highwayhash/highwayhash/profiler_example.cc
+++ b/contrib/libs/highwayhash/highwayhash/profiler_example.cc
@@ -1,97 +1,97 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cassert>
-#include <cmath>
-#include <cstdlib>
-
-#include "highwayhash/os_specific.h"
-#include "highwayhash/profiler.h"
-
-namespace highwayhash {
-namespace {
-
-void Spin(const double min_time) {
- const double t0 = Now();
- for (;;) {
- const double elapsed = Now() - t0;
- if (elapsed > min_time) {
- break;
- }
- }
-}
-
-void Spin10() {
- PROFILER_FUNC;
- Spin(10E-6);
-}
-
-void Spin20() {
- PROFILER_FUNC;
- Spin(20E-6);
-}
-
-void Spin3060() {
- {
- PROFILER_ZONE("spin30");
- Spin(30E-6);
- }
- {
- PROFILER_ZONE("spin60");
- Spin(60E-6);
- }
-}
-
-void Level3() {
- PROFILER_FUNC;
- for (int rep = 0; rep < 10; ++rep) {
- double total = 0.0;
- for (int i = 0; i < 100 - rep; ++i) {
- total += pow(0.9, i);
- }
- if (std::abs(total - 9.999) > 1E-2) {
- abort();
- }
- }
-}
-
-void Level2() {
- PROFILER_FUNC;
- Level3();
-}
-
-void Level1() {
- PROFILER_FUNC;
- Level2();
-}
-
-void ProfilerExample() {
- PinThreadToRandomCPU();
- {
- PROFILER_FUNC;
- Spin10();
- Spin20();
- Spin3060();
- Level1();
- }
- PROFILER_PRINT_RESULTS();
-}
-
-} // namespace
-} // namespace highwayhash
-
-int main(int argc, char* argv[]) {
- highwayhash::ProfilerExample();
- return 0;
-}
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+
+#include "highwayhash/os_specific.h"
+#include "highwayhash/profiler.h"
+
+namespace highwayhash {
+namespace {
+
+void Spin(const double min_time) {
+ const double t0 = Now();
+ for (;;) {
+ const double elapsed = Now() - t0;
+ if (elapsed > min_time) {
+ break;
+ }
+ }
+}
+
+void Spin10() {
+ PROFILER_FUNC;
+ Spin(10E-6);
+}
+
+void Spin20() {
+ PROFILER_FUNC;
+ Spin(20E-6);
+}
+
+void Spin3060() {
+ {
+ PROFILER_ZONE("spin30");
+ Spin(30E-6);
+ }
+ {
+ PROFILER_ZONE("spin60");
+ Spin(60E-6);
+ }
+}
+
+void Level3() {
+ PROFILER_FUNC;
+ for (int rep = 0; rep < 10; ++rep) {
+ double total = 0.0;
+ for (int i = 0; i < 100 - rep; ++i) {
+ total += pow(0.9, i);
+ }
+ if (std::abs(total - 9.999) > 1E-2) {
+ abort();
+ }
+ }
+}
+
+void Level2() {
+ PROFILER_FUNC;
+ Level3();
+}
+
+void Level1() {
+ PROFILER_FUNC;
+ Level2();
+}
+
+void ProfilerExample() {
+ PinThreadToRandomCPU();
+ {
+ PROFILER_FUNC;
+ Spin10();
+ Spin20();
+ Spin3060();
+ Level1();
+ }
+ PROFILER_PRINT_RESULTS();
+}
+
+} // namespace
+} // namespace highwayhash
+
+int main(int argc, char* argv[]) {
+ highwayhash::ProfilerExample();
+ return 0;
+}
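+
+// Illustrative output shape (editorial; the cycle counts are made up and
+// vary by machine): Results::Print emits one line per zone, sorted by
+// decreasing total self-cost, as "name: num_calls x avg = total", e.g.
+//
+//   spin60:  1 x  240000 =  240000
+//   spin30:  1 x  120000 =  120000
+//   ...
+//   Total clocks during analysis: 98765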
diff --git a/contrib/libs/highwayhash/highwayhash/robust_statistics.h b/contrib/libs/highwayhash/highwayhash/robust_statistics.h
index 4e45494f9b..9c4a0b4cd5 100644
--- a/contrib/libs/highwayhash/highwayhash/robust_statistics.h
+++ b/contrib/libs/highwayhash/highwayhash/robust_statistics.h
@@ -1,135 +1,135 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_ROBUST_STATISTICS_H_
-#define HIGHWAYHASH_ROBUST_STATISTICS_H_
-
-// Robust statistics: Mode, Median, MedianAbsoluteDeviation.
-
-#include <stddef.h>
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-#include <limits>
-#include <vector>
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-
-namespace highwayhash {
-
-// @return i in [idx_begin, idx_begin + half_count) that minimizes
-// sorted[i + half_count] - sorted[i].
-template <typename T>
-size_t MinRange(const T* const HH_RESTRICT sorted, const size_t idx_begin,
- const size_t half_count) {
- T min_range = std::numeric_limits<T>::max();
- size_t min_idx = 0;
-
- for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) {
- assert(sorted[idx] <= sorted[idx + half_count]);
- const T range = sorted[idx + half_count] - sorted[idx];
- if (range < min_range) {
- min_range = range;
- min_idx = idx;
- }
- }
-
- return min_idx;
-}
-
-// Returns an estimate of the mode by calling MinRange on successively
-// halved intervals. "sorted" must be in ascending order. This is the
-// Half Sample Mode estimator proposed by Bickel in "On a fast, robust
-// estimator of the mode", with complexity O(N log N). The mode is less
-// affected by outliers in highly-skewed distributions than the median.
-// The averaging operation below assumes "T" is an unsigned integer type.
-template <typename T>
-T Mode(const T* const HH_RESTRICT sorted, const size_t num_values) {
- size_t idx_begin = 0;
- size_t half_count = num_values / 2;
- while (half_count > 1) {
- idx_begin = MinRange(sorted, idx_begin, half_count);
- half_count >>= 1;
- }
-
- const T x = sorted[idx_begin + 0];
- if (half_count == 0) {
- return x;
- }
- assert(half_count == 1);
- const T average = (x + sorted[idx_begin + 1] + 1) / 2;
- return average;
-}
-
-// Sorts integral values in ascending order. About 3x faster than std::sort for
-// input distributions with very few unique values.
-template <class T>
-void CountingSort(T* begin, T* end) {
- // Unique values and their frequency (similar to flat_map).
- using Unique = std::pair<T, int>;
- std::vector<Unique> unique;
- for (const T* p = begin; p != end; ++p) {
- const T value = *p;
- const auto pos =
- std::find_if(unique.begin(), unique.end(),
- [value](const Unique& u) { return u.first == value; });
- if (pos == unique.end()) {
- unique.push_back(std::make_pair(*p, 1));
- } else {
- ++pos->second;
- }
- }
-
- // Sort in ascending order of value (pair.first).
- std::sort(unique.begin(), unique.end());
-
- // Write that many copies of each unique value to the array.
- T* HH_RESTRICT p = begin;
- for (const auto& value_count : unique) {
- std::fill(p, p + value_count.second, value_count.first);
- p += value_count.second;
- }
- assert(p == end);
-}
-
-// Returns the median value. Side effect: sorts "samples".
-template <typename T>
-T Median(std::vector<T>* samples) {
- assert(!samples->empty());
- std::sort(samples->begin(), samples->end());
- const size_t half = samples->size() / 2;
- // Odd count: return middle
- if (samples->size() % 2) {
- return (*samples)[half];
- }
- // Even count: return average of middle two.
- return ((*samples)[half] + (*samples)[half - 1]) / 2;
-}
-
-// Returns a robust measure of variability.
-template <typename T>
-T MedianAbsoluteDeviation(const std::vector<T>& samples, const T median) {
- assert(!samples.empty());
- std::vector<T> abs_deviations;
- abs_deviations.reserve(samples.size());
- for (const T sample : samples) {
- abs_deviations.push_back(std::abs(sample - median));
- }
- return Median(&abs_deviations);
-}
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_ROBUST_STATISTICS_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_ROBUST_STATISTICS_H_
+#define HIGHWAYHASH_ROBUST_STATISTICS_H_
+
+// Robust statistics: Mode, Median, MedianAbsoluteDeviation.
+
+#include <stddef.h>
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <limits>
+#include <vector>
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+
+namespace highwayhash {
+
+// @return i in [idx_begin, idx_begin + half_count) that minimizes
+// sorted[i + half_count] - sorted[i].
+template <typename T>
+size_t MinRange(const T* const HH_RESTRICT sorted, const size_t idx_begin,
+ const size_t half_count) {
+ T min_range = std::numeric_limits<T>::max();
+ size_t min_idx = 0;
+
+ for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) {
+ assert(sorted[idx] <= sorted[idx + half_count]);
+ const T range = sorted[idx + half_count] - sorted[idx];
+ if (range < min_range) {
+ min_range = range;
+ min_idx = idx;
+ }
+ }
+
+ return min_idx;
+}
+
+// Returns an estimate of the mode by calling MinRange on successively
+// halved intervals. "sorted" must be in ascending order. This is the
+// Half Sample Mode estimator proposed by Bickel in "On a fast, robust
+// estimator of the mode", with complexity O(N log N). The mode is less
+// affected by outliers in highly-skewed distributions than the median.
+// The averaging operation below assumes "T" is an unsigned integer type.
+template <typename T>
+T Mode(const T* const HH_RESTRICT sorted, const size_t num_values) {
+ size_t idx_begin = 0;
+ size_t half_count = num_values / 2;
+ while (half_count > 1) {
+ idx_begin = MinRange(sorted, idx_begin, half_count);
+ half_count >>= 1;
+ }
+
+ const T x = sorted[idx_begin + 0];
+ if (half_count == 0) {
+ return x;
+ }
+ assert(half_count == 1);
+ const T average = (x + sorted[idx_begin + 1] + 1) / 2;
+ return average;
+}
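+
+// Editorial worked example (illustrative values): for the sorted array
+// {1, 2, 2, 2, 9, 10}, half_count starts at 3 and the 3-wide window with the
+// smallest range is [0, 3) (range 2 - 1 = 1), so idx_begin stays 0;
+// half_count then drops to 1 and the result is the rounded average
+// (1 + 2 + 1) / 2 = 2, the densest value, despite the outliers 9 and 10.
+//
+//   const uint32_t sorted[6] = {1, 2, 2, 2, 9, 10};
+//   assert(Mode(sorted, 6) == 2);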
+
+// Sorts integral values in ascending order. About 3x faster than std::sort for
+// input distributions with very few unique values.
+template <class T>
+void CountingSort(T* begin, T* end) {
+ // Unique values and their frequency (similar to flat_map).
+ using Unique = std::pair<T, int>;
+ std::vector<Unique> unique;
+ for (const T* p = begin; p != end; ++p) {
+ const T value = *p;
+ const auto pos =
+ std::find_if(unique.begin(), unique.end(),
+ [value](const Unique& u) { return u.first == value; });
+ if (pos == unique.end()) {
+ unique.push_back(std::make_pair(*p, 1));
+ } else {
+ ++pos->second;
+ }
+ }
+
+ // Sort in ascending order of value (pair.first).
+ std::sort(unique.begin(), unique.end());
+
+ // Write that many copies of each unique value to the array.
+ T* HH_RESTRICT p = begin;
+ for (const auto& value_count : unique) {
+ std::fill(p, p + value_count.second, value_count.first);
+ p += value_count.second;
+ }
+ assert(p == end);
+}
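+
+// Usage sketch (illustrative): a drop-in replacement for std::sort when the
+// inputs cluster around a few distinct values, e.g. repeated timings:
+//
+//   uint64_t ticks[64] = {/* measurements */};
+//   CountingSort(ticks, ticks + 64);
+//   const uint64_t mode = Mode(ticks, 64);
+//
+// The linear scan of "unique" per element makes this O(U * N) for U unique
+// values, so prefer std::sort when U is large.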
+
+// Returns the median value. Side effect: sorts "samples".
+template <typename T>
+T Median(std::vector<T>* samples) {
+ assert(!samples->empty());
+ std::sort(samples->begin(), samples->end());
+ const size_t half = samples->size() / 2;
+  // Odd count: return the middle element.
+ if (samples->size() % 2) {
+ return (*samples)[half];
+ }
+ // Even count: return average of middle two.
+ return ((*samples)[half] + (*samples)[half - 1]) / 2;
+}
+
+// Returns a robust measure of variability.
+template <typename T>
+T MedianAbsoluteDeviation(const std::vector<T>& samples, const T median) {
+ assert(!samples.empty());
+ std::vector<T> abs_deviations;
+ abs_deviations.reserve(samples.size());
+ for (const T sample : samples) {
+ abs_deviations.push_back(std::abs(sample - median));
+ }
+ return Median(&abs_deviations);
+}
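+
+// Usage sketch (illustrative): a robust center/spread summary that, unlike
+// the mean and standard deviation, tolerates a few wild outliers:
+//
+//   std::vector<int64_t> samples = {210, 198, 202, 199, 5000, 201};
+//   const int64_t median = Median(&samples);  // sorts "samples"
+//   const int64_t mad = MedianAbsoluteDeviation(samples, median);
+//
+// yields median == 201 and mad == 2 despite the 5000 outlier.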
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_ROBUST_STATISTICS_H_
diff --git a/contrib/libs/highwayhash/highwayhash/scalar.h b/contrib/libs/highwayhash/highwayhash/scalar.h
index 72ccae727e..eb7bac9c1d 100644
--- a/contrib/libs/highwayhash/highwayhash/scalar.h
+++ b/contrib/libs/highwayhash/highwayhash/scalar.h
@@ -1,352 +1,352 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_SCALAR_H_
-#define HIGHWAYHASH_SCALAR_H_
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include <stddef.h> // size_t
-#include <stdint.h>
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-
-namespace highwayhash {
-// To prevent ODR violations when including this from multiple translation
-// units (TU) that are compiled with different flags, the contents must reside
-// in a namespace whose name is unique to the TU. NOTE: this behavior is
-// incompatible with precompiled modules and requires textual inclusion instead.
-namespace HH_TARGET_NAME {
-
-// Single-lane "vector" type with the same interface as V128/V256. Allows the
-// same client template to generate both SIMD and portable code.
-template <typename Type>
-class Scalar {
- public:
- struct Intrinsic {
- Type t;
- };
-
- using T = Type;
- static constexpr size_t N = 1;
-
- // Leaves v_ uninitialized - typically used for output parameters.
- HH_INLINE Scalar() {}
-
- HH_INLINE explicit Scalar(const T t) : v_(t) {}
-
- HH_INLINE Scalar(const Scalar<T>& other) : v_(other.v_) {}
-
- HH_INLINE Scalar& operator=(const Scalar<T>& other) {
- v_ = other.v_;
- return *this;
- }
-
- // Convert from/to intrinsics.
- HH_INLINE Scalar(const Intrinsic& v) : v_(v.t) {}
- HH_INLINE Scalar& operator=(const Intrinsic& v) {
- v_ = v.t;
- return *this;
- }
- HH_INLINE operator Intrinsic() const { return {v_}; }
-
- HH_INLINE Scalar operator==(const Scalar& other) const {
- Scalar eq;
- eq.FillWithByte(v_ == other.v_ ? 0xFF : 0x00);
- return eq;
- }
- HH_INLINE Scalar operator<(const Scalar& other) const {
- Scalar lt;
- lt.FillWithByte(v_ < other.v_ ? 0xFF : 0x00);
- return lt;
- }
- HH_INLINE Scalar operator>(const Scalar& other) const {
- Scalar gt;
- gt.FillWithByte(v_ > other.v_ ? 0xFF : 0x00);
- return gt;
- }
-
- HH_INLINE Scalar& operator*=(const Scalar& other) {
- v_ *= other.v_;
- return *this;
- }
- HH_INLINE Scalar& operator/=(const Scalar& other) {
- v_ /= other.v_;
- return *this;
- }
- HH_INLINE Scalar& operator+=(const Scalar& other) {
- v_ += other.v_;
- return *this;
- }
- HH_INLINE Scalar& operator-=(const Scalar& other) {
- v_ -= other.v_;
- return *this;
- }
-
- HH_INLINE Scalar& operator&=(const Scalar& other) {
- v_ &= other.v_;
- return *this;
- }
- HH_INLINE Scalar& operator|=(const Scalar& other) {
- v_ |= other.v_;
- return *this;
- }
- HH_INLINE Scalar& operator^=(const Scalar& other) {
- v_ ^= other.v_;
- return *this;
- }
-
- HH_INLINE Scalar& operator<<=(const int count) {
-    // In C++, shifting by >= the bit width (e.g. int64_t << 64) is undefined,
-    // but we want to match the sensible behavior of SSE2 (zeroing).
- if (count >= sizeof(T) * 8) {
- v_ = 0;
- } else {
- v_ <<= count;
- }
- return *this;
- }
-
- HH_INLINE Scalar& operator>>=(const int count) {
- if (count >= sizeof(T) * 8) {
- v_ = 0;
- } else {
- v_ >>= count;
- }
- return *this;
- }
-
- // For internal use only. We need to avoid memcpy/memset because this is a
- // restricted header.
- void FillWithByte(const unsigned char value) {
- unsigned char* bytes = reinterpret_cast<unsigned char*>(&v_);
- for (size_t i = 0; i < sizeof(T); ++i) {
- bytes[i] = value;
- }
- }
-
- void CopyTo(unsigned char* HH_RESTRICT to_bytes) const {
- const unsigned char* from_bytes =
- reinterpret_cast<const unsigned char*>(&v_);
- for (size_t i = 0; i < sizeof(T); ++i) {
- to_bytes[i] = from_bytes[i];
- }
- }
-
- private:
- T v_;
-};
-
-// Non-member operators.
-
-template <typename T>
-HH_INLINE Scalar<T> operator*(const Scalar<T>& left, const Scalar<T>& right) {
- Scalar<T> t(left);
- return t *= right;
-}
-
-template <typename T>
-HH_INLINE Scalar<T> operator/(const Scalar<T>& left, const Scalar<T>& right) {
- Scalar<T> t(left);
- return t /= right;
-}
-
-template <typename T>
-HH_INLINE Scalar<T> operator+(const Scalar<T>& left, const Scalar<T>& right) {
- Scalar<T> t(left);
- return t += right;
-}
-
-template <typename T>
-HH_INLINE Scalar<T> operator-(const Scalar<T>& left, const Scalar<T>& right) {
- Scalar<T> t(left);
- return t -= right;
-}
-
-template <typename T>
-HH_INLINE Scalar<T> operator&(const Scalar<T>& left, const Scalar<T>& right) {
- Scalar<T> t(left);
- return t &= right;
-}
-
-template <typename T>
-HH_INLINE Scalar<T> operator|(const Scalar<T>& left, const Scalar<T>& right) {
- Scalar<T> t(left);
- return t |= right;
-}
-
-template <typename T>
-HH_INLINE Scalar<T> operator^(const Scalar<T>& left, const Scalar<T>& right) {
- Scalar<T> t(left);
- return t ^= right;
-}
-
-template <typename T>
-HH_INLINE Scalar<T> operator<<(const Scalar<T>& v, const int count) {
- Scalar<T> t(v);
- return t <<= count;
-}
-
-template <typename T>
-HH_INLINE Scalar<T> operator>>(const Scalar<T>& v, const int count) {
- Scalar<T> t(v);
- return t >>= count;
-}
-
-using V1x8U = Scalar<uint8_t>;
-using V1x16U = Scalar<uint16_t>;
-using V1x16I = Scalar<int16_t>;
-using V1x32U = Scalar<uint32_t>;
-using V1x32I = Scalar<int32_t>;
-using V1x64U = Scalar<uint64_t>;
-using V1x32F = Scalar<float>;
-using V1x64F = Scalar<double>;
-
-// Load/Store.
-
-// We differentiate between targets' vector types via template specialization.
-// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may
-// generate better code in unoptimized builds. Only declare the primary
-// templates to avoid needing mutual exclusion with vector128/256.
-template <class V>
-HH_INLINE V Load(const typename V::T* const HH_RESTRICT from);
-template <class V>
-HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from);
-
-template <>
-HH_INLINE V1x8U Load<V1x8U>(const V1x8U::T* const HH_RESTRICT from) {
- return V1x8U(*from);
-}
-template <>
-HH_INLINE V1x16U Load<V1x16U>(const V1x16U::T* const HH_RESTRICT from) {
- return V1x16U(*from);
-}
-template <>
-HH_INLINE V1x16I Load<V1x16I>(const V1x16I::T* const HH_RESTRICT from) {
- return V1x16I(*from);
-}
-template <>
-HH_INLINE V1x32U Load<V1x32U>(const V1x32U::T* const HH_RESTRICT from) {
- return V1x32U(*from);
-}
-template <>
-HH_INLINE V1x32I Load<V1x32I>(const V1x32I::T* const HH_RESTRICT from) {
- return V1x32I(*from);
-}
-template <>
-HH_INLINE V1x64U Load<V1x64U>(const V1x64U::T* const HH_RESTRICT from) {
- return V1x64U(*from);
-}
-template <>
-HH_INLINE V1x32F Load<V1x32F>(const V1x32F::T* const HH_RESTRICT from) {
- return V1x32F(*from);
-}
-template <>
-HH_INLINE V1x64F Load<V1x64F>(const V1x64F::T* const HH_RESTRICT from) {
- return V1x64F(*from);
-}
-
-template <>
-HH_INLINE V1x8U LoadUnaligned<V1x8U>(const V1x8U::T* const HH_RESTRICT from) {
- return V1x8U(*from);
-}
-template <>
-HH_INLINE V1x16U
-LoadUnaligned<V1x16U>(const V1x16U::T* const HH_RESTRICT from) {
- return V1x16U(*from);
-}
-template <>
-HH_INLINE V1x16I
-LoadUnaligned<V1x16I>(const V1x16I::T* const HH_RESTRICT from) {
- return V1x16I(*from);
-}
-template <>
-HH_INLINE V1x32U
-LoadUnaligned<V1x32U>(const V1x32U::T* const HH_RESTRICT from) {
- return V1x32U(*from);
-}
-template <>
-HH_INLINE V1x32I
-LoadUnaligned<V1x32I>(const V1x32I::T* const HH_RESTRICT from) {
- return V1x32I(*from);
-}
-template <>
-HH_INLINE V1x64U
-LoadUnaligned<V1x64U>(const V1x64U::T* const HH_RESTRICT from) {
- return V1x64U(*from);
-}
-template <>
-HH_INLINE V1x32F
-LoadUnaligned<V1x32F>(const V1x32F::T* const HH_RESTRICT from) {
- return V1x32F(*from);
-}
-template <>
-HH_INLINE V1x64F
-LoadUnaligned<V1x64F>(const V1x64F::T* const HH_RESTRICT from) {
- return V1x64F(*from);
-}
-
-template <typename T>
-HH_INLINE void Store(const Scalar<T>& v, T* const HH_RESTRICT to) {
- v.CopyTo(reinterpret_cast<unsigned char*>(to));
-}
-
-template <typename T>
-HH_INLINE void StoreUnaligned(const Scalar<T>& v, T* const HH_RESTRICT to) {
- v.CopyTo(reinterpret_cast<unsigned char*>(to));
-}
-
-template <typename T>
-HH_INLINE void Stream(const Scalar<T>& v, T* const HH_RESTRICT to) {
- v.CopyTo(reinterpret_cast<unsigned char*>(to));
-}
-
-// Miscellaneous functions.
-
-template <typename T>
-HH_INLINE Scalar<T> RotateLeft(const Scalar<T>& v, const int count) {
- constexpr size_t num_bits = sizeof(T) * 8;
- return (v << count) | (v >> (num_bits - count));
-}
-
-template <typename T>
-HH_INLINE Scalar<T> AndNot(const Scalar<T>& neg_mask, const Scalar<T>& values) {
- return values & ~neg_mask;
-}
-
-template <typename T>
-HH_INLINE Scalar<T> Select(const Scalar<T>& a, const Scalar<T>& b,
- const Scalar<T>& mask) {
- const char* mask_bytes = reinterpret_cast<const char*>(&mask);
- return (mask_bytes[sizeof(T) - 1] & 0x80) ? b : a;
-}
-
-template <typename T>
-HH_INLINE Scalar<T> Min(const Scalar<T>& v0, const Scalar<T>& v1) {
-  // operator< returns a mask rather than a bool, so select via the mask.
-  return Select(v0, v1, v1 < v0);
-}
-
-template <typename T>
-HH_INLINE Scalar<T> Max(const Scalar<T>& v0, const Scalar<T>& v1) {
-  return Select(v0, v1, v0 < v1);
-}
-
-} // namespace HH_TARGET_NAME
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_SCALAR_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_SCALAR_H_
+#define HIGHWAYHASH_SCALAR_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stddef.h> // size_t
+#include <stdint.h>
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+
+namespace highwayhash {
+// To prevent ODR violations when including this from multiple translation
+// units (TU) that are compiled with different flags, the contents must reside
+// in a namespace whose name is unique to the TU. NOTE: this behavior is
+// incompatible with precompiled modules and requires textual inclusion instead.
+namespace HH_TARGET_NAME {
+
+// Single-lane "vector" type with the same interface as V128/V256. Allows the
+// same client template to generate both SIMD and portable code.
+template <typename Type>
+class Scalar {
+ public:
+ struct Intrinsic {
+ Type t;
+ };
+
+ using T = Type;
+ static constexpr size_t N = 1;
+
+ // Leaves v_ uninitialized - typically used for output parameters.
+ HH_INLINE Scalar() {}
+
+ HH_INLINE explicit Scalar(const T t) : v_(t) {}
+
+ HH_INLINE Scalar(const Scalar<T>& other) : v_(other.v_) {}
+
+ HH_INLINE Scalar& operator=(const Scalar<T>& other) {
+ v_ = other.v_;
+ return *this;
+ }
+
+ // Convert from/to intrinsics.
+ HH_INLINE Scalar(const Intrinsic& v) : v_(v.t) {}
+ HH_INLINE Scalar& operator=(const Intrinsic& v) {
+ v_ = v.t;
+ return *this;
+ }
+ HH_INLINE operator Intrinsic() const { return {v_}; }
+
+ HH_INLINE Scalar operator==(const Scalar& other) const {
+ Scalar eq;
+ eq.FillWithByte(v_ == other.v_ ? 0xFF : 0x00);
+ return eq;
+ }
+ HH_INLINE Scalar operator<(const Scalar& other) const {
+ Scalar lt;
+ lt.FillWithByte(v_ < other.v_ ? 0xFF : 0x00);
+ return lt;
+ }
+ HH_INLINE Scalar operator>(const Scalar& other) const {
+ Scalar gt;
+ gt.FillWithByte(v_ > other.v_ ? 0xFF : 0x00);
+ return gt;
+ }
+
+ HH_INLINE Scalar& operator*=(const Scalar& other) {
+ v_ *= other.v_;
+ return *this;
+ }
+ HH_INLINE Scalar& operator/=(const Scalar& other) {
+ v_ /= other.v_;
+ return *this;
+ }
+ HH_INLINE Scalar& operator+=(const Scalar& other) {
+ v_ += other.v_;
+ return *this;
+ }
+ HH_INLINE Scalar& operator-=(const Scalar& other) {
+ v_ -= other.v_;
+ return *this;
+ }
+
+ HH_INLINE Scalar& operator&=(const Scalar& other) {
+ v_ &= other.v_;
+ return *this;
+ }
+ HH_INLINE Scalar& operator|=(const Scalar& other) {
+ v_ |= other.v_;
+ return *this;
+ }
+ HH_INLINE Scalar& operator^=(const Scalar& other) {
+ v_ ^= other.v_;
+ return *this;
+ }
+
+ HH_INLINE Scalar& operator<<=(const int count) {
+    // In C++, shifting by >= the bit width (e.g. int64_t << 64) is undefined,
+    // but we want to match the sensible behavior of SSE2 (zeroing).
+ if (count >= sizeof(T) * 8) {
+ v_ = 0;
+ } else {
+ v_ <<= count;
+ }
+ return *this;
+ }
+
+ HH_INLINE Scalar& operator>>=(const int count) {
+ if (count >= sizeof(T) * 8) {
+ v_ = 0;
+ } else {
+ v_ >>= count;
+ }
+ return *this;
+ }
+
+ // For internal use only. We need to avoid memcpy/memset because this is a
+ // restricted header.
+ void FillWithByte(const unsigned char value) {
+ unsigned char* bytes = reinterpret_cast<unsigned char*>(&v_);
+ for (size_t i = 0; i < sizeof(T); ++i) {
+ bytes[i] = value;
+ }
+ }
+
+ void CopyTo(unsigned char* HH_RESTRICT to_bytes) const {
+ const unsigned char* from_bytes =
+ reinterpret_cast<const unsigned char*>(&v_);
+ for (size_t i = 0; i < sizeof(T); ++i) {
+ to_bytes[i] = from_bytes[i];
+ }
+ }
+
+ private:
+ T v_;
+};
+
+// Non-member operators.
+
+template <typename T>
+HH_INLINE Scalar<T> operator*(const Scalar<T>& left, const Scalar<T>& right) {
+ Scalar<T> t(left);
+ return t *= right;
+}
+
+template <typename T>
+HH_INLINE Scalar<T> operator/(const Scalar<T>& left, const Scalar<T>& right) {
+ Scalar<T> t(left);
+ return t /= right;
+}
+
+template <typename T>
+HH_INLINE Scalar<T> operator+(const Scalar<T>& left, const Scalar<T>& right) {
+ Scalar<T> t(left);
+ return t += right;
+}
+
+template <typename T>
+HH_INLINE Scalar<T> operator-(const Scalar<T>& left, const Scalar<T>& right) {
+ Scalar<T> t(left);
+ return t -= right;
+}
+
+template <typename T>
+HH_INLINE Scalar<T> operator&(const Scalar<T>& left, const Scalar<T>& right) {
+ Scalar<T> t(left);
+ return t &= right;
+}
+
+template <typename T>
+HH_INLINE Scalar<T> operator|(const Scalar<T>& left, const Scalar<T>& right) {
+ Scalar<T> t(left);
+ return t |= right;
+}
+
+template <typename T>
+HH_INLINE Scalar<T> operator^(const Scalar<T>& left, const Scalar<T>& right) {
+ Scalar<T> t(left);
+ return t ^= right;
+}
+
+template <typename T>
+HH_INLINE Scalar<T> operator<<(const Scalar<T>& v, const int count) {
+ Scalar<T> t(v);
+ return t <<= count;
+}
+
+template <typename T>
+HH_INLINE Scalar<T> operator>>(const Scalar<T>& v, const int count) {
+ Scalar<T> t(v);
+ return t >>= count;
+}
+
+using V1x8U = Scalar<uint8_t>;
+using V1x16U = Scalar<uint16_t>;
+using V1x16I = Scalar<int16_t>;
+using V1x32U = Scalar<uint32_t>;
+using V1x32I = Scalar<int32_t>;
+using V1x64U = Scalar<uint64_t>;
+using V1x32F = Scalar<float>;
+using V1x64F = Scalar<double>;
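+
+// Example (illustrative): the single-lane wrapper lets lane-generic code,
+// such as an ARX mixing step, run unchanged in portable builds:
+//
+//   V1x64U x(0x0123456789ABCDEFull), y(1);
+//   x += y;
+//   x = RotateLeft(x, 13);  // defined below
+//   uint64_t out;
+//   Store(x, &out);         // defined below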
+
+// Load/Store.
+
+// We differentiate between targets' vector types via template specialization.
+// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may
+// generate better code in unoptimized builds. Only declare the primary
+// templates to avoid needing mutual exclusion with vector128/256.
+template <class V>
+HH_INLINE V Load(const typename V::T* const HH_RESTRICT from);
+template <class V>
+HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from);
+
+template <>
+HH_INLINE V1x8U Load<V1x8U>(const V1x8U::T* const HH_RESTRICT from) {
+ return V1x8U(*from);
+}
+template <>
+HH_INLINE V1x16U Load<V1x16U>(const V1x16U::T* const HH_RESTRICT from) {
+ return V1x16U(*from);
+}
+template <>
+HH_INLINE V1x16I Load<V1x16I>(const V1x16I::T* const HH_RESTRICT from) {
+ return V1x16I(*from);
+}
+template <>
+HH_INLINE V1x32U Load<V1x32U>(const V1x32U::T* const HH_RESTRICT from) {
+ return V1x32U(*from);
+}
+template <>
+HH_INLINE V1x32I Load<V1x32I>(const V1x32I::T* const HH_RESTRICT from) {
+ return V1x32I(*from);
+}
+template <>
+HH_INLINE V1x64U Load<V1x64U>(const V1x64U::T* const HH_RESTRICT from) {
+ return V1x64U(*from);
+}
+template <>
+HH_INLINE V1x32F Load<V1x32F>(const V1x32F::T* const HH_RESTRICT from) {
+ return V1x32F(*from);
+}
+template <>
+HH_INLINE V1x64F Load<V1x64F>(const V1x64F::T* const HH_RESTRICT from) {
+ return V1x64F(*from);
+}
+
+template <>
+HH_INLINE V1x8U LoadUnaligned<V1x8U>(const V1x8U::T* const HH_RESTRICT from) {
+ return V1x8U(*from);
+}
+template <>
+HH_INLINE V1x16U
+LoadUnaligned<V1x16U>(const V1x16U::T* const HH_RESTRICT from) {
+ return V1x16U(*from);
+}
+template <>
+HH_INLINE V1x16I
+LoadUnaligned<V1x16I>(const V1x16I::T* const HH_RESTRICT from) {
+ return V1x16I(*from);
+}
+template <>
+HH_INLINE V1x32U
+LoadUnaligned<V1x32U>(const V1x32U::T* const HH_RESTRICT from) {
+ return V1x32U(*from);
+}
+template <>
+HH_INLINE V1x32I
+LoadUnaligned<V1x32I>(const V1x32I::T* const HH_RESTRICT from) {
+ return V1x32I(*from);
+}
+template <>
+HH_INLINE V1x64U
+LoadUnaligned<V1x64U>(const V1x64U::T* const HH_RESTRICT from) {
+ return V1x64U(*from);
+}
+template <>
+HH_INLINE V1x32F
+LoadUnaligned<V1x32F>(const V1x32F::T* const HH_RESTRICT from) {
+ return V1x32F(*from);
+}
+template <>
+HH_INLINE V1x64F
+LoadUnaligned<V1x64F>(const V1x64F::T* const HH_RESTRICT from) {
+ return V1x64F(*from);
+}
+
+template <typename T>
+HH_INLINE void Store(const Scalar<T>& v, T* const HH_RESTRICT to) {
+ v.CopyTo(reinterpret_cast<unsigned char*>(to));
+}
+
+template <typename T>
+HH_INLINE void StoreUnaligned(const Scalar<T>& v, T* const HH_RESTRICT to) {
+ v.CopyTo(reinterpret_cast<unsigned char*>(to));
+}
+
+template <typename T>
+HH_INLINE void Stream(const Scalar<T>& v, T* const HH_RESTRICT to) {
+ v.CopyTo(reinterpret_cast<unsigned char*>(to));
+}
+
+// Miscellaneous functions.
+
+template <typename T>
+HH_INLINE Scalar<T> RotateLeft(const Scalar<T>& v, const int count) {
+ constexpr size_t num_bits = sizeof(T) * 8;
+ return (v << count) | (v >> (num_bits - count));
+}
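+// Note: RotateLeft(v, 0) is well-defined here, unlike a raw C++ shift by the
+// full bit width, because Scalar::operator>>= zeroes the value once the count
+// reaches sizeof(T) * 8, so (v << 0) | (v >> num_bits) yields v unchanged.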
+
+template <typename T>
+HH_INLINE Scalar<T> AndNot(const Scalar<T>& neg_mask, const Scalar<T>& values) {
+ return values & ~neg_mask;
+}
+
+template <typename T>
+HH_INLINE Scalar<T> Select(const Scalar<T>& a, const Scalar<T>& b,
+ const Scalar<T>& mask) {
+ const char* mask_bytes = reinterpret_cast<const char*>(&mask);
+ return (mask_bytes[sizeof(T) - 1] & 0x80) ? b : a;
+}
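+// Only the most-significant bit of "mask" matters (as in SSE4.1 blendv);
+// byte sizeof(T)-1 holds that bit on little-endian hosts, and the masks
+// produced by the comparison operators above set every byte, so they work
+// regardless of endianness. Min/Max below are built on Select for the same
+// reason: operator< yields a mask, not a bool.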
+
+template <typename T>
+HH_INLINE Scalar<T> Min(const Scalar<T>& v0, const Scalar<T>& v1) {
+  // operator< returns a mask rather than a bool, so select via the mask.
+  return Select(v0, v1, v1 < v0);
+}
+
+template <typename T>
+HH_INLINE Scalar<T> Max(const Scalar<T>& v0, const Scalar<T>& v1) {
+  return Select(v0, v1, v0 < v1);
+}
+
+} // namespace HH_TARGET_NAME
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_SCALAR_H_
diff --git a/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.cc b/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.cc
index 9ddeca64e6..136f2769a1 100644
--- a/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.cc
+++ b/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.cc
@@ -1,183 +1,183 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "highwayhash/scalar_sip_tree_hash.h"
-
-#include <cstddef>
-#include <cstring> // memcpy
-
-#include "highwayhash/compiler_specific.h"
-#include "highwayhash/sip_hash.h"
-
-namespace highwayhash {
-namespace {
-
-// Paper: https://www.131002.net/siphash/siphash.pdf
-// SSE41 implementation: https://goo.gl/80GBSD
-// Tree hash extension: http://dx.doi.org/10.4236/jis.2014.53010
-
-// The hash state is updated by injecting 4x8-byte packets;
-// XORing together all state vectors yields 32 bytes that are
-// reduced to 64 bits via 8-byte SipHash.
-
-const int kNumLanes = 4;
-using Lanes = HH_U64[kNumLanes];
-const int kPacketSize = sizeof(Lanes);
-
-template <int kUpdateRounds, int kFinalizeRounds>
-class ScalarSipTreeHashState {
- public:
- HH_INLINE ScalarSipTreeHashState(const Lanes& keys, const int lane) {
- const HH_U64 key = keys[lane] ^ (kNumLanes | lane);
- v0 = 0x736f6d6570736575ull ^ key;
- v1 = 0x646f72616e646f6dull ^ key;
- v2 = 0x6c7967656e657261ull ^ key;
- v3 = 0x7465646279746573ull ^ key;
- }
-
- HH_INLINE void Update(const HH_U64& packet) {
- v3 ^= packet;
-
- Compress<kUpdateRounds>();
-
- v0 ^= packet;
- }
-
- HH_INLINE HH_U64 Finalize() {
- // Mix in bits to avoid leaking the key if all packets were zero.
- v2 ^= 0xFF;
-
- Compress<kFinalizeRounds>();
-
- return (v0 ^ v1) ^ (v2 ^ v3);
- }
-
- private:
- // Rotate a 64-bit value "v" left by N bits.
- template <HH_U64 bits>
- static HH_INLINE HH_U64 RotateLeft(const HH_U64 v) {
- const HH_U64 left = v << bits;
- const HH_U64 right = v >> (64 - bits);
- return left | right;
- }
-
- template <int kRounds>
- HH_INLINE void Compress() {
- for (int i = 0; i < kRounds; ++i) {
- // ARX network: add, rotate, exclusive-or.
- v0 += v1;
- v2 += v3;
- v1 = RotateLeft<13>(v1);
- v3 = RotateLeft<16>(v3);
- v1 ^= v0;
- v3 ^= v2;
-
- v0 = RotateLeft<32>(v0);
-
- v2 += v1;
- v0 += v3;
- v1 = RotateLeft<17>(v1);
- v3 = RotateLeft<21>(v3);
- v1 ^= v2;
- v3 ^= v0;
-
- v2 = RotateLeft<32>(v2);
- }
- }
-
- HH_U64 v0;
- HH_U64 v1;
- HH_U64 v2;
- HH_U64 v3;
-};
-
-} // namespace
-
-template <size_t kUpdateRounds, size_t kFinalizeRounds>
-HH_U64 ScalarSipTreeHashT(const Lanes& key, const char* bytes,
- const HH_U64 size) {
- // "j-lanes" tree hashing interleaves 8-byte input packets.
- using State = ScalarSipTreeHashState<kUpdateRounds, kFinalizeRounds>;
- State state[kNumLanes] = {State(key, 0), State(key, 1), State(key, 2),
- State(key, 3)};
-
- // Hash entire 32-byte packets.
- const size_t remainder = size & (kPacketSize - 1);
- const size_t truncated_size = size - remainder;
- const HH_U64* packets = reinterpret_cast<const HH_U64*>(bytes);
- for (size_t i = 0; i < truncated_size / kPacketSize; ++i) {
- for (int lane = 0; lane < kNumLanes; ++lane) {
- const HH_U64 packet = *packets++;
- state[lane].Update(packet);
- }
- }
-
- // Update with final 32-byte packet.
- const size_t remainder_mod4 = remainder & 3;
- uint32_t packet4 = static_cast<uint32_t>(remainder << 24);
- const char* final_bytes = bytes + size - remainder_mod4;
- for (size_t i = 0; i < remainder_mod4; ++i) {
- const uint32_t byte = static_cast<unsigned char>(final_bytes[i]);
- packet4 += byte << (i * 8);
- }
-
- char final_packet[kPacketSize] = {0};
- memcpy(final_packet, bytes + truncated_size, remainder - remainder_mod4);
- memcpy(final_packet + kPacketSize - 4, &packet4, sizeof(packet4));
- packets = reinterpret_cast<const HH_U64*>(final_packet);
- for (int lane = 0; lane < kNumLanes; ++lane) {
- state[lane].Update(packets[lane]);
- }
-
- // Store the resulting hashes.
- uint64_t hashes[4];
- for (int lane = 0; lane < kNumLanes; ++lane) {
- hashes[lane] = state[lane].Finalize();
- }
-
- typename SipHashStateT<kUpdateRounds, kFinalizeRounds>::Key reduce_key;
- memcpy(&reduce_key, &key, sizeof(reduce_key));
- return ReduceSipTreeHash<kNumLanes, kUpdateRounds, kFinalizeRounds>(
- reduce_key, hashes);
-}
-
-HH_U64 ScalarSipTreeHash(const Lanes& key, const char* bytes,
- const HH_U64 size) {
- return ScalarSipTreeHashT<2, 4>(key, bytes, size);
-}
-
-HH_U64 ScalarSipTreeHash13(const Lanes& key, const char* bytes,
- const HH_U64 size) {
- return ScalarSipTreeHashT<1, 3>(key, bytes, size);
-}
-} // namespace highwayhash
-
-using highwayhash::HH_U64;
-using highwayhash::ScalarSipTreeHash;
-using highwayhash::ScalarSipTreeHash13;
-using Key = HH_U64[4];
-
-extern "C" {
-
-HH_U64 ScalarSipTreeHashC(const HH_U64* key, const char* bytes,
- const HH_U64 size) {
- return ScalarSipTreeHash(*reinterpret_cast<const Key*>(key), bytes, size);
-}
-
-HH_U64 ScalarSipTreeHash13C(const HH_U64* key, const char* bytes,
- const HH_U64 size) {
- return ScalarSipTreeHash13(*reinterpret_cast<const Key*>(key), bytes, size);
-}
-
-} // extern "C"
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "highwayhash/scalar_sip_tree_hash.h"
+
+#include <cstddef>
+#include <cstring> // memcpy
+
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/sip_hash.h"
+
+namespace highwayhash {
+namespace {
+
+// Paper: https://www.131002.net/siphash/siphash.pdf
+// SSE41 implementation: https://goo.gl/80GBSD
+// Tree hash extension: http://dx.doi.org/10.4236/jis.2014.53010
+
+// The hash state is updated by injecting 4x8-byte packets;
+// XORing together all state vectors yields 32 bytes that are
+// reduced to 64 bits via 8-byte SipHash.
+
+const int kNumLanes = 4;
+using Lanes = HH_U64[kNumLanes];
+const int kPacketSize = sizeof(Lanes);
+
+template <int kUpdateRounds, int kFinalizeRounds>
+class ScalarSipTreeHashState {
+ public:
+ HH_INLINE ScalarSipTreeHashState(const Lanes& keys, const int lane) {
+ const HH_U64 key = keys[lane] ^ (kNumLanes | lane);
+ v0 = 0x736f6d6570736575ull ^ key;
+ v1 = 0x646f72616e646f6dull ^ key;
+ v2 = 0x6c7967656e657261ull ^ key;
+ v3 = 0x7465646279746573ull ^ key;
+ }
+
+ HH_INLINE void Update(const HH_U64& packet) {
+ v3 ^= packet;
+
+ Compress<kUpdateRounds>();
+
+ v0 ^= packet;
+ }
+
+ HH_INLINE HH_U64 Finalize() {
+ // Mix in bits to avoid leaking the key if all packets were zero.
+ v2 ^= 0xFF;
+
+ Compress<kFinalizeRounds>();
+
+ return (v0 ^ v1) ^ (v2 ^ v3);
+ }
+
+ private:
+ // Rotate a 64-bit value "v" left by N bits.
+ template <HH_U64 bits>
+ static HH_INLINE HH_U64 RotateLeft(const HH_U64 v) {
+ const HH_U64 left = v << bits;
+ const HH_U64 right = v >> (64 - bits);
+ return left | right;
+ }
+
+ template <int kRounds>
+ HH_INLINE void Compress() {
+ for (int i = 0; i < kRounds; ++i) {
+ // ARX network: add, rotate, exclusive-or.
+ v0 += v1;
+ v2 += v3;
+ v1 = RotateLeft<13>(v1);
+ v3 = RotateLeft<16>(v3);
+ v1 ^= v0;
+ v3 ^= v2;
+
+ v0 = RotateLeft<32>(v0);
+
+ v2 += v1;
+ v0 += v3;
+ v1 = RotateLeft<17>(v1);
+ v3 = RotateLeft<21>(v3);
+ v1 ^= v2;
+ v3 ^= v0;
+
+ v2 = RotateLeft<32>(v2);
+ }
+ }
+
+ HH_U64 v0;
+ HH_U64 v1;
+ HH_U64 v2;
+ HH_U64 v3;
+};
+
+} // namespace
+
+template <size_t kUpdateRounds, size_t kFinalizeRounds>
+HH_U64 ScalarSipTreeHashT(const Lanes& key, const char* bytes,
+ const HH_U64 size) {
+ // "j-lanes" tree hashing interleaves 8-byte input packets.
+ using State = ScalarSipTreeHashState<kUpdateRounds, kFinalizeRounds>;
+ State state[kNumLanes] = {State(key, 0), State(key, 1), State(key, 2),
+ State(key, 3)};
+
+ // Hash entire 32-byte packets.
+ const size_t remainder = size & (kPacketSize - 1);
+ const size_t truncated_size = size - remainder;
+ const HH_U64* packets = reinterpret_cast<const HH_U64*>(bytes);
+ for (size_t i = 0; i < truncated_size / kPacketSize; ++i) {
+ for (int lane = 0; lane < kNumLanes; ++lane) {
+ const HH_U64 packet = *packets++;
+ state[lane].Update(packet);
+ }
+ }
+
+ // Update with final 32-byte packet.
+ const size_t remainder_mod4 = remainder & 3;
+ uint32_t packet4 = static_cast<uint32_t>(remainder << 24);
+ const char* final_bytes = bytes + size - remainder_mod4;
+ for (size_t i = 0; i < remainder_mod4; ++i) {
+ const uint32_t byte = static_cast<unsigned char>(final_bytes[i]);
+ packet4 += byte << (i * 8);
+ }
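+  // packet4 now holds the remainder (size mod 32) in its top byte and up to
+  // three trailing input bytes in its low bytes; storing it into the last
+  // four bytes of the final packet below ensures inputs of different lengths
+  // yield different hashes.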
+
+ char final_packet[kPacketSize] = {0};
+ memcpy(final_packet, bytes + truncated_size, remainder - remainder_mod4);
+ memcpy(final_packet + kPacketSize - 4, &packet4, sizeof(packet4));
+ packets = reinterpret_cast<const HH_U64*>(final_packet);
+ for (int lane = 0; lane < kNumLanes; ++lane) {
+ state[lane].Update(packets[lane]);
+ }
+
+ // Store the resulting hashes.
+ uint64_t hashes[4];
+ for (int lane = 0; lane < kNumLanes; ++lane) {
+ hashes[lane] = state[lane].Finalize();
+ }
+
+ typename SipHashStateT<kUpdateRounds, kFinalizeRounds>::Key reduce_key;
+ memcpy(&reduce_key, &key, sizeof(reduce_key));
+ return ReduceSipTreeHash<kNumLanes, kUpdateRounds, kFinalizeRounds>(
+ reduce_key, hashes);
+}
+
+HH_U64 ScalarSipTreeHash(const Lanes& key, const char* bytes,
+ const HH_U64 size) {
+ return ScalarSipTreeHashT<2, 4>(key, bytes, size);
+}
+
+HH_U64 ScalarSipTreeHash13(const Lanes& key, const char* bytes,
+ const HH_U64 size) {
+ return ScalarSipTreeHashT<1, 3>(key, bytes, size);
+}
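+
+// Usage sketch (illustrative; "data" and "data_size" are placeholders):
+//
+//   const HH_U64 key[4] = {0, 1, 2, 3};
+//   const HH_U64 hash = ScalarSipTreeHash(key, data, data_size);
+//
+// This scalar version is intended to match the SIMD tree-hash variants
+// bit-for-bit, which makes it useful for verifying them.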
+} // namespace highwayhash
+
+using highwayhash::HH_U64;
+using highwayhash::ScalarSipTreeHash;
+using highwayhash::ScalarSipTreeHash13;
+using Key = HH_U64[4];
+
+extern "C" {
+
+HH_U64 ScalarSipTreeHashC(const HH_U64* key, const char* bytes,
+ const HH_U64 size) {
+ return ScalarSipTreeHash(*reinterpret_cast<const Key*>(key), bytes, size);
+}
+
+HH_U64 ScalarSipTreeHash13C(const HH_U64* key, const char* bytes,
+ const HH_U64 size) {
+ return ScalarSipTreeHash13(*reinterpret_cast<const Key*>(key), bytes, size);
+}
+
+} // extern "C"
diff --git a/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.h b/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.h
index 2f79f3a010..f882be89d2 100644
--- a/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.h
+++ b/contrib/libs/highwayhash/highwayhash/scalar_sip_tree_hash.h
@@ -1,37 +1,37 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_SCALAR_SIP_TREE_HASH_H_
-#define HIGHWAYHASH_SCALAR_SIP_TREE_HASH_H_
-
-// Scalar (non-vector/SIMD) version for comparison purposes.
-
-#include "highwayhash/state_helpers.h"
-
-#ifdef __cplusplus
-namespace highwayhash {
-extern "C" {
-#endif
-
-HH_U64 ScalarSipTreeHash(const HH_U64 (&key)[4], const char* bytes,
- const HH_U64 size);
-HH_U64 ScalarSipTreeHash13(const HH_U64 (&key)[4], const char* bytes,
- const HH_U64 size);
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace highwayhash
-#endif
-
-#endif // HIGHWAYHASH_SCALAR_SIP_TREE_HASH_H_
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_SCALAR_SIP_TREE_HASH_H_
+#define HIGHWAYHASH_SCALAR_SIP_TREE_HASH_H_
+
+// Scalar (non-vector/SIMD) version for comparison purposes.
+
+#include "highwayhash/state_helpers.h"
+
+#ifdef __cplusplus
+namespace highwayhash {
+extern "C" {
+#endif
+
+HH_U64 ScalarSipTreeHash(const HH_U64 (&key)[4], const char* bytes,
+ const HH_U64 size);
+HH_U64 ScalarSipTreeHash13(const HH_U64 (&key)[4], const char* bytes,
+ const HH_U64 size);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace highwayhash
+#endif
+
+#endif // HIGHWAYHASH_SCALAR_SIP_TREE_HASH_H_
diff --git a/contrib/libs/highwayhash/highwayhash/sip_hash.cc b/contrib/libs/highwayhash/highwayhash/sip_hash.cc
index 1c08533544..3d73a0bcdd 100644
--- a/contrib/libs/highwayhash/highwayhash/sip_hash.cc
+++ b/contrib/libs/highwayhash/highwayhash/sip_hash.cc
@@ -1,33 +1,33 @@
-// Copyright 2016 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "highwayhash/sip_hash.h"
-
-using highwayhash::HH_U64;
-using highwayhash::SipHash;
-using highwayhash::SipHash13;
-using Key = highwayhash::SipHashState::Key;
-using Key13 = highwayhash::SipHash13State::Key;
-
-extern "C" {
-
-HH_U64 SipHashC(const HH_U64* key, const char* bytes, const HH_U64 size) {
- return SipHash(*reinterpret_cast<const Key*>(key), bytes, size);
-}
-
-HH_U64 SipHash13C(const HH_U64* key, const char* bytes, const HH_U64 size) {
- return SipHash13(*reinterpret_cast<const Key13*>(key), bytes, size);
-}
-
-} // extern "C"
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "highwayhash/sip_hash.h"
+
+using highwayhash::HH_U64;
+using highwayhash::SipHash;
+using highwayhash::SipHash13;
+using Key = highwayhash::SipHashState::Key;
+using Key13 = highwayhash::SipHash13State::Key;
+
+extern "C" {
+
+HH_U64 SipHashC(const HH_U64* key, const char* bytes, const HH_U64 size) {
+ return SipHash(*reinterpret_cast<const Key*>(key), bytes, size);
+}
+
+HH_U64 SipHash13C(const HH_U64* key, const char* bytes, const HH_U64 size) {
+ return SipHash13(*reinterpret_cast<const Key13*>(key), bytes, size);
+}
+
+} // extern "C"
diff --git a/contrib/libs/highwayhash/highwayhash/sip_hash.h b/contrib/libs/highwayhash/highwayhash/sip_hash.h
index eebe3dc944..24a5cf4f22 100644
--- a/contrib/libs/highwayhash/highwayhash/sip_hash.h
+++ b/contrib/libs/highwayhash/highwayhash/sip_hash.h
@@ -1,171 +1,171 @@
-// Copyright 2016 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_SIP_HASH_H_
-#define HIGHWAYHASH_SIP_HASH_H_
-
-// Portable but fast SipHash implementation.
-
-#include <cstddef>
-#include <cstring> // memcpy
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-#include "highwayhash/endianess.h"
-#include "highwayhash/state_helpers.h"
-
-namespace highwayhash {
-
-// Paper: https://www.131002.net/siphash/siphash.pdf
-template <int kUpdateIters, int kFinalizeIters>
-class SipHashStateT {
- public:
- using Key = HH_U64[2];
- static const size_t kPacketSize = sizeof(HH_U64);
-
- explicit HH_INLINE SipHashStateT(const Key& key) {
- v0 = 0x736f6d6570736575ull ^ key[0];
- v1 = 0x646f72616e646f6dull ^ key[1];
- v2 = 0x6c7967656e657261ull ^ key[0];
- v3 = 0x7465646279746573ull ^ key[1];
- }
-
- HH_INLINE void Update(const char* bytes) {
- HH_U64 packet;
- memcpy(&packet, bytes, sizeof(packet));
- packet = host_from_le64(packet);
-
- v3 ^= packet;
-
- Compress<kUpdateIters>();
-
- v0 ^= packet;
- }
-
- HH_INLINE HH_U64 Finalize() {
- // Mix in bits to avoid leaking the key if all packets were zero.
- v2 ^= 0xFF;
-
- Compress<kFinalizeIters>();
-
- return (v0 ^ v1) ^ (v2 ^ v3);
- }
- private:
- // Rotate a 64-bit value "v" left by N bits.
- template <HH_U64 bits>
- static HH_INLINE HH_U64 RotateLeft(const HH_U64 v) {
- const HH_U64 left = v << bits;
- const HH_U64 right = v >> (64 - bits);
- return left | right;
- }
-
- template <size_t rounds>
- HH_INLINE void Compress() {
- for (size_t i = 0; i < rounds; ++i) {
- // ARX network: add, rotate, exclusive-or.
- v0 += v1;
- v2 += v3;
- v1 = RotateLeft<13>(v1);
- v3 = RotateLeft<16>(v3);
- v1 ^= v0;
- v3 ^= v2;
-
- v0 = RotateLeft<32>(v0);
-
- v2 += v1;
- v0 += v3;
- v1 = RotateLeft<17>(v1);
- v3 = RotateLeft<21>(v3);
- v1 ^= v2;
- v3 ^= v0;
-
- v2 = RotateLeft<32>(v2);
- }
- }
-
- HH_U64 v0;
- HH_U64 v1;
- HH_U64 v2;
- HH_U64 v3;
-};
-
-using SipHashState = SipHashStateT<2, 4>;
-using SipHash13State = SipHashStateT<1, 3>;
-
-// Override the HighwayTreeHash padding scheme with that of SipHash so that
-// the hash output matches the known-good values in sip_hash_test.
-template <>
-HH_INLINE void PaddedUpdate<SipHashState>(const HH_U64 size,
- const char* remaining_bytes,
- const HH_U64 remaining_size,
- SipHashState* state) {
- // Copy to avoid overrunning the input buffer.
- char final_packet[SipHashState::kPacketSize] = {0};
- memcpy(final_packet, remaining_bytes, remaining_size);
- final_packet[SipHashState::kPacketSize - 1] = static_cast<char>(size & 0xFF);
- state->Update(final_packet);
-}
-
-template <>
-HH_INLINE void PaddedUpdate<SipHash13State>(const HH_U64 size,
- const char* remaining_bytes,
- const HH_U64 remaining_size,
- SipHash13State* state) {
- // Copy to avoid overrunning the input buffer.
- char final_packet[SipHash13State::kPacketSize] = {0};
- memcpy(final_packet, remaining_bytes, remaining_size);
- final_packet[SipHash13State::kPacketSize - 1] =
- static_cast<char>(size & 0xFF);
- state->Update(final_packet);
-}
-
-// Fast, cryptographically strong pseudo-random function, e.g. for
-// deterministic/idempotent 'random' number generation. See also
-// README.md for information on resisting hash flooding attacks.
-//
-// Robust against timing attacks because memory accesses are sequential
-// and the algorithm is branch-free. Compute time is proportional to the
-// number of 8-byte packets, and this portable version is about twice as
-// fast as an SSE41 implementation.
-//
-// "key" is a secret 128-bit key unknown to attackers.
-// "bytes" is the data to hash; ceil(size / 8) * 8 bytes are read.
-// Returns a 64-bit hash of the given data bytes, which are swapped on
-// big-endian CPUs so the return value is the same as on little-endian CPUs.
-static HH_INLINE HH_U64 SipHash(const SipHashState::Key& key, const char* bytes,
- const HH_U64 size) {
- return ComputeHash<SipHashState>(key, bytes, size);
-}
-
-// Round-reduced SipHash version (1 update and 3 finalization rounds).
-static HH_INLINE HH_U64 SipHash13(const SipHash13State::Key& key,
- const char* bytes, const HH_U64 size) {
- return ComputeHash<SipHash13State>(key, bytes, size);
-}
-
-template <int kNumLanes, int kUpdateIters, int kFinalizeIters>
-static HH_INLINE HH_U64 ReduceSipTreeHash(
- const typename SipHashStateT<kUpdateIters, kFinalizeIters>::Key& key,
- const uint64_t (&hashes)[kNumLanes]) {
- SipHashStateT<kUpdateIters, kFinalizeIters> state(key);
-
- for (int i = 0; i < kNumLanes; ++i) {
- state.Update(reinterpret_cast<const char*>(&hashes[i]));
- }
-
- return state.Finalize();
-}
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_SIP_HASH_H_
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_SIP_HASH_H_
+#define HIGHWAYHASH_SIP_HASH_H_
+
+// Portable but fast SipHash implementation.
+
+#include <cstddef>
+#include <cstring> // memcpy
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/endianess.h"
+#include "highwayhash/state_helpers.h"
+
+namespace highwayhash {
+
+// Paper: https://www.131002.net/siphash/siphash.pdf
+template <int kUpdateIters, int kFinalizeIters>
+class SipHashStateT {
+ public:
+ using Key = HH_U64[2];
+ static const size_t kPacketSize = sizeof(HH_U64);
+
+ explicit HH_INLINE SipHashStateT(const Key& key) {
+ v0 = 0x736f6d6570736575ull ^ key[0];
+ v1 = 0x646f72616e646f6dull ^ key[1];
+ v2 = 0x6c7967656e657261ull ^ key[0];
+ v3 = 0x7465646279746573ull ^ key[1];
+ }
+
+ HH_INLINE void Update(const char* bytes) {
+ HH_U64 packet;
+ memcpy(&packet, bytes, sizeof(packet));
+ packet = host_from_le64(packet);
+
+ v3 ^= packet;
+
+ Compress<kUpdateIters>();
+
+ v0 ^= packet;
+ }
+
+ HH_INLINE HH_U64 Finalize() {
+ // Mix in bits to avoid leaking the key if all packets were zero.
+ v2 ^= 0xFF;
+
+ Compress<kFinalizeIters>();
+
+ return (v0 ^ v1) ^ (v2 ^ v3);
+ }
+ private:
+ // Rotate a 64-bit value "v" left by N bits.
+ template <HH_U64 bits>
+ static HH_INLINE HH_U64 RotateLeft(const HH_U64 v) {
+ const HH_U64 left = v << bits;
+ const HH_U64 right = v >> (64 - bits);
+ return left | right;
+ }
+
+ template <size_t rounds>
+ HH_INLINE void Compress() {
+ for (size_t i = 0; i < rounds; ++i) {
+ // ARX network: add, rotate, exclusive-or.
+ v0 += v1;
+ v2 += v3;
+ v1 = RotateLeft<13>(v1);
+ v3 = RotateLeft<16>(v3);
+ v1 ^= v0;
+ v3 ^= v2;
+
+ v0 = RotateLeft<32>(v0);
+
+ v2 += v1;
+ v0 += v3;
+ v1 = RotateLeft<17>(v1);
+ v3 = RotateLeft<21>(v3);
+ v1 ^= v2;
+ v3 ^= v0;
+
+ v2 = RotateLeft<32>(v2);
+ }
+ }
+
+ HH_U64 v0;
+ HH_U64 v1;
+ HH_U64 v2;
+ HH_U64 v3;
+};
+
+using SipHashState = SipHashStateT<2, 4>;
+using SipHash13State = SipHashStateT<1, 3>;
+
+// Override the HighwayTreeHash padding scheme with that of SipHash so that
+// the hash output matches the known-good values in sip_hash_test.
+template <>
+HH_INLINE void PaddedUpdate<SipHashState>(const HH_U64 size,
+ const char* remaining_bytes,
+ const HH_U64 remaining_size,
+ SipHashState* state) {
+ // Copy to avoid overrunning the input buffer.
+ char final_packet[SipHashState::kPacketSize] = {0};
+ memcpy(final_packet, remaining_bytes, remaining_size);
+ final_packet[SipHashState::kPacketSize - 1] = static_cast<char>(size & 0xFF);
+ state->Update(final_packet);
+}
+
+template <>
+HH_INLINE void PaddedUpdate<SipHash13State>(const HH_U64 size,
+ const char* remaining_bytes,
+ const HH_U64 remaining_size,
+ SipHash13State* state) {
+ // Copy to avoid overrunning the input buffer.
+ char final_packet[SipHash13State::kPacketSize] = {0};
+ memcpy(final_packet, remaining_bytes, remaining_size);
+ final_packet[SipHash13State::kPacketSize - 1] =
+ static_cast<char>(size & 0xFF);
+ state->Update(final_packet);
+}
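+
+// Example (illustrative): for the 3-byte input "abc" with total size 3, the
+// final packet is {'a', 'b', 'c', 0, 0, 0, 0, 3}: zero padding plus the low
+// byte of the total length, matching the SipHash reference padding.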
+
+// Fast, cryptographically strong pseudo-random function, e.g. for
+// deterministic/idempotent 'random' number generation. See also
+// README.md for information on resisting hash flooding attacks.
+//
+// Robust against timing attacks because memory accesses are sequential
+// and the algorithm is branch-free. Compute time is proportional to the
+// number of 8-byte packets, and this portable version is about twice as
+// fast as an SSE41 implementation.
+//
+// "key" is a secret 128-bit key unknown to attackers.
+// "bytes" is the data to hash; ceil(size / 8) * 8 bytes are read.
+// Returns a 64-bit hash of the given data bytes, which are swapped on
+// big-endian CPUs so the return value is the same as on little-endian CPUs.
+static HH_INLINE HH_U64 SipHash(const SipHashState::Key& key, const char* bytes,
+ const HH_U64 size) {
+ return ComputeHash<SipHashState>(key, bytes, size);
+}
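+
+// Usage sketch (illustrative; real keys should come from a secure RNG):
+//
+//   const SipHashState::Key key = {0x0706050403020100ull,
+//                                  0x0F0E0D0C0B0A0908ull};
+//   const char data[8] = "hello";  // padded buffer, per the note above
+//   const HH_U64 h = SipHash(key, data, 5);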
+
+// Round-reduced SipHash version (1 update and 3 finalization rounds).
+static HH_INLINE HH_U64 SipHash13(const SipHash13State::Key& key,
+ const char* bytes, const HH_U64 size) {
+ return ComputeHash<SipHash13State>(key, bytes, size);
+}
+
+template <int kNumLanes, int kUpdateIters, int kFinalizeIters>
+static HH_INLINE HH_U64 ReduceSipTreeHash(
+ const typename SipHashStateT<kUpdateIters, kFinalizeIters>::Key& key,
+ const uint64_t (&hashes)[kNumLanes]) {
+ SipHashStateT<kUpdateIters, kFinalizeIters> state(key);
+
+ for (int i = 0; i < kNumLanes; ++i) {
+ state.Update(reinterpret_cast<const char*>(&hashes[i]));
+ }
+
+ return state.Finalize();
+}
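+// The per-lane results form the 32 bytes that the tree hash reduces to a
+// single 64-bit value: each lane hash is injected as one 8-byte packet into
+// a single fresh SipHash state, which is then finalized.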
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_SIP_HASH_H_
diff --git a/contrib/libs/highwayhash/highwayhash/sip_hash_test.cc b/contrib/libs/highwayhash/highwayhash/sip_hash_test.cc
index 425dfea93c..8da79cf058 100644
--- a/contrib/libs/highwayhash/highwayhash/sip_hash_test.cc
+++ b/contrib/libs/highwayhash/highwayhash/sip_hash_test.cc
@@ -1,150 +1,150 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "highwayhash/sip_hash.h"
-
-#include <cassert>
-#include <numeric>
-#include <stdio.h>
-#include <stdlib.h>
-
-#ifdef HH_GOOGLETEST
-#include "base/integral_types.h"
-#include "testing/base/public/benchmark.h"
-#include "testing/base/public/gunit.h"
-#endif
-#include "highwayhash/scalar_sip_tree_hash.h"
-#include "highwayhash/sip_tree_hash.h"
-
-namespace highwayhash {
-namespace {
-
-void VerifySipHash() {
- const int kMaxSize = 64;
- char in[kMaxSize]; // empty string, 00, 00 01, ...
- const HH_U64 key[2] = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL};
-
- // Known-good SipHash-2-4 output from D. Bernstein.
- const HH_U64 kSipHashOutput[64] = {
- 0x726FDB47DD0E0E31, 0x74F839C593DC67FD, 0x0D6C8009D9A94F5A,
- 0x85676696D7FB7E2D, 0xCF2794E0277187B7, 0x18765564CD99A68D,
- 0xCBC9466E58FEE3CE, 0xAB0200F58B01D137, 0x93F5F5799A932462,
- 0x9E0082DF0BA9E4B0, 0x7A5DBBC594DDB9F3, 0xF4B32F46226BADA7,
- 0x751E8FBC860EE5FB, 0x14EA5627C0843D90, 0xF723CA908E7AF2EE,
- 0xA129CA6149BE45E5, 0x3F2ACC7F57C29BDB, 0x699AE9F52CBE4794,
- 0x4BC1B3F0968DD39C, 0xBB6DC91DA77961BD, 0xBED65CF21AA2EE98,
- 0xD0F2CBB02E3B67C7, 0x93536795E3A33E88, 0xA80C038CCD5CCEC8,
- 0xB8AD50C6F649AF94, 0xBCE192DE8A85B8EA, 0x17D835B85BBB15F3,
- 0x2F2E6163076BCFAD, 0xDE4DAAACA71DC9A5, 0xA6A2506687956571,
- 0xAD87A3535C49EF28, 0x32D892FAD841C342, 0x7127512F72F27CCE,
- 0xA7F32346F95978E3, 0x12E0B01ABB051238, 0x15E034D40FA197AE,
- 0x314DFFBE0815A3B4, 0x027990F029623981, 0xCADCD4E59EF40C4D,
- 0x9ABFD8766A33735C, 0x0E3EA96B5304A7D0, 0xAD0C42D6FC585992,
- 0x187306C89BC215A9, 0xD4A60ABCF3792B95, 0xF935451DE4F21DF2,
- 0xA9538F0419755787, 0xDB9ACDDFF56CA510, 0xD06C98CD5C0975EB,
- 0xE612A3CB9ECBA951, 0xC766E62CFCADAF96, 0xEE64435A9752FE72,
- 0xA192D576B245165A, 0x0A8787BF8ECB74B2, 0x81B3E73D20B49B6F,
- 0x7FA8220BA3B2ECEA, 0x245731C13CA42499, 0xB78DBFAF3A8D83BD,
- 0xEA1AD565322A1A0B, 0x60E61C23A3795013, 0x6606D7E446282B93,
- 0x6CA4ECB15C5F91E1, 0x9F626DA15C9625F3, 0xE51B38608EF25F57,
- 0x958A324CEB064572};
-
- for (int size = 0; size < kMaxSize; ++size) {
- in[size] = static_cast<char>(size);
- const HH_U64 hash = highwayhash::SipHash(key, in, size);
-#ifdef HH_GOOGLETEST
- EXPECT_EQ(kSipHashOutput[size], hash) << "Mismatch at length " << size;
-#else
- if (hash != kSipHashOutput[size]) {
- printf("Mismatch at length %d\n", size);
- abort();
- }
-#endif
- }
-}
-
-#ifdef HH_GOOGLETEST
-TEST(SipHashTest, OutputMatchesExpectations) { VerifySipHash(); }
-
-namespace bm {
-/* Run with:
- blaze run -c opt --cpu=haswell third_party/highwayhash:sip_hash_test -- \
- --benchmarks=all --benchmark_min_iters=1 --benchmark_min_time=0.25
-*/
-
-// Returns a pointer to a buffer at least "size" bytes long, to be used as
-// hashing input.
-char* GetInput(size_t size) {
- static constexpr size_t kMaxSize = 100 << 20;
- assert(size <= kMaxSize);
- static auto* res = []() {
- auto* res = new char[kMaxSize];
- std::iota(res, res + kMaxSize, 0);
- return res;
- }();
- return res;
-}
-
-template <class Hasher>
-void BM(int iters, int size) {
- StopBenchmarkTiming();
- auto* input = GetInput(size);
- const HH_U64 keys[4] = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL,
- 0x1716151413121110ULL, 0x1F1E1D1C1B1A1918ULL};
- Hasher hasher(keys);
- StartBenchmarkTiming();
- for (int i = 0; i < iters; ++i) {
- testing::DoNotOptimize(hasher(input, size));
- }
- StopBenchmarkTiming();
- SetBenchmarkBytesProcessed(static_cast<int64>(iters) * size);
-}
-
-void Args(::testing::Benchmark* bm) {
- bm->DenseRange(1, 16)->Range(32, 100 << 20);
-}
-
-#define DEFINE_HASHER(hashfn, num_keys) \
- struct hashfn##er { \
- hashfn##er(const HH_U64* k) { memcpy(keys, k, sizeof(keys)); } \
- HH_U64 operator()(const char* input, size_t size) { \
- return highwayhash::hashfn(keys, input, size); \
- } \
- HH_U64 keys[num_keys]; \
- }
-
-DEFINE_HASHER(SipHash, 2);
-BENCHMARK(BM<SipHasher>)->Apply(Args);
-
-DEFINE_HASHER(ScalarSipTreeHash, 4);
-BENCHMARK(BM<ScalarSipTreeHasher>)->Apply(Args);
-
-#ifdef __AVX2__
-DEFINE_HASHER(SipTreeHash, 4);
-BENCHMARK(BM<SipTreeHasher>)->Apply(Args);
-#endif
-
-} // namespace bm
-#endif // HH_GOOGLETEST
-
-} // namespace
-} // namespace highwayhash
-
-#ifndef HH_GOOGLETEST
-int main(int argc, char* argv[]) {
- highwayhash::VerifySipHash();
- printf("VerifySipHash succeeded.\n");
- return 0;
-}
-#endif
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "highwayhash/sip_hash.h"
+
+#include <cassert>
+#include <numeric>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef HH_GOOGLETEST
+#include "base/integral_types.h"
+#include "testing/base/public/benchmark.h"
+#include "testing/base/public/gunit.h"
+#endif
+#include "highwayhash/scalar_sip_tree_hash.h"
+#include "highwayhash/sip_tree_hash.h"
+
+namespace highwayhash {
+namespace {
+
+void VerifySipHash() {
+ const int kMaxSize = 64;
+ char in[kMaxSize]; // empty string, 00, 00 01, ...
+ const HH_U64 key[2] = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL};
+
+ // Known-good SipHash-2-4 output from D. Bernstein.
+ const HH_U64 kSipHashOutput[64] = {
+ 0x726FDB47DD0E0E31, 0x74F839C593DC67FD, 0x0D6C8009D9A94F5A,
+ 0x85676696D7FB7E2D, 0xCF2794E0277187B7, 0x18765564CD99A68D,
+ 0xCBC9466E58FEE3CE, 0xAB0200F58B01D137, 0x93F5F5799A932462,
+ 0x9E0082DF0BA9E4B0, 0x7A5DBBC594DDB9F3, 0xF4B32F46226BADA7,
+ 0x751E8FBC860EE5FB, 0x14EA5627C0843D90, 0xF723CA908E7AF2EE,
+ 0xA129CA6149BE45E5, 0x3F2ACC7F57C29BDB, 0x699AE9F52CBE4794,
+ 0x4BC1B3F0968DD39C, 0xBB6DC91DA77961BD, 0xBED65CF21AA2EE98,
+ 0xD0F2CBB02E3B67C7, 0x93536795E3A33E88, 0xA80C038CCD5CCEC8,
+ 0xB8AD50C6F649AF94, 0xBCE192DE8A85B8EA, 0x17D835B85BBB15F3,
+ 0x2F2E6163076BCFAD, 0xDE4DAAACA71DC9A5, 0xA6A2506687956571,
+ 0xAD87A3535C49EF28, 0x32D892FAD841C342, 0x7127512F72F27CCE,
+ 0xA7F32346F95978E3, 0x12E0B01ABB051238, 0x15E034D40FA197AE,
+ 0x314DFFBE0815A3B4, 0x027990F029623981, 0xCADCD4E59EF40C4D,
+ 0x9ABFD8766A33735C, 0x0E3EA96B5304A7D0, 0xAD0C42D6FC585992,
+ 0x187306C89BC215A9, 0xD4A60ABCF3792B95, 0xF935451DE4F21DF2,
+ 0xA9538F0419755787, 0xDB9ACDDFF56CA510, 0xD06C98CD5C0975EB,
+ 0xE612A3CB9ECBA951, 0xC766E62CFCADAF96, 0xEE64435A9752FE72,
+ 0xA192D576B245165A, 0x0A8787BF8ECB74B2, 0x81B3E73D20B49B6F,
+ 0x7FA8220BA3B2ECEA, 0x245731C13CA42499, 0xB78DBFAF3A8D83BD,
+ 0xEA1AD565322A1A0B, 0x60E61C23A3795013, 0x6606D7E446282B93,
+ 0x6CA4ECB15C5F91E1, 0x9F626DA15C9625F3, 0xE51B38608EF25F57,
+ 0x958A324CEB064572};
+
+ for (int size = 0; size < kMaxSize; ++size) {
+ in[size] = static_cast<char>(size);
+ const HH_U64 hash = highwayhash::SipHash(key, in, size);
+#ifdef HH_GOOGLETEST
+ EXPECT_EQ(kSipHashOutput[size], hash) << "Mismatch at length " << size;
+#else
+ if (hash != kSipHashOutput[size]) {
+ printf("Mismatch at length %d\n", size);
+ abort();
+ }
+#endif
+ }
+}
+
+#ifdef HH_GOOGLETEST
+TEST(SipHashTest, OutputMatchesExpectations) { VerifySipHash(); }
+
+namespace bm {
+/* Run with:
+ blaze run -c opt --cpu=haswell third_party/highwayhash:sip_hash_test -- \
+ --benchmarks=all --benchmark_min_iters=1 --benchmark_min_time=0.25
+*/
+
+// Returns a pointer to a buffer at least "size" bytes long, to be used as
+// hashing input.
+char* GetInput(size_t size) {
+ static constexpr size_t kMaxSize = 100 << 20;
+ assert(size <= kMaxSize);
+ static auto* res = []() {
+ auto* res = new char[kMaxSize];
+ std::iota(res, res + kMaxSize, 0);
+ return res;
+ }();
+ return res;
+}
+
+template <class Hasher>
+void BM(int iters, int size) {
+ StopBenchmarkTiming();
+ auto* input = GetInput(size);
+ const HH_U64 keys[4] = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL,
+ 0x1716151413121110ULL, 0x1F1E1D1C1B1A1918ULL};
+ Hasher hasher(keys);
+ StartBenchmarkTiming();
+ for (int i = 0; i < iters; ++i) {
+ testing::DoNotOptimize(hasher(input, size));
+ }
+ StopBenchmarkTiming();
+ SetBenchmarkBytesProcessed(static_cast<int64>(iters) * size);
+}
+
+void Args(::testing::Benchmark* bm) {
+ bm->DenseRange(1, 16)->Range(32, 100 << 20);
+}
+
+#define DEFINE_HASHER(hashfn, num_keys) \
+ struct hashfn##er { \
+ hashfn##er(const HH_U64* k) { memcpy(keys, k, sizeof(keys)); } \
+ HH_U64 operator()(const char* input, size_t size) { \
+ return highwayhash::hashfn(keys, input, size); \
+ } \
+ HH_U64 keys[num_keys]; \
+ }
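+// For reference, DEFINE_HASHER(SipHash, 2) expands to:
+//   struct SipHasher {
+//     SipHasher(const HH_U64* k) { memcpy(keys, k, sizeof(keys)); }
+//     HH_U64 operator()(const char* input, size_t size) {
+//       return highwayhash::SipHash(keys, input, size);
+//     }
+//     HH_U64 keys[2];
+//   };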
+
+DEFINE_HASHER(SipHash, 2);
+BENCHMARK(BM<SipHasher>)->Apply(Args);
+
+DEFINE_HASHER(ScalarSipTreeHash, 4);
+BENCHMARK(BM<ScalarSipTreeHasher>)->Apply(Args);
+
+#ifdef __AVX2__
+DEFINE_HASHER(SipTreeHash, 4);
+BENCHMARK(BM<SipTreeHasher>)->Apply(Args);
+#endif
+
+} // namespace bm
+#endif // HH_GOOGLETEST
+
+} // namespace
+} // namespace highwayhash
+
+#ifndef HH_GOOGLETEST
+int main(int argc, char* argv[]) {
+ highwayhash::VerifySipHash();
+ printf("VerifySipHash succeeded.\n");
+ return 0;
+}
+#endif
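For reference, a minimal standalone sketch of the SipHash API exercised by the
test above (same SipHash(key, bytes, size) signature; the key and input are
arbitrary placeholders, not values from the library):

#include <cstdio>

#include "highwayhash/sip_hash.h"

int main() {
  const highwayhash::HH_U64 key[2] = {1, 2};  // placeholder key, not secret
  const char data[] = "hello";
  // Hash exactly sizeof(data) - 1 bytes (exclude the trailing '\0').
  const highwayhash::HH_U64 hash =
      highwayhash::SipHash(key, data, sizeof(data) - 1);
  printf("SipHash: %016llX\n", hash);
  return 0;
}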
diff --git a/contrib/libs/highwayhash/highwayhash/sip_tree_hash.cc b/contrib/libs/highwayhash/highwayhash/sip_tree_hash.cc
index 18c56d7907..2dc4ce78e4 100644
--- a/contrib/libs/highwayhash/highwayhash/sip_tree_hash.cc
+++ b/contrib/libs/highwayhash/highwayhash/sip_tree_hash.cc
@@ -1,227 +1,227 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "highwayhash/sip_tree_hash.h"
-
-#include <cstring> // memcpy
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-#include "highwayhash/sip_hash.h"
-
-#if HH_TARGET == HH_TARGET_AVX2
-#include "highwayhash/vector256.h"
-namespace highwayhash {
-namespace HH_TARGET_NAME {
-namespace {
-
-// Paper: https://www.131002.net/siphash/siphash.pdf
-// SSE41 implementation: https://goo.gl/80GBSD
-// Tree hash extension: http://dx.doi.org/10.4236/jis.2014.53010
-
-// The hash state is updated by injecting 4x8-byte packets;
-// XORing together all state vectors yields 32 bytes that are
-// reduced to 64 bits via 8-byte SipHash.
-
-const int kPacketSize = 32;
-const int kNumLanes = kPacketSize / sizeof(HH_U64);
-
-// 32 bytes key. Parameters are hardwired to c=2, d=4 [rounds].
-template <int kUpdateRounds, int kFinalizeRounds>
-class SipTreeHashStateT {
- public:
- explicit HH_INLINE SipTreeHashStateT(const HH_U64 (&keys)[kNumLanes]) {
- const V4x64U init(0x7465646279746573ull, 0x6c7967656e657261ull,
- 0x646f72616e646f6dull, 0x736f6d6570736575ull);
- const V4x64U lanes(kNumLanes | 3, kNumLanes | 2, kNumLanes | 1,
- kNumLanes | 0);
- const V4x64U key =
- LoadUnaligned<V4x64U>(reinterpret_cast<const uint64_t*>(keys)) ^ lanes;
- v0 = V4x64U(_mm256_permute4x64_epi64(init, 0x00)) ^ key;
- v1 = V4x64U(_mm256_permute4x64_epi64(init, 0x55)) ^ key;
- v2 = V4x64U(_mm256_permute4x64_epi64(init, 0xAA)) ^ key;
- v3 = V4x64U(_mm256_permute4x64_epi64(init, 0xFF)) ^ key;
- }
-
- HH_INLINE void Update(const V4x64U& packet) {
- v3 ^= packet;
-
- Compress<kUpdateRounds>();
-
- v0 ^= packet;
- }
-
- HH_INLINE V4x64U Finalize() {
- // Mix in bits to avoid leaking the key if all packets were zero.
- v2 ^= V4x64U(0xFF);
-
- Compress<kFinalizeRounds>();
-
- return (v0 ^ v1) ^ (v2 ^ v3);
- }
-
- private:
- static HH_INLINE V4x64U RotateLeft16(const V4x64U& v) {
- const V4x64U control(0x0D0C0B0A09080F0EULL, 0x0504030201000706ULL,
- 0x0D0C0B0A09080F0EULL, 0x0504030201000706ULL);
- return V4x64U(_mm256_shuffle_epi8(v, control));
- }
-
- // Rotates each 64-bit element of "v" left by N bits.
- template <HH_U64 bits>
- static HH_INLINE V4x64U RotateLeft(const V4x64U& v) {
- const V4x64U left = v << bits;
- const V4x64U right = v >> (64 - bits);
- return left | right;
- }
-
- static HH_INLINE V4x64U Rotate32(const V4x64U& v) {
- return V4x64U(_mm256_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
- }
-
- template <int kRounds>
- HH_INLINE void Compress() {
- // Loop is faster than unrolling!
- for (int i = 0; i < kRounds; ++i) {
- // ARX network: add, rotate, exclusive-or.
- v0 += v1;
- v2 += v3;
- v1 = RotateLeft<13>(v1);
- v3 = RotateLeft16(v3);
- v1 ^= v0;
- v3 ^= v2;
-
- v0 = Rotate32(v0);
-
- v2 += v1;
- v0 += v3;
- v1 = RotateLeft<17>(v1);
- v3 = RotateLeft<21>(v3);
- v1 ^= v2;
- v3 ^= v0;
-
- v2 = Rotate32(v2);
- }
- }
-
- V4x64U v0;
- V4x64U v1;
- V4x64U v2;
- V4x64U v3;
-};
-
-// Returns 32-byte packet by loading the remaining 0..31 bytes, storing
-// "remainder" in the upper byte, and zeroing any intervening bytes.
-// "remainder" is the number of accessible/remaining bytes (size % 32).
-// Loading past the end of the input risks page fault exceptions which even
-// LDDQU cannot prevent.
-static HH_INLINE V4x64U LoadFinalPacket32(const char* bytes, const HH_U64 size,
- const HH_U64 remainder) {
- // Copying into an aligned buffer incurs a store-to-load-forwarding stall.
- // Instead, we use masked loads to read any remaining whole uint32_t
- // without incurring page faults for the others.
- const size_t remaining_32 = remainder >> 2; // 0..7
-
- // mask[32*i+31] := uint32_t #i valid/accessible ? 1 : 0.
- // To avoid large lookup tables, we pack uint32_t lanes into bytes,
- // compute the packed mask by shifting, and then sign-extend 0xFF to
- // 0xFFFFFFFF (although only the MSB needs to be set).
- // remaining_32 = 0 => mask = 00000000; remaining_32 = 7 => mask = 01111111.
- const HH_U64 packed_mask = 0x00FFFFFFFFFFFFFFULL >> ((7 - remaining_32) * 8);
- const V4x64U mask(_mm256_cvtepi8_epi32(_mm_cvtsi64_si128(packed_mask)));
- // Load 0..7 remaining (potentially unaligned) uint32_t.
- const V4x64U packet28(
- _mm256_maskload_epi32(reinterpret_cast<const int*>(bytes), mask));
-
- // Load any remaining bytes individually and combine into a uint32_t.
- const int remainder_mod4 = remainder & 3;
- // Length padding ensures that zero-valued buffers of different lengths
- // result in different hashes.
- uint32_t packet4 = static_cast<uint32_t>(remainder << 24);
- const char* final_bytes = bytes + (remaining_32 * 4);
- for (int i = 0; i < remainder_mod4; ++i) {
- const uint32_t byte = static_cast<unsigned char>(final_bytes[i]);
- packet4 += byte << (i * 8);
- }
-
- // The upper 4 bytes of packet28 are zero; replace with packet4 to
- // obtain the (length-padded) 32-byte packet.
- const __m256i v4 = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(packet4));
- const V4x64U packet(_mm256_blend_epi32(packet28, v4, 0x80));
- return packet;
-}
-
-} // namespace
-} // namespace HH_TARGET_NAME
-
-template <size_t kUpdateRounds, size_t kFinalizeRounds>
-HH_U64 SipTreeHashT(const HH_U64 (&key)[4], const char* bytes,
- const HH_U64 size) {
- using namespace HH_TARGET_NAME;
- SipTreeHashStateT<kUpdateRounds, kFinalizeRounds> state(key);
-
- const size_t remainder = size & (kPacketSize - 1);
- const size_t truncated_size = size - remainder;
- const HH_U64* packets = reinterpret_cast<const HH_U64*>(bytes);
- for (size_t i = 0; i < truncated_size / sizeof(HH_U64); i += kNumLanes) {
- const V4x64U packet =
- LoadUnaligned<V4x64U>(reinterpret_cast<const uint64_t*>(packets) + i);
- state.Update(packet);
- }
-
- const V4x64U final_packet =
- LoadFinalPacket32(bytes + truncated_size, size, remainder);
-
- state.Update(final_packet);
-
- // Faster than passing __m256i and extracting.
- HH_ALIGNAS(32) uint64_t hashes[kNumLanes];
- Store(state.Finalize(), hashes);
-
- typename SipHashStateT<kUpdateRounds, kFinalizeRounds>::Key reduce_key;
- memcpy(&reduce_key, &key, sizeof(reduce_key));
- return ReduceSipTreeHash<kNumLanes, kUpdateRounds, kFinalizeRounds>(
- reduce_key, hashes);
-}
-
-HH_U64 SipTreeHash(const HH_U64 (&key)[4], const char* bytes,
- const HH_U64 size) {
- return SipTreeHashT<2, 4>(key, bytes, size);
-}
-
-HH_U64 SipTreeHash13(const HH_U64 (&key)[4], const char* bytes,
- const HH_U64 size) {
- return SipTreeHashT<1, 3>(key, bytes, size);
-}
-
-} // namespace highwayhash
-
-using highwayhash::HH_U64;
-using highwayhash::SipTreeHash;
-using highwayhash::SipTreeHash13;
-using Key = HH_U64[4];
-
-extern "C" {
-
-HH_U64 SipTreeHashC(const HH_U64* key, const char* bytes, const HH_U64 size) {
- return SipTreeHash(*reinterpret_cast<const Key*>(key), bytes, size);
-}
-
-HH_U64 SipTreeHash13C(const HH_U64* key, const char* bytes, const HH_U64 size) {
- return SipTreeHash13(*reinterpret_cast<const Key*>(key), bytes, size);
-}
-
-} // extern "C"
-
-#endif // HH_TARGET == HH_TARGET_AVX2
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "highwayhash/sip_tree_hash.h"
+
+#include <cstring> // memcpy
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/sip_hash.h"
+
+#if HH_TARGET == HH_TARGET_AVX2
+#include "highwayhash/vector256.h"
+namespace highwayhash {
+namespace HH_TARGET_NAME {
+namespace {
+
+// Paper: https://www.131002.net/siphash/siphash.pdf
+// SSE41 implementation: https://goo.gl/80GBSD
+// Tree hash extension: http://dx.doi.org/10.4236/jis.2014.53010
+
+// The hash state is updated by injecting 4x8-byte packets;
+// XORing together all state vectors yields 32 bytes that are
+// reduced to 64 bits via 8-byte SipHash.
+
+const int kPacketSize = 32;
+const int kNumLanes = kPacketSize / sizeof(HH_U64);
+
+// 32-byte key. Update/finalize round counts are template parameters; the
+// public SipTreeHash below instantiates the usual c=2, d=4.
+template <int kUpdateRounds, int kFinalizeRounds>
+class SipTreeHashStateT {
+ public:
+ explicit HH_INLINE SipTreeHashStateT(const HH_U64 (&keys)[kNumLanes]) {
+ const V4x64U init(0x7465646279746573ull, 0x6c7967656e657261ull,
+ 0x646f72616e646f6dull, 0x736f6d6570736575ull);
+ const V4x64U lanes(kNumLanes | 3, kNumLanes | 2, kNumLanes | 1,
+ kNumLanes | 0);
+ const V4x64U key =
+ LoadUnaligned<V4x64U>(reinterpret_cast<const uint64_t*>(keys)) ^ lanes;
+ v0 = V4x64U(_mm256_permute4x64_epi64(init, 0x00)) ^ key;
+ v1 = V4x64U(_mm256_permute4x64_epi64(init, 0x55)) ^ key;
+ v2 = V4x64U(_mm256_permute4x64_epi64(init, 0xAA)) ^ key;
+ v3 = V4x64U(_mm256_permute4x64_epi64(init, 0xFF)) ^ key;
+ }
+
+ HH_INLINE void Update(const V4x64U& packet) {
+ v3 ^= packet;
+
+ Compress<kUpdateRounds>();
+
+ v0 ^= packet;
+ }
+
+ HH_INLINE V4x64U Finalize() {
+ // Mix in bits to avoid leaking the key if all packets were zero.
+ v2 ^= V4x64U(0xFF);
+
+ Compress<kFinalizeRounds>();
+
+ return (v0 ^ v1) ^ (v2 ^ v3);
+ }
+
+ private:
+ static HH_INLINE V4x64U RotateLeft16(const V4x64U& v) {
+ const V4x64U control(0x0D0C0B0A09080F0EULL, 0x0504030201000706ULL,
+ 0x0D0C0B0A09080F0EULL, 0x0504030201000706ULL);
+ return V4x64U(_mm256_shuffle_epi8(v, control));
+ }
+
+ // Rotates each 64-bit element of "v" left by N bits.
+ template <HH_U64 bits>
+ static HH_INLINE V4x64U RotateLeft(const V4x64U& v) {
+ const V4x64U left = v << bits;
+ const V4x64U right = v >> (64 - bits);
+ return left | right;
+ }
+
+ static HH_INLINE V4x64U Rotate32(const V4x64U& v) {
+ return V4x64U(_mm256_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
+ }
+
+ template <int kRounds>
+ HH_INLINE void Compress() {
+ // Loop is faster than unrolling!
+ for (int i = 0; i < kRounds; ++i) {
+ // ARX network: add, rotate, exclusive-or.
+ v0 += v1;
+ v2 += v3;
+ v1 = RotateLeft<13>(v1);
+ v3 = RotateLeft16(v3);
+ v1 ^= v0;
+ v3 ^= v2;
+
+ v0 = Rotate32(v0);
+
+ v2 += v1;
+ v0 += v3;
+ v1 = RotateLeft<17>(v1);
+ v3 = RotateLeft<21>(v3);
+ v1 ^= v2;
+ v3 ^= v0;
+
+ v2 = Rotate32(v2);
+ }
+ }
+
+ V4x64U v0;
+ V4x64U v1;
+ V4x64U v2;
+ V4x64U v3;
+};
+
+// Returns a 32-byte packet by loading the remaining 0..31 bytes, storing
+// "remainder" in the upper byte, and zeroing any intervening bytes.
+// "remainder" is the number of accessible/remaining bytes (size % 32).
+// Loading past the end of the input risks page fault exceptions which even
+// LDDQU cannot prevent.
+static HH_INLINE V4x64U LoadFinalPacket32(const char* bytes, const HH_U64 size,
+ const HH_U64 remainder) {
+ // Copying into an aligned buffer incurs a store-to-load-forwarding stall.
+ // Instead, we use masked loads to read any remaining whole uint32_t
+ // without incurring page faults for the others.
+ const size_t remaining_32 = remainder >> 2; // 0..7
+
+ // mask[32*i+31] := uint32_t #i valid/accessible ? 1 : 0.
+ // To avoid large lookup tables, we pack uint32_t lanes into bytes,
+ // compute the packed mask by shifting, and then sign-extend 0xFF to
+ // 0xFFFFFFFF (although only the MSB needs to be set).
+ // remaining_32 = 0 => mask = 00000000; remaining_32 = 7 => mask = 01111111.
+ const HH_U64 packed_mask = 0x00FFFFFFFFFFFFFFULL >> ((7 - remaining_32) * 8);
+ const V4x64U mask(_mm256_cvtepi8_epi32(_mm_cvtsi64_si128(packed_mask)));
+ // Load 0..7 remaining (potentially unaligned) uint32_t.
+ const V4x64U packet28(
+ _mm256_maskload_epi32(reinterpret_cast<const int*>(bytes), mask));
+
+ // Load any remaining bytes individually and combine into a uint32_t.
+ const int remainder_mod4 = remainder & 3;
+ // Length padding ensures that zero-valued buffers of different lengths
+ // result in different hashes.
+ uint32_t packet4 = static_cast<uint32_t>(remainder << 24);
+ const char* final_bytes = bytes + (remaining_32 * 4);
+ for (int i = 0; i < remainder_mod4; ++i) {
+ const uint32_t byte = static_cast<unsigned char>(final_bytes[i]);
+ packet4 += byte << (i * 8);
+ }
+
+ // The upper 4 bytes of packet28 are zero; replace with packet4 to
+ // obtain the (length-padded) 32-byte packet.
+ const __m256i v4 = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(packet4));
+ const V4x64U packet(_mm256_blend_epi32(packet28, v4, 0x80));
+ return packet;
+}
+
+} // namespace
+} // namespace HH_TARGET_NAME
+
+template <size_t kUpdateRounds, size_t kFinalizeRounds>
+HH_U64 SipTreeHashT(const HH_U64 (&key)[4], const char* bytes,
+ const HH_U64 size) {
+ using namespace HH_TARGET_NAME;
+ SipTreeHashStateT<kUpdateRounds, kFinalizeRounds> state(key);
+
+ const size_t remainder = size & (kPacketSize - 1);
+ const size_t truncated_size = size - remainder;
+ const HH_U64* packets = reinterpret_cast<const HH_U64*>(bytes);
+ for (size_t i = 0; i < truncated_size / sizeof(HH_U64); i += kNumLanes) {
+ const V4x64U packet =
+ LoadUnaligned<V4x64U>(reinterpret_cast<const uint64_t*>(packets) + i);
+ state.Update(packet);
+ }
+
+ const V4x64U final_packet =
+ LoadFinalPacket32(bytes + truncated_size, size, remainder);
+
+ state.Update(final_packet);
+
+ // Faster than passing __m256i and extracting.
+ HH_ALIGNAS(32) uint64_t hashes[kNumLanes];
+ Store(state.Finalize(), hashes);
+
+ typename SipHashStateT<kUpdateRounds, kFinalizeRounds>::Key reduce_key;
+ memcpy(&reduce_key, &key, sizeof(reduce_key));
+ return ReduceSipTreeHash<kNumLanes, kUpdateRounds, kFinalizeRounds>(
+ reduce_key, hashes);
+}
+
+HH_U64 SipTreeHash(const HH_U64 (&key)[4], const char* bytes,
+ const HH_U64 size) {
+ return SipTreeHashT<2, 4>(key, bytes, size);
+}
+
+HH_U64 SipTreeHash13(const HH_U64 (&key)[4], const char* bytes,
+ const HH_U64 size) {
+ return SipTreeHashT<1, 3>(key, bytes, size);
+}
+
+} // namespace highwayhash
+
+using highwayhash::HH_U64;
+using highwayhash::SipTreeHash;
+using highwayhash::SipTreeHash13;
+using Key = HH_U64[4];
+
+extern "C" {
+
+HH_U64 SipTreeHashC(const HH_U64* key, const char* bytes, const HH_U64 size) {
+ return SipTreeHash(*reinterpret_cast<const Key*>(key), bytes, size);
+}
+
+HH_U64 SipTreeHash13C(const HH_U64* key, const char* bytes, const HH_U64 size) {
+ return SipTreeHash13(*reinterpret_cast<const Key*>(key), bytes, size);
+}
+
+} // extern "C"
+
+#endif // HH_TARGET == HH_TARGET_AVX2
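To make the packed-mask arithmetic in LoadFinalPacket32 concrete, a small
scalar sketch (no AVX-2 required; remainder = 13 is an arbitrary example
value, not taken from the library):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t remainder = 13;                 // bytes left after packets
  const uint64_t remaining_32 = remainder >> 2;  // 3 whole uint32_t remain
  // Same expression as above: one 0xFF byte per loadable uint32_t lane.
  const uint64_t packed_mask =
      0x00FFFFFFFFFFFFFFULL >> ((7 - remaining_32) * 8);
  // Prints 0000000000FFFFFF: lanes 0..2 are enabled once
  // _mm256_cvtepi8_epi32 sign-extends each 0xFF byte to 0xFFFFFFFF.
  printf("%016llX\n", static_cast<unsigned long long>(packed_mask));
  return 0;
}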
diff --git a/contrib/libs/highwayhash/highwayhash/sip_tree_hash.h b/contrib/libs/highwayhash/highwayhash/sip_tree_hash.h
index ee5a42340e..788aa8025b 100644
--- a/contrib/libs/highwayhash/highwayhash/sip_tree_hash.h
+++ b/contrib/libs/highwayhash/highwayhash/sip_tree_hash.h
@@ -1,52 +1,52 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_SIP_TREE_HASH_H_
-#define HIGHWAYHASH_SIP_TREE_HASH_H_
-
-#include "highwayhash/state_helpers.h"
-
-#ifdef __cplusplus
-namespace highwayhash {
-extern "C" {
-#endif
-
-// Fast, cryptographically strong pseudo-random function. Useful for:
-// . hash tables holding attacker-controlled data. This function is
-// immune to hash flooding DOS attacks because multi-collisions are
-// infeasible to compute, provided the key remains secret.
-// . deterministic/idempotent 'random' number generation, e.g. for
-// choosing a subset of items based on their contents.
-//
-// Robust versus timing attacks because memory accesses are sequential
-// and the algorithm is branch-free. Compute time is proportional to the
-// number of 8-byte packets and 1.5x faster than an sse41 implementation.
-// Requires an AVX-2 capable CPU.
-//
-// "key" is a secret 256-bit key unknown to attackers.
-// "bytes" is the data to hash (possibly unaligned).
-// "size" is the number of bytes to hash; exactly that many bytes are read.
-// Returns a 64-bit hash of the given data bytes.
-HH_U64 SipTreeHash(const HH_U64 (&key)[4], const char* bytes,
- const HH_U64 size);
-
-HH_U64 SipTreeHash13(const HH_U64 (&key)[4], const char* bytes,
- const HH_U64 size);
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace highwayhash
-#endif
-
-#endif // HIGHWAYHASH_SIP_TREE_HASH_H_
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_SIP_TREE_HASH_H_
+#define HIGHWAYHASH_SIP_TREE_HASH_H_
+
+#include "highwayhash/state_helpers.h"
+
+#ifdef __cplusplus
+namespace highwayhash {
+extern "C" {
+#endif
+
+// Fast, cryptographically strong pseudo-random function. Useful for:
+// . hash tables holding attacker-controlled data. This function is
+// immune to hash-flooding DoS attacks because multi-collisions are
+// infeasible to compute, provided the key remains secret.
+// . deterministic/idempotent 'random' number generation, e.g. for
+// choosing a subset of items based on their contents.
+//
+// Robust against timing attacks because memory accesses are sequential
+// and the algorithm is branch-free. Compute time is proportional to the
+// number of 8-byte packets; this implementation is about 1.5x faster
+// than an SSE4.1 one. Requires an AVX-2 capable CPU.
+//
+// "key" is a secret 256-bit key unknown to attackers.
+// "bytes" is the data to hash (possibly unaligned).
+// "size" is the number of bytes to hash; exactly that many bytes are read.
+// Returns a 64-bit hash of the given data bytes.
+HH_U64 SipTreeHash(const HH_U64 (&key)[4], const char* bytes,
+ const HH_U64 size);
+
+HH_U64 SipTreeHash13(const HH_U64 (&key)[4], const char* bytes,
+ const HH_U64 size);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace highwayhash
+#endif
+
+#endif // HIGHWAYHASH_SIP_TREE_HASH_H_
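A minimal usage sketch for this header, assuming a build with AVX-2 enabled as
required above; the key values are placeholders rather than anything from the
library:

#include <cstdio>

#include "highwayhash/sip_tree_hash.h"

int main() {
  const highwayhash::HH_U64 key[4] = {1, 2, 3, 4};  // placeholder 256-bit key
  const char data[] = "The quick brown fox";
  const highwayhash::HH_U64 hash =
      highwayhash::SipTreeHash(key, data, sizeof(data) - 1);
  printf("SipTreeHash: %016llX\n", hash);
  return 0;
}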
diff --git a/contrib/libs/highwayhash/highwayhash/state_helpers.h b/contrib/libs/highwayhash/highwayhash/state_helpers.h
index 4dd651260f..88e31a4509 100644
--- a/contrib/libs/highwayhash/highwayhash/state_helpers.h
+++ b/contrib/libs/highwayhash/highwayhash/state_helpers.h
@@ -1,128 +1,128 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_STATE_H_
-#define HIGHWAYHASH_STATE_H_
-
-// Helper functions to split inputs into packets and call State::Update on each.
-
-#include <stdint.h>
-#include <cstddef>
-#include <cstring>
-#include <memory>
-
-#include "highwayhash/compiler_specific.h"
-
-namespace highwayhash {
-
-// uint64_t is unsigned long on Linux; we need 'unsigned long long'
-// for interoperability with TensorFlow.
-typedef unsigned long long HH_U64; // NOLINT
-
-// Copies the remaining bytes to a zero-padded buffer, sets the upper byte to
-// size % 256 (always possible because this should only be called if the
-// total size is not a multiple of the packet size) and updates hash state.
-//
-// The padding scheme is essentially from SipHash, but permuted for the
-// convenience of AVX-2 masked loads. This function must use the same layout so
-// that the vector and scalar HighwayTreeHash have the same result.
-//
-// "remaining_size" is the number of accessible/remaining bytes
-// (size % kPacketSize).
-//
-// Primary template; the specialization for AVX-2 is faster. Intended as an
-// implementation detail, do not call directly.
-template <class State>
-HH_INLINE void PaddedUpdate(const HH_U64 size, const char* remaining_bytes,
- const HH_U64 remaining_size, State* state) {
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_STATE_H_
+#define HIGHWAYHASH_STATE_H_
+
+// Helper functions to split inputs into packets and call State::Update on each.
+
+#include <stdint.h>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+
+#include "highwayhash/compiler_specific.h"
+
+namespace highwayhash {
+
+// uint64_t is unsigned long on Linux; we need 'unsigned long long'
+// for interoperability with TensorFlow.
+typedef unsigned long long HH_U64; // NOLINT
+
+// Copies the remaining bytes to a zero-padded buffer, sets the upper byte to
+// size % 256 (always possible because this should only be called if the
+// total size is not a multiple of the packet size) and updates hash state.
+//
+// The padding scheme is essentially from SipHash, but permuted for the
+// convenience of AVX-2 masked loads. This function must use the same layout so
+// that the vector and scalar HighwayTreeHash have the same result.
+//
+// "remaining_size" is the number of accessible/remaining bytes
+// (size % kPacketSize).
+//
+// Primary template; the specialization for AVX-2 is faster. Intended as an
+// implementation detail, do not call directly.
+template <class State>
+HH_INLINE void PaddedUpdate(const HH_U64 size, const char* remaining_bytes,
+ const HH_U64 remaining_size, State* state) {
HH_ALIGNAS(32) char final_packet[State::kPacketSize] = {0};
-
- // This layout matches the AVX-2 specialization in highway_tree_hash.h.
- uint32_t packet4 = static_cast<uint32_t>(size) << 24;
-
- const size_t remainder_mod4 = remaining_size & 3;
- if (remainder_mod4 != 0) {
- const char* final_bytes = remaining_bytes + remaining_size - remainder_mod4;
- packet4 += static_cast<uint32_t>(final_bytes[0]);
- const int idx1 = remainder_mod4 >> 1;
- const int idx2 = remainder_mod4 - 1;
- packet4 += static_cast<uint32_t>(final_bytes[idx1]) << 8;
- packet4 += static_cast<uint32_t>(final_bytes[idx2]) << 16;
- }
-
- memcpy(final_packet, remaining_bytes, remaining_size - remainder_mod4);
- memcpy(final_packet + State::kPacketSize - 4, &packet4, sizeof(packet4));
-
- state->Update(final_packet);
-}
-
-// Updates hash state for every whole packet, and once more for the final
-// padded packet.
-template <class State>
-HH_INLINE void UpdateState(const char* bytes, const HH_U64 size, State* state) {
- // Feed entire packets.
- const int kPacketSize = State::kPacketSize;
- static_assert((kPacketSize & (kPacketSize - 1)) == 0, "Size must be 2^i.");
- const size_t remainder = size & (kPacketSize - 1);
- const size_t truncated_size = size - remainder;
- for (size_t i = 0; i < truncated_size; i += kPacketSize) {
- state->Update(bytes + i);
- }
-
- PaddedUpdate(size, bytes + truncated_size, remainder, state);
-}
-
-// Convenience function for updating with the bytes of a string.
-template <class String, class State>
-HH_INLINE void UpdateState(const String& s, State* state) {
- const char* bytes = reinterpret_cast<const char*>(s.data());
- const size_t size = s.length() * sizeof(typename String::value_type);
- UpdateState(bytes, size, state);
-}
-
-// Computes a hash of a byte array using the given hash State class.
-//
-// Example: const SipHashState::Key key = { 1, 2 }; char data[4];
-// ComputeHash<SipHashState>(key, data, sizeof(data));
-//
-// This function avoids duplicating Update/Finalize in every call site.
-// Callers wanting to combine multiple hashes should repeatedly UpdateState()
-// and only call State::Finalize once.
-template <class State>
-HH_U64 ComputeHash(const typename State::Key& key, const char* bytes,
- const HH_U64 size) {
- State state(key);
- UpdateState(bytes, size, &state);
- return state.Finalize();
-}
-
-// Computes a hash of a string's bytes using the given hash State class.
-//
-// Example: const SipHashState::Key key = { 1, 2 };
-// StringHasher<SipHashState>()(key, std::u16string(u"abc"));
-//
-// A struct with nested function template enables deduction of the String type.
-template <class State>
-struct StringHasher {
- template <class String>
- HH_U64 operator()(const typename State::Key& key, const String& s) {
- State state(key);
- UpdateState(s, &state);
- return state.Finalize();
- }
-};
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_STATE_H_
+
+ // This layout matches the AVX-2 specialization in highway_tree_hash.h.
+ uint32_t packet4 = static_cast<uint32_t>(size) << 24;
+
+ const size_t remainder_mod4 = remaining_size & 3;
+ if (remainder_mod4 != 0) {
+ const char* final_bytes = remaining_bytes + remaining_size - remainder_mod4;
+ packet4 += static_cast<uint32_t>(final_bytes[0]);
+ const int idx1 = remainder_mod4 >> 1;
+ const int idx2 = remainder_mod4 - 1;
+ packet4 += static_cast<uint32_t>(final_bytes[idx1]) << 8;
+ packet4 += static_cast<uint32_t>(final_bytes[idx2]) << 16;
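+    // Byte indices read for remainder_mod4 = 1/2/3 are {0,0,0}, {0,1,1},
+    // {0,1,2}, landing in bits 0..7, 8..15 and 16..23 of packet4.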
+ }
+
+ memcpy(final_packet, remaining_bytes, remaining_size - remainder_mod4);
+ memcpy(final_packet + State::kPacketSize - 4, &packet4, sizeof(packet4));
+
+ state->Update(final_packet);
+}
+
+// Updates hash state for every whole packet, and once more for the final
+// padded packet.
+template <class State>
+HH_INLINE void UpdateState(const char* bytes, const HH_U64 size, State* state) {
+ // Feed entire packets.
+ const int kPacketSize = State::kPacketSize;
+ static_assert((kPacketSize & (kPacketSize - 1)) == 0, "Size must be 2^i.");
+ const size_t remainder = size & (kPacketSize - 1);
+ const size_t truncated_size = size - remainder;
+ for (size_t i = 0; i < truncated_size; i += kPacketSize) {
+ state->Update(bytes + i);
+ }
+
+ PaddedUpdate(size, bytes + truncated_size, remainder, state);
+}
+
+// Convenience function for updating with the bytes of a string.
+template <class String, class State>
+HH_INLINE void UpdateState(const String& s, State* state) {
+ const char* bytes = reinterpret_cast<const char*>(s.data());
+ const size_t size = s.length() * sizeof(typename String::value_type);
+ UpdateState(bytes, size, state);
+}
+
+// Computes a hash of a byte array using the given hash State class.
+//
+// Example: const SipHashState::Key key = { 1, 2 }; char data[4];
+// ComputeHash<SipHashState>(key, data, sizeof(data));
+//
+// This function avoids duplicating Update/Finalize in every call site.
+// Callers wanting to combine multiple hashes should repeatedly UpdateState()
+// and only call State::Finalize once.
+template <class State>
+HH_U64 ComputeHash(const typename State::Key& key, const char* bytes,
+ const HH_U64 size) {
+ State state(key);
+ UpdateState(bytes, size, &state);
+ return state.Finalize();
+}
+
+// Computes a hash of a string's bytes using the given hash State class.
+//
+// Example: const SipHashState::Key key = { 1, 2 };
+// StringHasher<SipHashState>()(key, std::u16string(u"abc"));
+//
+// A struct with nested function template enables deduction of the String type.
+template <class State>
+struct StringHasher {
+ template <class String>
+ HH_U64 operator()(const typename State::Key& key, const String& s) {
+ State state(key);
+ UpdateState(s, &state);
+ return state.Finalize();
+ }
+};
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_STATE_H_
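The two Example comments above, combined into one runnable sketch. This
assumes SipHashState (the state class those comments reference) is available
via highwayhash/sip_hash.h:

#include <cstdio>
#include <string>

#include "highwayhash/sip_hash.h"  // SipHashState, per the Example comments

int main() {
  const highwayhash::SipHashState::Key key = {1, 2};
  const char data[4] = {'a', 'b', 'c', 'd'};
  const highwayhash::HH_U64 h1 =
      highwayhash::ComputeHash<highwayhash::SipHashState>(key, data,
                                                          sizeof(data));
  const highwayhash::HH_U64 h2 = highwayhash::StringHasher<
      highwayhash::SipHashState>()(key, std::string("abcd"));
  printf("%016llX %016llX\n", h1, h2);  // same input bytes, so h1 == h2
  return 0;
}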
diff --git a/contrib/libs/highwayhash/highwayhash/tsc_timer.h b/contrib/libs/highwayhash/highwayhash/tsc_timer.h
index 4a88c0f8e6..6a4b4a4bdb 100644
--- a/contrib/libs/highwayhash/highwayhash/tsc_timer.h
+++ b/contrib/libs/highwayhash/highwayhash/tsc_timer.h
@@ -1,204 +1,204 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_TSC_TIMER_H_
-#define HIGHWAYHASH_TSC_TIMER_H_
-
-// High-resolution (~10 ns) timestamps, using fences to prevent reordering and
-// ensure exactly the desired regions are measured.
-
-#include <stdint.h>
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-
-#if HH_ARCH_X64 && HH_MSC_VERSION
-#include <emmintrin.h> // _mm_lfence
-#include <intrin.h>
-#endif
-
-namespace highwayhash {
-
-// Start/Stop return absolute timestamps and must be placed immediately before
-// and after the region to measure. We provide separate Start/Stop functions
-// because they use different fences.
-//
-// Background: RDTSC is not 'serializing'; earlier instructions may complete
-// after it, and/or later instructions may complete before it. 'Fences' ensure
-// regions' elapsed times are independent of such reordering. The only
-// documented unprivileged serializing instruction is CPUID, which acts as a
-// full fence (no reordering across it in either direction). Unfortunately
-// the latency of CPUID varies wildly (perhaps made worse by not initializing
-// its EAX input). Because it cannot reliably be deducted from the region's
-// elapsed time, it must not be included in the region to measure (i.e.
-// between the two RDTSC).
-//
-// The newer RDTSCP is sometimes described as serializing, but it actually
-// only serves as a half-fence with release semantics. Although all
-// instructions in the region will complete before the final timestamp is
-// captured, subsequent instructions may leak into the region and increase the
-// elapsed time. Inserting another fence after the final RDTSCP would prevent
-// such reordering without affecting the measured region.
-//
-// Fortunately, such a fence exists. The LFENCE instruction is only documented
-// to delay later loads until earlier loads are visible. However, Intel's
-// reference manual says it acts as a full fence (waiting until all earlier
-// instructions have completed, and delaying later instructions until it
-// completes). AMD assigns the same behavior to MFENCE.
-//
-// We need a fence before the initial RDTSC to prevent earlier instructions
-// from leaking into the region, and arguably another after RDTSC to avoid
-// region instructions from completing before the timestamp is recorded.
-// When surrounded by fences, the additional RDTSCP half-fence provides no
-// benefit, so the initial timestamp can be recorded via RDTSC, which has
-// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
-// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
-//
-// Using Start+Start leads to higher variance and overhead than Stop+Stop.
-// However, Stop+Stop includes an LFENCE in the region measurements, which
-// adds a delay dependent on earlier loads. The combination of Start+Stop
-// is faster than Start+Start and more consistent than Stop+Stop because
-// the first LFENCE already delayed subsequent loads before the measured
-// region. This combination seems not to have been considered in prior work:
-// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
-//
-// Note: performance counters can measure 'exact' instructions-retired or
-// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
-// requires fences. Unfortunately, it is not accessible on all OSes and we
-// prefer to avoid kernel-mode drivers. Performance counters are also affected
-// by several under/over-count errata, so we use the TSC instead.
-
-// Primary templates; must use one of the specializations.
-template <typename T>
-inline T Start();
-
-template <typename T>
-inline T Stop();
-
-template <>
-inline uint64_t Start<uint64_t>() {
- uint64_t t;
-#if HH_ARCH_PPC
- asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
-#elif HH_ARCH_AARCH64
- asm volatile("mrs %0, cntvct_el0" : "=r"(t));
-#elif HH_ARCH_X64 && HH_MSC_VERSION
- _mm_lfence();
- HH_COMPILER_FENCE;
- t = __rdtsc();
- _mm_lfence();
- HH_COMPILER_FENCE;
-#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION)
- asm volatile(
- "lfence\n\t"
- "rdtsc\n\t"
- "shl $32, %%rdx\n\t"
- "or %%rdx, %0\n\t"
- "lfence"
- : "=a"(t)
- :
- // "memory" avoids reordering. rdx = TSC >> 32.
- // "cc" = flags modified by SHL.
- : "rdx", "memory", "cc");
-#else
-#error "Port"
-#endif
- return t;
-}
-
-template <>
-inline uint64_t Stop<uint64_t>() {
- uint64_t t;
-#if HH_ARCH_PPC
- asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
-#elif HH_ARCH_AARCH64
- asm volatile("mrs %0, cntvct_el0" : "=r"(t));
-#elif HH_ARCH_X64 && HH_MSC_VERSION
- HH_COMPILER_FENCE;
- unsigned aux;
- t = __rdtscp(&aux);
- _mm_lfence();
- HH_COMPILER_FENCE;
-#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION)
- // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
- asm volatile(
- "rdtscp\n\t"
- "shl $32, %%rdx\n\t"
- "or %%rdx, %0\n\t"
- "lfence"
- : "=a"(t)
- :
- // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
- // "cc" = flags modified by SHL.
- : "rcx", "rdx", "memory", "cc");
-#else
-#error "Port"
-#endif
- return t;
-}
-
-// Returns a 32-bit timestamp with about 4 cycles less overhead than
-// Start<uint64_t>. Only suitable for measuring very short regions because the
-// timestamp overflows about once a second.
-template <>
-inline uint32_t Start<uint32_t>() {
- uint32_t t;
-#if HH_ARCH_X64 && HH_MSC_VERSION
- _mm_lfence();
- HH_COMPILER_FENCE;
- t = static_cast<uint32_t>(__rdtsc());
- _mm_lfence();
- HH_COMPILER_FENCE;
-#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION)
- asm volatile(
- "lfence\n\t"
- "rdtsc\n\t"
- "lfence"
- : "=a"(t)
- :
- // "memory" avoids reordering. rdx = TSC >> 32.
- : "rdx", "memory");
-#else
- t = static_cast<uint32_t>(Start<uint64_t>());
-#endif
- return t;
-}
-
-template <>
-inline uint32_t Stop<uint32_t>() {
- uint32_t t;
-#if HH_ARCH_X64 && HH_MSC_VERSION
- HH_COMPILER_FENCE;
- unsigned aux;
- t = static_cast<uint32_t>(__rdtscp(&aux));
- _mm_lfence();
- HH_COMPILER_FENCE;
-#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION)
- // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
- asm volatile(
- "rdtscp\n\t"
- "lfence"
- : "=a"(t)
- :
- // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
- : "rcx", "rdx", "memory");
-#else
- t = static_cast<uint32_t>(Stop<uint64_t>());
-#endif
- return t;
-}
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_TSC_TIMER_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_TSC_TIMER_H_
+#define HIGHWAYHASH_TSC_TIMER_H_
+
+// High-resolution (~10 ns) timestamps, using fences to prevent reordering and
+// ensure exactly the desired regions are measured.
+
+#include <stdint.h>
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+
+#if HH_ARCH_X64 && HH_MSC_VERSION
+#include <emmintrin.h> // _mm_lfence
+#include <intrin.h>
+#endif
+
+namespace highwayhash {
+
+// Start/Stop return absolute timestamps and must be placed immediately before
+// and after the region to measure. We provide separate Start/Stop functions
+// because they use different fences.
+//
+// Background: RDTSC is not 'serializing'; earlier instructions may complete
+// after it, and/or later instructions may complete before it. 'Fences' ensure
+// regions' elapsed times are independent of such reordering. The only
+// documented unprivileged serializing instruction is CPUID, which acts as a
+// full fence (no reordering across it in either direction). Unfortunately
+// the latency of CPUID varies wildly (perhaps made worse by not initializing
+// its EAX input). Because it cannot reliably be deducted from the region's
+// elapsed time, it must not be included in the region to measure (i.e.
+// between the two RDTSC).
+//
+// The newer RDTSCP is sometimes described as serializing, but it actually
+// only serves as a half-fence with release semantics. Although all
+// instructions in the region will complete before the final timestamp is
+// captured, subsequent instructions may leak into the region and increase the
+// elapsed time. Inserting another fence after the final RDTSCP would prevent
+// such reordering without affecting the measured region.
+//
+// Fortunately, such a fence exists. The LFENCE instruction is only documented
+// to delay later loads until earlier loads are visible. However, Intel's
+// reference manual says it acts as a full fence (waiting until all earlier
+// instructions have completed, and delaying later instructions until it
+// completes). AMD assigns the same behavior to MFENCE.
+//
+// We need a fence before the initial RDTSC to prevent earlier instructions
+// from leaking into the region, and arguably another after RDTSC to avoid
+// region instructions from completing before the timestamp is recorded.
+// When surrounded by fences, the additional RDTSCP half-fence provides no
+// benefit, so the initial timestamp can be recorded via RDTSC, which has
+// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
+// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
+//
+// Using Start+Start leads to higher variance and overhead than Stop+Stop.
+// However, Stop+Stop includes an LFENCE in the region measurements, which
+// adds a delay dependent on earlier loads. The combination of Start+Stop
+// is faster than Start+Start and more consistent than Stop+Stop because
+// the first LFENCE already delayed subsequent loads before the measured
+// region. This combination seems not to have been considered in prior work:
+// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
+//
+// Note: performance counters can measure 'exact' instructions-retired or
+// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
+// requires fences. Unfortunately, it is not accessible on all OSes and we
+// prefer to avoid kernel-mode drivers. Performance counters are also affected
+// by several under/over-count errata, so we use the TSC instead.
+
+// Primary templates; must use one of the specializations.
+template <typename T>
+inline T Start();
+
+template <typename T>
+inline T Stop();
+
+template <>
+inline uint64_t Start<uint64_t>() {
+ uint64_t t;
+#if HH_ARCH_PPC
+ asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HH_ARCH_AARCH64
+ asm volatile("mrs %0, cntvct_el0" : "=r"(t));
+#elif HH_ARCH_X64 && HH_MSC_VERSION
+ _mm_lfence();
+ HH_COMPILER_FENCE;
+ t = __rdtsc();
+ _mm_lfence();
+ HH_COMPILER_FENCE;
+#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION)
+ asm volatile(
+ "lfence\n\t"
+ "rdtsc\n\t"
+ "shl $32, %%rdx\n\t"
+ "or %%rdx, %0\n\t"
+ "lfence"
+ : "=a"(t)
+ :
+ // "memory" avoids reordering. rdx = TSC >> 32.
+ // "cc" = flags modified by SHL.
+ : "rdx", "memory", "cc");
+#else
+#error "Port"
+#endif
+ return t;
+}
+
+template <>
+inline uint64_t Stop<uint64_t>() {
+ uint64_t t;
+#if HH_ARCH_PPC
+ asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HH_ARCH_AARCH64
+ asm volatile("mrs %0, cntvct_el0" : "=r"(t));
+#elif HH_ARCH_X64 && HH_MSC_VERSION
+ HH_COMPILER_FENCE;
+ unsigned aux;
+ t = __rdtscp(&aux);
+ _mm_lfence();
+ HH_COMPILER_FENCE;
+#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION)
+ // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
+ asm volatile(
+ "rdtscp\n\t"
+ "shl $32, %%rdx\n\t"
+ "or %%rdx, %0\n\t"
+ "lfence"
+ : "=a"(t)
+ :
+ // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
+ // "cc" = flags modified by SHL.
+ : "rcx", "rdx", "memory", "cc");
+#else
+#error "Port"
+#endif
+ return t;
+}
+
+// Returns a 32-bit timestamp with about 4 cycles less overhead than
+// Start<uint64_t>. Only suitable for measuring very short regions because the
+// timestamp overflows about once a second.
+template <>
+inline uint32_t Start<uint32_t>() {
+ uint32_t t;
+#if HH_ARCH_X64 && HH_MSC_VERSION
+ _mm_lfence();
+ HH_COMPILER_FENCE;
+ t = static_cast<uint32_t>(__rdtsc());
+ _mm_lfence();
+ HH_COMPILER_FENCE;
+#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION)
+ asm volatile(
+ "lfence\n\t"
+ "rdtsc\n\t"
+ "lfence"
+ : "=a"(t)
+ :
+ // "memory" avoids reordering. rdx = TSC >> 32.
+ : "rdx", "memory");
+#else
+ t = static_cast<uint32_t>(Start<uint64_t>());
+#endif
+ return t;
+}
+
+template <>
+inline uint32_t Stop<uint32_t>() {
+ uint32_t t;
+#if HH_ARCH_X64 && HH_MSC_VERSION
+ HH_COMPILER_FENCE;
+ unsigned aux;
+ t = static_cast<uint32_t>(__rdtscp(&aux));
+ _mm_lfence();
+ HH_COMPILER_FENCE;
+#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION)
+ // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
+ asm volatile(
+ "rdtscp\n\t"
+ "lfence"
+ : "=a"(t)
+ :
+ // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
+ : "rcx", "rdx", "memory");
+#else
+ t = static_cast<uint32_t>(Stop<uint64_t>());
+#endif
+ return t;
+}
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_TSC_TIMER_H_
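A usage sketch of the Start/Stop pairing described above, on one of the
supported targets; the measured loop is an arbitrary stand-in for a real
region of interest:

#include <stdint.h>
#include <cstdio>

#include "highwayhash/tsc_timer.h"

int main() {
  // Per the comments above: Start immediately before the region,
  // Stop immediately after it.
  const uint64_t t0 = highwayhash::Start<uint64_t>();
  volatile uint64_t sum = 0;
  for (uint64_t i = 0; i < 1000; ++i) sum += i;  // region being measured
  const uint64_t t1 = highwayhash::Stop<uint64_t>();
  printf("elapsed ticks: %llu (sum=%llu)\n",
         static_cast<unsigned long long>(t1 - t0),
         static_cast<unsigned long long>(sum));
  return 0;
}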
diff --git a/contrib/libs/highwayhash/highwayhash/vector128.h b/contrib/libs/highwayhash/highwayhash/vector128.h
index 53eb9f164c..24c30859cd 100644
--- a/contrib/libs/highwayhash/highwayhash/vector128.h
+++ b/contrib/libs/highwayhash/highwayhash/vector128.h
@@ -1,796 +1,796 @@
-// Copyright 2016 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_VECTOR128_H_
-#define HIGHWAYHASH_VECTOR128_H_
-
-// Defines SIMD vector classes ("V2x64U") with overloaded arithmetic operators:
-// const V2x64U masked_sum = (a + b) & m;
-// This is shorter and more readable than compiler intrinsics:
-// const __m128i masked_sum = _mm_and_si128(_mm_add_epi64(a, b), m);
-// There is typically no runtime cost for these abstractions.
-//
-// The naming convention is VNxBBT where N is the number of lanes, BB the
-// number of bits per lane and T is the lane type: unsigned integer (U),
-// signed integer (I), or floating-point (F).
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-
-// For auto-dependency generation, we need to include all headers but not their
-// contents (otherwise compilation fails because -msse4.1 is not specified).
-#ifndef HH_DISABLE_TARGET_SPECIFIC
-
-// WARNING: smmintrin.h will also be included through immintrin.h in the AVX2
-// translation unit, which is compiled with different flags. This risks ODR
-// violations, and can cause crashes when functions are not inlined and the
-// linker selects the AVX2 version. Unfortunately this include cannot reside
-// within a namespace due to conflicts with other system headers. We need to
-// assume all the intrinsic functions (defined as static inline by Clang's
-// library and as extern inline by GCC) are in fact inlined. targets.bzl
-// generates a test that verifies this by detecting duplicate symbols.
-#include <smmintrin.h> // SSE4.1
-
-namespace highwayhash {
-// To prevent ODR violations when including this from multiple translation
-// units (TU) that are compiled with different flags, the contents must reside
-// in a namespace whose name is unique to the TU. NOTE: this behavior is
-// incompatible with precompiled modules and requires textual inclusion instead.
-namespace HH_TARGET_NAME {
-
-// Primary template for 128-bit SSE4.1 vectors; only specializations are used.
-template <typename T>
-class V128 {};
-
-template <>
-class V128<uint8_t> {
- public:
- using Intrinsic = __m128i;
- using T = uint8_t;
- static constexpr size_t N = 16;
-
- // Leaves v_ uninitialized - typically used for output parameters.
- HH_INLINE V128() {}
-
- // Broadcasts i to all lanes (usually by loading from memory).
- HH_INLINE explicit V128(T i) : v_(_mm_set1_epi8(i)) {}
-
- // Copy from other vector.
- HH_INLINE explicit V128(const V128& other) : v_(other.v_) {}
- template <typename U>
- HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
- HH_INLINE V128& operator=(const V128& other) {
- v_ = other.v_;
- return *this;
- }
-
- // Convert from/to intrinsics.
- HH_INLINE V128(const Intrinsic& v) : v_(v) {}
- HH_INLINE V128& operator=(const Intrinsic& v) {
- v_ = v;
- return *this;
- }
- HH_INLINE operator Intrinsic() const { return v_; }
-
- // There are no greater-than comparison instructions for unsigned T.
- HH_INLINE V128 operator==(const V128& other) const {
- return V128(_mm_cmpeq_epi8(v_, other.v_));
- }
-
- HH_INLINE V128& operator+=(const V128& other) {
- v_ = _mm_add_epi8(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator-=(const V128& other) {
- v_ = _mm_sub_epi8(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V128& operator&=(const V128& other) {
- v_ = _mm_and_si128(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator|=(const V128& other) {
- v_ = _mm_or_si128(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator^=(const V128& other) {
- v_ = _mm_xor_si128(v_, other.v_);
- return *this;
- }
-
- private:
- Intrinsic v_;
-};
-
-template <>
-class V128<uint16_t> {
- public:
- using Intrinsic = __m128i;
- using T = uint16_t;
- static constexpr size_t N = 8;
-
- // Leaves v_ uninitialized - typically used for output parameters.
- HH_INLINE V128() {}
-
- // Lane 0 (p_0) is the lowest.
- HH_INLINE V128(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0)
- : v_(_mm_set_epi16(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {}
-
- // Broadcasts i to all lanes (usually by loading from memory).
- HH_INLINE explicit V128(T i) : v_(_mm_set1_epi16(i)) {}
-
- // Copy from other vector.
- HH_INLINE explicit V128(const V128& other) : v_(other.v_) {}
- template <typename U>
- HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
- HH_INLINE V128& operator=(const V128& other) {
- v_ = other.v_;
- return *this;
- }
-
- // Convert from/to intrinsics.
- HH_INLINE V128(const Intrinsic& v) : v_(v) {}
- HH_INLINE V128& operator=(const Intrinsic& v) {
- v_ = v;
- return *this;
- }
- HH_INLINE operator Intrinsic() const { return v_; }
-
- // There are no greater-than comparison instructions for unsigned T.
- HH_INLINE V128 operator==(const V128& other) const {
- return V128(_mm_cmpeq_epi16(v_, other.v_));
- }
-
- HH_INLINE V128& operator+=(const V128& other) {
- v_ = _mm_add_epi16(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator-=(const V128& other) {
- v_ = _mm_sub_epi16(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V128& operator&=(const V128& other) {
- v_ = _mm_and_si128(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator|=(const V128& other) {
- v_ = _mm_or_si128(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator^=(const V128& other) {
- v_ = _mm_xor_si128(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V128& operator<<=(const int count) {
- v_ = _mm_slli_epi16(v_, count);
- return *this;
- }
- HH_INLINE V128& operator<<=(const Intrinsic& count) {
- v_ = _mm_sll_epi16(v_, count);
- return *this;
- }
-
- HH_INLINE V128& operator>>=(const int count) {
- v_ = _mm_srli_epi16(v_, count);
- return *this;
- }
- HH_INLINE V128& operator>>=(const Intrinsic& count) {
- v_ = _mm_srl_epi16(v_, count);
- return *this;
- }
-
- private:
- Intrinsic v_;
-};
-
-template <>
-class V128<uint32_t> {
- public:
- using Intrinsic = __m128i;
- using T = uint32_t;
- static constexpr size_t N = 4;
-
- // Leaves v_ uninitialized - typically used for output parameters.
- HH_INLINE V128() {}
-
- // Lane 0 (p_0) is the lowest.
- HH_INLINE V128(T p_3, T p_2, T p_1, T p_0)
- : v_(_mm_set_epi32(p_3, p_2, p_1, p_0)) {}
-
- // Broadcasts i to all lanes (usually by loading from memory).
- HH_INLINE explicit V128(T i) : v_(_mm_set1_epi32(i)) {}
-
- // Copy from other vector.
- HH_INLINE explicit V128(const V128& other) : v_(other.v_) {}
- template <typename U>
- HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
- HH_INLINE V128& operator=(const V128& other) {
- v_ = other.v_;
- return *this;
- }
-
- // Convert from/to intrinsics.
- HH_INLINE V128(const Intrinsic& v) : v_(v) {}
- HH_INLINE V128& operator=(const Intrinsic& v) {
- v_ = v;
- return *this;
- }
- HH_INLINE operator Intrinsic() const { return v_; }
-
- // There are no greater-than comparison instructions for unsigned T.
- HH_INLINE V128 operator==(const V128& other) const {
- return V128(_mm_cmpeq_epi32(v_, other.v_));
- }
-
- HH_INLINE V128& operator+=(const V128& other) {
- v_ = _mm_add_epi32(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator-=(const V128& other) {
- v_ = _mm_sub_epi32(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V128& operator&=(const V128& other) {
- v_ = _mm_and_si128(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator|=(const V128& other) {
- v_ = _mm_or_si128(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator^=(const V128& other) {
- v_ = _mm_xor_si128(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V128& operator<<=(const int count) {
- v_ = _mm_slli_epi32(v_, count);
- return *this;
- }
- HH_INLINE V128& operator<<=(const Intrinsic& count) {
- v_ = _mm_sll_epi32(v_, count);
- return *this;
- }
-
- HH_INLINE V128& operator>>=(const int count) {
- v_ = _mm_srli_epi32(v_, count);
- return *this;
- }
- HH_INLINE V128& operator>>=(const Intrinsic& count) {
- v_ = _mm_srl_epi32(v_, count);
- return *this;
- }
-
- private:
- Intrinsic v_;
-};
-
-template <>
-class V128<uint64_t> {
- public:
- using Intrinsic = __m128i;
- using T = uint64_t;
- static constexpr size_t N = 2;
-
- // Leaves v_ uninitialized - typically used for output parameters.
- HH_INLINE V128() {}
-
- // Lane 0 (p_0) is the lowest.
- HH_INLINE V128(T p_1, T p_0) : v_(_mm_set_epi64x(p_1, p_0)) {}
-
- // Broadcasts i to all lanes (usually by loading from memory).
- HH_INLINE explicit V128(T i) : v_(_mm_set_epi64x(i, i)) {}
-
- // Copy from other vector.
- HH_INLINE explicit V128(const V128& other) : v_(other.v_) {}
- template <typename U>
- HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
- HH_INLINE V128& operator=(const V128& other) {
- v_ = other.v_;
- return *this;
- }
-
- // Convert from/to intrinsics.
- HH_INLINE V128(const Intrinsic& v) : v_(v) {}
- HH_INLINE V128& operator=(const Intrinsic& v) {
- v_ = v;
- return *this;
- }
- HH_INLINE operator Intrinsic() const { return v_; }
-
- // There are no greater-than comparison instructions for unsigned T.
- HH_INLINE V128 operator==(const V128& other) const {
- return V128(_mm_cmpeq_epi64(v_, other.v_));
- }
-
- HH_INLINE V128& operator+=(const V128& other) {
- v_ = _mm_add_epi64(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator-=(const V128& other) {
- v_ = _mm_sub_epi64(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V128& operator&=(const V128& other) {
- v_ = _mm_and_si128(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator|=(const V128& other) {
- v_ = _mm_or_si128(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator^=(const V128& other) {
- v_ = _mm_xor_si128(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V128& operator<<=(const int count) {
- v_ = _mm_slli_epi64(v_, count);
- return *this;
- }
- HH_INLINE V128& operator<<=(const Intrinsic& count) {
- v_ = _mm_sll_epi64(v_, count);
- return *this;
- }
-
- HH_INLINE V128& operator>>=(const int count) {
- v_ = _mm_srli_epi64(v_, count);
- return *this;
- }
- HH_INLINE V128& operator>>=(const Intrinsic& count) {
- v_ = _mm_srl_epi64(v_, count);
- return *this;
- }
-
- private:
- Intrinsic v_;
-};
-
-template <>
-class V128<float> {
- public:
- using Intrinsic = __m128;
- using T = float;
- static constexpr size_t N = 4;
-
- // Leaves v_ uninitialized - typically used for output parameters.
- HH_INLINE V128() {}
-
- // Lane 0 (p_0) is the lowest.
- HH_INLINE V128(T p_3, T p_2, T p_1, T p_0)
- : v_(_mm_set_ps(p_3, p_2, p_1, p_0)) {}
-
- // Broadcasts to all lanes.
- HH_INLINE explicit V128(T f) : v_(_mm_set1_ps(f)) {}
-
- // Copy from other vector.
- HH_INLINE explicit V128(const V128& other) : v_(other.v_) {}
- template <typename U>
- HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
- HH_INLINE V128& operator=(const V128& other) {
- v_ = other.v_;
- return *this;
- }
-
- // Convert from/to intrinsics.
- HH_INLINE V128(const Intrinsic& v) : v_(v) {}
- HH_INLINE V128& operator=(const Intrinsic& v) {
- v_ = v;
- return *this;
- }
- HH_INLINE operator Intrinsic() const { return v_; }
-
- HH_INLINE V128 operator==(const V128& other) const {
- return V128(_mm_cmpeq_ps(v_, other.v_));
- }
- HH_INLINE V128 operator<(const V128& other) const {
- return V128(_mm_cmplt_ps(v_, other.v_));
- }
- HH_INLINE V128 operator>(const V128& other) const {
- return V128(_mm_cmplt_ps(other.v_, v_));
- }
-
- HH_INLINE V128& operator*=(const V128& other) {
- v_ = _mm_mul_ps(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator/=(const V128& other) {
- v_ = _mm_div_ps(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator+=(const V128& other) {
- v_ = _mm_add_ps(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator-=(const V128& other) {
- v_ = _mm_sub_ps(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V128& operator&=(const V128& other) {
- v_ = _mm_and_ps(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator|=(const V128& other) {
- v_ = _mm_or_ps(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator^=(const V128& other) {
- v_ = _mm_xor_ps(v_, other.v_);
- return *this;
- }
-
- private:
- Intrinsic v_;
-};
-
-template <>
-class V128<double> {
- public:
- using Intrinsic = __m128d;
- using T = double;
- static constexpr size_t N = 2;
-
- // Leaves v_ uninitialized - typically used for output parameters.
- HH_INLINE V128() {}
-
- // Lane 0 (p_0) is the lowest.
- HH_INLINE V128(T p_1, T p_0) : v_(_mm_set_pd(p_1, p_0)) {}
-
- // Broadcasts to all lanes.
- HH_INLINE explicit V128(T f) : v_(_mm_set1_pd(f)) {}
-
- // Copy from other vector.
- HH_INLINE explicit V128(const V128& other) : v_(other.v_) {}
- template <typename U>
- HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
- HH_INLINE V128& operator=(const V128& other) {
- v_ = other.v_;
- return *this;
- }
-
- // Convert from/to intrinsics.
- HH_INLINE V128(const Intrinsic& v) : v_(v) {}
- HH_INLINE V128& operator=(const Intrinsic& v) {
- v_ = v;
- return *this;
- }
- HH_INLINE operator Intrinsic() const { return v_; }
-
- HH_INLINE V128 operator==(const V128& other) const {
- return V128(_mm_cmpeq_pd(v_, other.v_));
- }
- HH_INLINE V128 operator<(const V128& other) const {
- return V128(_mm_cmplt_pd(v_, other.v_));
- }
- HH_INLINE V128 operator>(const V128& other) const {
- return V128(_mm_cmplt_pd(other.v_, v_));
- }
-
- HH_INLINE V128& operator*=(const V128& other) {
- v_ = _mm_mul_pd(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator/=(const V128& other) {
- v_ = _mm_div_pd(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator+=(const V128& other) {
- v_ = _mm_add_pd(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator-=(const V128& other) {
- v_ = _mm_sub_pd(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V128& operator&=(const V128& other) {
- v_ = _mm_and_pd(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator|=(const V128& other) {
- v_ = _mm_or_pd(v_, other.v_);
- return *this;
- }
- HH_INLINE V128& operator^=(const V128& other) {
- v_ = _mm_xor_pd(v_, other.v_);
- return *this;
- }
-
- private:
- Intrinsic v_;
-};
-
-// Nonmember functions for any V128 via member functions.
-
-template <typename T>
-HH_INLINE V128<T> operator*(const V128<T>& left, const V128<T>& right) {
- V128<T> t(left);
- return t *= right;
-}
-
-template <typename T>
-HH_INLINE V128<T> operator/(const V128<T>& left, const V128<T>& right) {
- V128<T> t(left);
- return t /= right;
-}
-
-template <typename T>
-HH_INLINE V128<T> operator+(const V128<T>& left, const V128<T>& right) {
- V128<T> t(left);
- return t += right;
-}
-
-template <typename T>
-HH_INLINE V128<T> operator-(const V128<T>& left, const V128<T>& right) {
- V128<T> t(left);
- return t -= right;
-}
-
-template <typename T>
-HH_INLINE V128<T> operator&(const V128<T>& left, const V128<T>& right) {
- V128<T> t(left);
- return t &= right;
-}
-
-template <typename T>
-HH_INLINE V128<T> operator|(const V128<T>& left, const V128<T>& right) {
- V128<T> t(left);
- return t |= right;
-}
-
-template <typename T>
-HH_INLINE V128<T> operator^(const V128<T>& left, const V128<T>& right) {
- V128<T> t(left);
- return t ^= right;
-}
-
-template <typename T>
-HH_INLINE V128<T> operator<<(const V128<T>& v, const int count) {
- V128<T> t(v);
- return t <<= count;
-}
-
-template <typename T>
-HH_INLINE V128<T> operator>>(const V128<T>& v, const int count) {
- V128<T> t(v);
- return t >>= count;
-}
-
-template <typename T>
-HH_INLINE V128<T> operator<<(const V128<T>& v, const __m128i& count) {
- V128<T> t(v);
- return t <<= count;
-}
-
-template <typename T>
-HH_INLINE V128<T> operator>>(const V128<T>& v, const __m128i& count) {
- V128<T> t(v);
- return t >>= count;
-}
-
-using V16x8U = V128<uint8_t>;
-using V8x16U = V128<uint16_t>;
-using V4x32U = V128<uint32_t>;
-using V2x64U = V128<uint64_t>;
-using V4x32F = V128<float>;
-using V2x64F = V128<double>;
-
-// Load/Store for any V128.
-
-// We differentiate between targets' vector types via template specialization.
-// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may
-// generate better code in unoptimized builds. Only declare the primary
-// templates to avoid needing mutual exclusion with vector256.
-
-template <class V>
-HH_INLINE V Load(const typename V::T* const HH_RESTRICT from);
-
-template <class V>
-HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from);
-
-// "from" must be vector-aligned.
-template <>
-HH_INLINE V16x8U Load<V16x8U>(const V16x8U::T* const HH_RESTRICT from) {
- const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
- return V16x8U(_mm_load_si128(p));
-}
-template <>
-HH_INLINE V8x16U Load<V8x16U>(const V8x16U::T* const HH_RESTRICT from) {
- const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
- return V8x16U(_mm_load_si128(p));
-}
-template <>
-HH_INLINE V4x32U Load<V4x32U>(const V4x32U::T* const HH_RESTRICT from) {
- const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
- return V4x32U(_mm_load_si128(p));
-}
-template <>
-HH_INLINE V2x64U Load<V2x64U>(const V2x64U::T* const HH_RESTRICT from) {
- const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
- return V2x64U(_mm_load_si128(p));
-}
-template <>
-HH_INLINE V4x32F Load<V4x32F>(const V4x32F::T* const HH_RESTRICT from) {
- return V4x32F(_mm_load_ps(from));
-}
-template <>
-HH_INLINE V2x64F Load<V2x64F>(const V2x64F::T* const HH_RESTRICT from) {
- return V2x64F(_mm_load_pd(from));
-}
-
-template <>
-HH_INLINE V16x8U
-LoadUnaligned<V16x8U>(const V16x8U::T* const HH_RESTRICT from) {
- const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
- return V16x8U(_mm_loadu_si128(p));
-}
-template <>
-HH_INLINE V8x16U
-LoadUnaligned<V8x16U>(const V8x16U::T* const HH_RESTRICT from) {
- const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
- return V8x16U(_mm_loadu_si128(p));
-}
-template <>
-HH_INLINE V4x32U
-LoadUnaligned<V4x32U>(const V4x32U::T* const HH_RESTRICT from) {
- const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
- return V4x32U(_mm_loadu_si128(p));
-}
-template <>
-HH_INLINE V2x64U
-LoadUnaligned<V2x64U>(const V2x64U::T* const HH_RESTRICT from) {
- const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
- return V2x64U(_mm_loadu_si128(p));
-}
-template <>
-HH_INLINE V4x32F
-LoadUnaligned<V4x32F>(const V4x32F::T* const HH_RESTRICT from) {
- return V4x32F(_mm_loadu_ps(from));
-}
-template <>
-HH_INLINE V2x64F
-LoadUnaligned<V2x64F>(const V2x64F::T* const HH_RESTRICT from) {
- return V2x64F(_mm_loadu_pd(from));
-}
-
-// "to" must be vector-aligned.
-template <typename T>
-HH_INLINE void Store(const V128<T>& v, T* const HH_RESTRICT to) {
- _mm_store_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v);
-}
-HH_INLINE void Store(const V128<float>& v, float* const HH_RESTRICT to) {
- _mm_store_ps(to, v);
-}
-HH_INLINE void Store(const V128<double>& v, double* const HH_RESTRICT to) {
- _mm_store_pd(to, v);
-}
-
-template <typename T>
-HH_INLINE void StoreUnaligned(const V128<T>& v, T* const HH_RESTRICT to) {
- _mm_storeu_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v);
-}
-HH_INLINE void StoreUnaligned(const V128<float>& v,
- float* const HH_RESTRICT to) {
- _mm_storeu_ps(to, v);
-}
-HH_INLINE void StoreUnaligned(const V128<double>& v,
- double* const HH_RESTRICT to) {
- _mm_storeu_pd(to, v);
-}
-
-// Writes directly to (aligned) memory, bypassing the cache. This is useful for
-// data that will not be read again in the near future.
-template <typename T>
-HH_INLINE void Stream(const V128<T>& v, T* const HH_RESTRICT to) {
- _mm_stream_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v);
-}
-HH_INLINE void Stream(const V128<float>& v, float* const HH_RESTRICT to) {
- _mm_stream_ps(to, v);
-}
-HH_INLINE void Stream(const V128<double>& v, double* const HH_RESTRICT to) {
- _mm_stream_pd(to, v);
-}
-
-// Miscellaneous functions.
-
-template <typename T>
-HH_INLINE V128<T> RotateLeft(const V128<T>& v, const int count) {
- constexpr size_t num_bits = sizeof(T) * 8;
- return (v << count) | (v >> (num_bits - count));
-}
-
-template <typename T>
-HH_INLINE V128<T> AndNot(const V128<T>& neg_mask, const V128<T>& values) {
- return V128<T>(_mm_andnot_si128(neg_mask, values));
-}
-template <>
-HH_INLINE V128<float> AndNot(const V128<float>& neg_mask,
- const V128<float>& values) {
- return V128<float>(_mm_andnot_ps(neg_mask, values));
-}
-template <>
-HH_INLINE V128<double> AndNot(const V128<double>& neg_mask,
- const V128<double>& values) {
- return V128<double>(_mm_andnot_pd(neg_mask, values));
-}
-
-HH_INLINE V4x32F Select(const V4x32F& a, const V4x32F& b, const V4x32F& mask) {
- return V4x32F(_mm_blendv_ps(a, b, mask));
-}
-
-HH_INLINE V2x64F Select(const V2x64F& a, const V2x64F& b, const V2x64F& mask) {
- return V2x64F(_mm_blendv_pd(a, b, mask));
-}
-
-// Min/Max
-
-HH_INLINE V16x8U Min(const V16x8U& v0, const V16x8U& v1) {
- return V16x8U(_mm_min_epu8(v0, v1));
-}
-
-HH_INLINE V16x8U Max(const V16x8U& v0, const V16x8U& v1) {
- return V16x8U(_mm_max_epu8(v0, v1));
-}
-
-HH_INLINE V8x16U Min(const V8x16U& v0, const V8x16U& v1) {
- return V8x16U(_mm_min_epu16(v0, v1));
-}
-
-HH_INLINE V8x16U Max(const V8x16U& v0, const V8x16U& v1) {
- return V8x16U(_mm_max_epu16(v0, v1));
-}
-
-HH_INLINE V4x32U Min(const V4x32U& v0, const V4x32U& v1) {
- return V4x32U(_mm_min_epu32(v0, v1));
-}
-
-HH_INLINE V4x32U Max(const V4x32U& v0, const V4x32U& v1) {
- return V4x32U(_mm_max_epu32(v0, v1));
-}
-
-HH_INLINE V4x32F Min(const V4x32F& v0, const V4x32F& v1) {
- return V4x32F(_mm_min_ps(v0, v1));
-}
-
-HH_INLINE V4x32F Max(const V4x32F& v0, const V4x32F& v1) {
- return V4x32F(_mm_max_ps(v0, v1));
-}
-
-HH_INLINE V2x64F Min(const V2x64F& v0, const V2x64F& v1) {
- return V2x64F(_mm_min_pd(v0, v1));
-}
-
-HH_INLINE V2x64F Max(const V2x64F& v0, const V2x64F& v1) {
- return V2x64F(_mm_max_pd(v0, v1));
-}
-
-} // namespace HH_TARGET_NAME
-} // namespace highwayhash
-
-#endif // HH_DISABLE_TARGET_SPECIFIC
-#endif // HIGHWAYHASH_VECTOR128_H_
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_VECTOR128_H_
+#define HIGHWAYHASH_VECTOR128_H_
+
+// Defines SIMD vector classes ("V2x64U") with overloaded arithmetic operators:
+// const V2x64U masked_sum = (a + b) & m;
+// This is shorter and more readable than compiler intrinsics:
+// const __m128i masked_sum = _mm_and_si128(_mm_add_epi64(a, b), m);
+// There is typically no runtime cost for these abstractions.
+//
+// The naming convention is VNxBBT where N is the number of lanes, BB the
+// number of bits per lane and T is the lane type: unsigned integer (U),
+// signed integer (I), or floating-point (F).
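+// For example, "V2x64U" has 2 lanes of 64-bit unsigned integers, and "V4x32F"
+// has 4 lanes of 32-bit floats.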
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents (otherwise compilation fails because -msse4.1 is not specified).
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+// WARNING: smmintrin.h will also be included through immintrin.h in the AVX2
+// translation unit, which is compiled with different flags. This risks ODR
+// violations, and can cause crashes when functions are not inlined and the
+// linker selects the AVX2 version. Unfortunately this include cannot reside
+// within a namespace due to conflicts with other system headers. We need to
+// assume all the intrinsic functions (defined as static inline by Clang's
+// library and as extern inline by GCC) are in fact inlined. targets.bzl
+// generates a test that verifies this by detecting duplicate symbols.
+#include <smmintrin.h> // SSE4.1
+
+namespace highwayhash {
+// To prevent ODR violations when including this from multiple translation
+// units (TU) that are compiled with different flags, the contents must reside
+// in a namespace whose name is unique to the TU. NOTE: this behavior is
+// incompatible with precompiled modules and requires textual inclusion instead.
+namespace HH_TARGET_NAME {
+
+// Primary template for 128-bit SSE4.1 vectors; only specializations are used.
+template <typename T>
+class V128 {};
+
+template <>
+class V128<uint8_t> {
+ public:
+ using Intrinsic = __m128i;
+ using T = uint8_t;
+ static constexpr size_t N = 16;
+
+ // Leaves v_ uninitialized - typically used for output parameters.
+ HH_INLINE V128() {}
+
+ // Broadcasts i to all lanes (usually by loading from memory).
+ HH_INLINE explicit V128(T i) : v_(_mm_set1_epi8(i)) {}
+
+ // Copy from other vector.
+ HH_INLINE explicit V128(const V128& other) : v_(other.v_) {}
+ template <typename U>
+ HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
+ HH_INLINE V128& operator=(const V128& other) {
+ v_ = other.v_;
+ return *this;
+ }
+
+ // Convert from/to intrinsics.
+ HH_INLINE V128(const Intrinsic& v) : v_(v) {}
+ HH_INLINE V128& operator=(const Intrinsic& v) {
+ v_ = v;
+ return *this;
+ }
+ HH_INLINE operator Intrinsic() const { return v_; }
+
+ // There are no greater-than comparison instructions for unsigned T.
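+ // (If needed, a > b for unsigned lanes can be emulated by XOR-ing both
+ // operands with the sign bit and then using the signed comparison.)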
+ HH_INLINE V128 operator==(const V128& other) const {
+ return V128(_mm_cmpeq_epi8(v_, other.v_));
+ }
+
+ HH_INLINE V128& operator+=(const V128& other) {
+ v_ = _mm_add_epi8(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator-=(const V128& other) {
+ v_ = _mm_sub_epi8(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V128& operator&=(const V128& other) {
+ v_ = _mm_and_si128(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator|=(const V128& other) {
+ v_ = _mm_or_si128(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator^=(const V128& other) {
+ v_ = _mm_xor_si128(v_, other.v_);
+ return *this;
+ }
+
+ private:
+ Intrinsic v_;
+};
+
+template <>
+class V128<uint16_t> {
+ public:
+ using Intrinsic = __m128i;
+ using T = uint16_t;
+ static constexpr size_t N = 8;
+
+ // Leaves v_ uninitialized - typically used for output parameters.
+ HH_INLINE V128() {}
+
+ // Lane 0 (p_0) is the lowest.
+ HH_INLINE V128(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0)
+ : v_(_mm_set_epi16(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {}
+
+ // Broadcasts i to all lanes (usually by loading from memory).
+ HH_INLINE explicit V128(T i) : v_(_mm_set1_epi16(i)) {}
+
+ // Copy from other vector.
+ HH_INLINE explicit V128(const V128& other) : v_(other.v_) {}
+ template <typename U>
+ HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
+ HH_INLINE V128& operator=(const V128& other) {
+ v_ = other.v_;
+ return *this;
+ }
+
+ // Convert from/to intrinsics.
+ HH_INLINE V128(const Intrinsic& v) : v_(v) {}
+ HH_INLINE V128& operator=(const Intrinsic& v) {
+ v_ = v;
+ return *this;
+ }
+ HH_INLINE operator Intrinsic() const { return v_; }
+
+ // There are no greater-than comparison instructions for unsigned T.
+ HH_INLINE V128 operator==(const V128& other) const {
+ return V128(_mm_cmpeq_epi16(v_, other.v_));
+ }
+
+ HH_INLINE V128& operator+=(const V128& other) {
+ v_ = _mm_add_epi16(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator-=(const V128& other) {
+ v_ = _mm_sub_epi16(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V128& operator&=(const V128& other) {
+ v_ = _mm_and_si128(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator|=(const V128& other) {
+ v_ = _mm_or_si128(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator^=(const V128& other) {
+ v_ = _mm_xor_si128(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V128& operator<<=(const int count) {
+ v_ = _mm_slli_epi16(v_, count);
+ return *this;
+ }
+ HH_INLINE V128& operator<<=(const Intrinsic& count) {
+ v_ = _mm_sll_epi16(v_, count);
+ return *this;
+ }
+
+ HH_INLINE V128& operator>>=(const int count) {
+ v_ = _mm_srli_epi16(v_, count);
+ return *this;
+ }
+ HH_INLINE V128& operator>>=(const Intrinsic& count) {
+ v_ = _mm_srl_epi16(v_, count);
+ return *this;
+ }
+
+ private:
+ Intrinsic v_;
+};
+
+template <>
+class V128<uint32_t> {
+ public:
+ using Intrinsic = __m128i;
+ using T = uint32_t;
+ static constexpr size_t N = 4;
+
+ // Leaves v_ uninitialized - typically used for output parameters.
+ HH_INLINE V128() {}
+
+ // Lane 0 (p_0) is the lowest.
+ HH_INLINE V128(T p_3, T p_2, T p_1, T p_0)
+ : v_(_mm_set_epi32(p_3, p_2, p_1, p_0)) {}
+
+ // Broadcasts i to all lanes (usually by loading from memory).
+ HH_INLINE explicit V128(T i) : v_(_mm_set1_epi32(i)) {}
+
+ // Copy from other vector.
+ HH_INLINE explicit V128(const V128& other) : v_(other.v_) {}
+ template <typename U>
+ HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
+ HH_INLINE V128& operator=(const V128& other) {
+ v_ = other.v_;
+ return *this;
+ }
+
+ // Convert from/to intrinsics.
+ HH_INLINE V128(const Intrinsic& v) : v_(v) {}
+ HH_INLINE V128& operator=(const Intrinsic& v) {
+ v_ = v;
+ return *this;
+ }
+ HH_INLINE operator Intrinsic() const { return v_; }
+
+ // There are no greater-than comparison instructions for unsigned T.
+ HH_INLINE V128 operator==(const V128& other) const {
+ return V128(_mm_cmpeq_epi32(v_, other.v_));
+ }
+
+ HH_INLINE V128& operator+=(const V128& other) {
+ v_ = _mm_add_epi32(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator-=(const V128& other) {
+ v_ = _mm_sub_epi32(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V128& operator&=(const V128& other) {
+ v_ = _mm_and_si128(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator|=(const V128& other) {
+ v_ = _mm_or_si128(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator^=(const V128& other) {
+ v_ = _mm_xor_si128(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V128& operator<<=(const int count) {
+ v_ = _mm_slli_epi32(v_, count);
+ return *this;
+ }
+ HH_INLINE V128& operator<<=(const Intrinsic& count) {
+ v_ = _mm_sll_epi32(v_, count);
+ return *this;
+ }
+
+ HH_INLINE V128& operator>>=(const int count) {
+ v_ = _mm_srli_epi32(v_, count);
+ return *this;
+ }
+ HH_INLINE V128& operator>>=(const Intrinsic& count) {
+ v_ = _mm_srl_epi32(v_, count);
+ return *this;
+ }
+
+ private:
+ Intrinsic v_;
+};
+
+template <>
+class V128<uint64_t> {
+ public:
+ using Intrinsic = __m128i;
+ using T = uint64_t;
+ static constexpr size_t N = 2;
+
+ // Leaves v_ uninitialized - typically used for output parameters.
+ HH_INLINE V128() {}
+
+ // Lane 0 (p_0) is the lowest.
+ HH_INLINE V128(T p_1, T p_0) : v_(_mm_set_epi64x(p_1, p_0)) {}
+
+ // Broadcasts i to all lanes (usually by loading from memory).
+ HH_INLINE explicit V128(T i) : v_(_mm_set_epi64x(i, i)) {}
+
+ // Copy from other vector.
+ HH_INLINE explicit V128(const V128& other) : v_(other.v_) {}
+ template <typename U>
+ HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
+ HH_INLINE V128& operator=(const V128& other) {
+ v_ = other.v_;
+ return *this;
+ }
+
+ // Convert from/to intrinsics.
+ HH_INLINE V128(const Intrinsic& v) : v_(v) {}
+ HH_INLINE V128& operator=(const Intrinsic& v) {
+ v_ = v;
+ return *this;
+ }
+ HH_INLINE operator Intrinsic() const { return v_; }
+
+ // There are no greater-than comparison instructions for unsigned T.
+ HH_INLINE V128 operator==(const V128& other) const {
+ return V128(_mm_cmpeq_epi64(v_, other.v_));
+ }
+
+ HH_INLINE V128& operator+=(const V128& other) {
+ v_ = _mm_add_epi64(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator-=(const V128& other) {
+ v_ = _mm_sub_epi64(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V128& operator&=(const V128& other) {
+ v_ = _mm_and_si128(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator|=(const V128& other) {
+ v_ = _mm_or_si128(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator^=(const V128& other) {
+ v_ = _mm_xor_si128(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V128& operator<<=(const int count) {
+ v_ = _mm_slli_epi64(v_, count);
+ return *this;
+ }
+ HH_INLINE V128& operator<<=(const Intrinsic& count) {
+ v_ = _mm_sll_epi64(v_, count);
+ return *this;
+ }
+
+ HH_INLINE V128& operator>>=(const int count) {
+ v_ = _mm_srli_epi64(v_, count);
+ return *this;
+ }
+ HH_INLINE V128& operator>>=(const Intrinsic& count) {
+ v_ = _mm_srl_epi64(v_, count);
+ return *this;
+ }
+
+ private:
+ Intrinsic v_;
+};
+
+template <>
+class V128<float> {
+ public:
+ using Intrinsic = __m128;
+ using T = float;
+ static constexpr size_t N = 4;
+
+ // Leaves v_ uninitialized - typically used for output parameters.
+ HH_INLINE V128() {}
+
+ // Lane 0 (p_0) is the lowest.
+ HH_INLINE V128(T p_3, T p_2, T p_1, T p_0)
+ : v_(_mm_set_ps(p_3, p_2, p_1, p_0)) {}
+
+ // Broadcasts to all lanes.
+ HH_INLINE explicit V128(T f) : v_(_mm_set1_ps(f)) {}
+
+ // Copy from other vector.
+ HH_INLINE explicit V128(const V128& other) : v_(other.v_) {}
+ template <typename U>
+ HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
+ HH_INLINE V128& operator=(const V128& other) {
+ v_ = other.v_;
+ return *this;
+ }
+
+ // Convert from/to intrinsics.
+ HH_INLINE V128(const Intrinsic& v) : v_(v) {}
+ HH_INLINE V128& operator=(const Intrinsic& v) {
+ v_ = v;
+ return *this;
+ }
+ HH_INLINE operator Intrinsic() const { return v_; }
+
+ HH_INLINE V128 operator==(const V128& other) const {
+ return V128(_mm_cmpeq_ps(v_, other.v_));
+ }
+ HH_INLINE V128 operator<(const V128& other) const {
+ return V128(_mm_cmplt_ps(v_, other.v_));
+ }
+ HH_INLINE V128 operator>(const V128& other) const {
+ return V128(_mm_cmplt_ps(other.v_, v_));
+ }
+
+ HH_INLINE V128& operator*=(const V128& other) {
+ v_ = _mm_mul_ps(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator/=(const V128& other) {
+ v_ = _mm_div_ps(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator+=(const V128& other) {
+ v_ = _mm_add_ps(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator-=(const V128& other) {
+ v_ = _mm_sub_ps(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V128& operator&=(const V128& other) {
+ v_ = _mm_and_ps(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator|=(const V128& other) {
+ v_ = _mm_or_ps(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator^=(const V128& other) {
+ v_ = _mm_xor_ps(v_, other.v_);
+ return *this;
+ }
+
+ private:
+ Intrinsic v_;
+};
+
+template <>
+class V128<double> {
+ public:
+ using Intrinsic = __m128d;
+ using T = double;
+ static constexpr size_t N = 2;
+
+ // Leaves v_ uninitialized - typically used for output parameters.
+ HH_INLINE V128() {}
+
+ // Lane 0 (p_0) is the lowest.
+ HH_INLINE V128(T p_1, T p_0) : v_(_mm_set_pd(p_1, p_0)) {}
+
+ // Broadcasts to all lanes.
+ HH_INLINE explicit V128(T f) : v_(_mm_set1_pd(f)) {}
+
+ // Copy from other vector.
+ HH_INLINE explicit V128(const V128& other) : v_(other.v_) {}
+ template <typename U>
+ HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
+ HH_INLINE V128& operator=(const V128& other) {
+ v_ = other.v_;
+ return *this;
+ }
+
+ // Convert from/to intrinsics.
+ HH_INLINE V128(const Intrinsic& v) : v_(v) {}
+ HH_INLINE V128& operator=(const Intrinsic& v) {
+ v_ = v;
+ return *this;
+ }
+ HH_INLINE operator Intrinsic() const { return v_; }
+
+ HH_INLINE V128 operator==(const V128& other) const {
+ return V128(_mm_cmpeq_pd(v_, other.v_));
+ }
+ HH_INLINE V128 operator<(const V128& other) const {
+ return V128(_mm_cmplt_pd(v_, other.v_));
+ }
+ HH_INLINE V128 operator>(const V128& other) const {
+ return V128(_mm_cmplt_pd(other.v_, v_));
+ }
+
+ HH_INLINE V128& operator*=(const V128& other) {
+ v_ = _mm_mul_pd(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator/=(const V128& other) {
+ v_ = _mm_div_pd(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator+=(const V128& other) {
+ v_ = _mm_add_pd(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator-=(const V128& other) {
+ v_ = _mm_sub_pd(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V128& operator&=(const V128& other) {
+ v_ = _mm_and_pd(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator|=(const V128& other) {
+ v_ = _mm_or_pd(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V128& operator^=(const V128& other) {
+ v_ = _mm_xor_pd(v_, other.v_);
+ return *this;
+ }
+
+ private:
+ Intrinsic v_;
+};
+
+// Nonmember functions for any V128 via member functions.
+
+template <typename T>
+HH_INLINE V128<T> operator*(const V128<T>& left, const V128<T>& right) {
+ V128<T> t(left);
+ return t *= right;
+}
+
+template <typename T>
+HH_INLINE V128<T> operator/(const V128<T>& left, const V128<T>& right) {
+ V128<T> t(left);
+ return t /= right;
+}
+
+template <typename T>
+HH_INLINE V128<T> operator+(const V128<T>& left, const V128<T>& right) {
+ V128<T> t(left);
+ return t += right;
+}
+
+template <typename T>
+HH_INLINE V128<T> operator-(const V128<T>& left, const V128<T>& right) {
+ V128<T> t(left);
+ return t -= right;
+}
+
+template <typename T>
+HH_INLINE V128<T> operator&(const V128<T>& left, const V128<T>& right) {
+ V128<T> t(left);
+ return t &= right;
+}
+
+template <typename T>
+HH_INLINE V128<T> operator|(const V128<T>& left, const V128<T>& right) {
+ V128<T> t(left);
+ return t |= right;
+}
+
+template <typename T>
+HH_INLINE V128<T> operator^(const V128<T>& left, const V128<T>& right) {
+ V128<T> t(left);
+ return t ^= right;
+}
+
+template <typename T>
+HH_INLINE V128<T> operator<<(const V128<T>& v, const int count) {
+ V128<T> t(v);
+ return t <<= count;
+}
+
+template <typename T>
+HH_INLINE V128<T> operator>>(const V128<T>& v, const int count) {
+ V128<T> t(v);
+ return t >>= count;
+}
+
+template <typename T>
+HH_INLINE V128<T> operator<<(const V128<T>& v, const __m128i& count) {
+ V128<T> t(v);
+ return t <<= count;
+}
+
+template <typename T>
+HH_INLINE V128<T> operator>>(const V128<T>& v, const __m128i& count) {
+ V128<T> t(v);
+ return t >>= count;
+}
+
+using V16x8U = V128<uint8_t>;
+using V8x16U = V128<uint16_t>;
+using V4x32U = V128<uint32_t>;
+using V2x64U = V128<uint64_t>;
+using V4x32F = V128<float>;
+using V2x64F = V128<double>;
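+
+// Usage sketch (hypothetical values; uses only the classes above):
+//   const V4x32U a(4, 3, 2, 1);     // lane 0 (lowest) is 1
+//   const V4x32U ones(1u);          // broadcasts 1 to all lanes
+//   const V4x32U sum(a + ones);     // (5, 4, 3, 2) in constructor order
+//   const V4x32U zero(sum == a);    // all lanes zero: none compare equal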
+
+// Load/Store for any V128.
+
+// We differentiate between targets' vector types via template specialization.
+// Calling Load<V>(floats) is more natural than Load(V4x32F(), floats) and may
+// generate better code in unoptimized builds. Only declare the primary
+// templates to avoid needing mutual exclusion with vector256.
+
+template <class V>
+HH_INLINE V Load(const typename V::T* const HH_RESTRICT from);
+
+template <class V>
+HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from);
+
+// "from" must be vector-aligned.
+template <>
+HH_INLINE V16x8U Load<V16x8U>(const V16x8U::T* const HH_RESTRICT from) {
+ const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
+ return V16x8U(_mm_load_si128(p));
+}
+template <>
+HH_INLINE V8x16U Load<V8x16U>(const V8x16U::T* const HH_RESTRICT from) {
+ const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
+ return V8x16U(_mm_load_si128(p));
+}
+template <>
+HH_INLINE V4x32U Load<V4x32U>(const V4x32U::T* const HH_RESTRICT from) {
+ const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
+ return V4x32U(_mm_load_si128(p));
+}
+template <>
+HH_INLINE V2x64U Load<V2x64U>(const V2x64U::T* const HH_RESTRICT from) {
+ const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
+ return V2x64U(_mm_load_si128(p));
+}
+template <>
+HH_INLINE V4x32F Load<V4x32F>(const V4x32F::T* const HH_RESTRICT from) {
+ return V4x32F(_mm_load_ps(from));
+}
+template <>
+HH_INLINE V2x64F Load<V2x64F>(const V2x64F::T* const HH_RESTRICT from) {
+ return V2x64F(_mm_load_pd(from));
+}
+
+template <>
+HH_INLINE V16x8U
+LoadUnaligned<V16x8U>(const V16x8U::T* const HH_RESTRICT from) {
+ const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
+ return V16x8U(_mm_loadu_si128(p));
+}
+template <>
+HH_INLINE V8x16U
+LoadUnaligned<V8x16U>(const V8x16U::T* const HH_RESTRICT from) {
+ const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
+ return V8x16U(_mm_loadu_si128(p));
+}
+template <>
+HH_INLINE V4x32U
+LoadUnaligned<V4x32U>(const V4x32U::T* const HH_RESTRICT from) {
+ const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
+ return V4x32U(_mm_loadu_si128(p));
+}
+template <>
+HH_INLINE V2x64U
+LoadUnaligned<V2x64U>(const V2x64U::T* const HH_RESTRICT from) {
+ const __m128i* const HH_RESTRICT p = reinterpret_cast<const __m128i*>(from);
+ return V2x64U(_mm_loadu_si128(p));
+}
+template <>
+HH_INLINE V4x32F
+LoadUnaligned<V4x32F>(const V4x32F::T* const HH_RESTRICT from) {
+ return V4x32F(_mm_loadu_ps(from));
+}
+template <>
+HH_INLINE V2x64F
+LoadUnaligned<V2x64F>(const V2x64F::T* const HH_RESTRICT from) {
+ return V2x64F(_mm_loadu_pd(from));
+}
+
+// "to" must be vector-aligned.
+template <typename T>
+HH_INLINE void Store(const V128<T>& v, T* const HH_RESTRICT to) {
+ _mm_store_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v);
+}
+HH_INLINE void Store(const V128<float>& v, float* const HH_RESTRICT to) {
+ _mm_store_ps(to, v);
+}
+HH_INLINE void Store(const V128<double>& v, double* const HH_RESTRICT to) {
+ _mm_store_pd(to, v);
+}
+
+template <typename T>
+HH_INLINE void StoreUnaligned(const V128<T>& v, T* const HH_RESTRICT to) {
+ _mm_storeu_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v);
+}
+HH_INLINE void StoreUnaligned(const V128<float>& v,
+ float* const HH_RESTRICT to) {
+ _mm_storeu_ps(to, v);
+}
+HH_INLINE void StoreUnaligned(const V128<double>& v,
+ double* const HH_RESTRICT to) {
+ _mm_storeu_pd(to, v);
+}
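+
+// Load/Store usage sketch (hypothetical buffer; note the required alignment):
+//   alignas(16) uint64_t lanes[2] = {1, 2};
+//   const V2x64U v(Load<V2x64U>(lanes));  // aligned load
+//   Store(v + v, lanes);                  // lanes[] is now {2, 4}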
+
+// Writes directly to (aligned) memory, bypassing the cache. This is useful for
+// data that will not be read again in the near future.
+template <typename T>
+HH_INLINE void Stream(const V128<T>& v, T* const HH_RESTRICT to) {
+ _mm_stream_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v);
+}
+HH_INLINE void Stream(const V128<float>& v, float* const HH_RESTRICT to) {
+ _mm_stream_ps(to, v);
+}
+HH_INLINE void Stream(const V128<double>& v, double* const HH_RESTRICT to) {
+ _mm_stream_pd(to, v);
+}
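+
+// Streaming stores are weakly ordered; if another thread will read the data,
+// publish it with a store fence (hypothetical sketch):
+//   alignas(16) uint64_t out[2];
+//   Stream(V2x64U(9, 7), out);  // out[0] = 7, out[1] = 9, bypassing the cache
+//   _mm_sfence();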
+
+// Miscellaneous functions.
+
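+// Rotates each lane left by "count" bits; bits shifted out re-enter on the
+// right. For example, RotateLeft(V2x64U(1ull << 63, 1), 1) yields V2x64U(1, 2).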
+template <typename T>
+HH_INLINE V128<T> RotateLeft(const V128<T>& v, const int count) {
+ constexpr size_t num_bits = sizeof(T) * 8;
+ return (v << count) | (v >> (num_bits - count));
+}
+
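+// Returns (~neg_mask) & values: the underlying intrinsic complements its
+// *first* operand, hence the parameter name.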
+template <typename T>
+HH_INLINE V128<T> AndNot(const V128<T>& neg_mask, const V128<T>& values) {
+ return V128<T>(_mm_andnot_si128(neg_mask, values));
+}
+template <>
+HH_INLINE V128<float> AndNot(const V128<float>& neg_mask,
+ const V128<float>& values) {
+ return V128<float>(_mm_andnot_ps(neg_mask, values));
+}
+template <>
+HH_INLINE V128<double> AndNot(const V128<double>& neg_mask,
+ const V128<double>& values) {
+ return V128<double>(_mm_andnot_pd(neg_mask, values));
+}
+
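+// Returns lanes of "b" where the sign (most-significant) bit of "mask" is set,
+// else the corresponding lanes of "a" (SSE4.1 blendv semantics). Comparison
+// results such as (v0 < v1) are therefore valid masks.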
+HH_INLINE V4x32F Select(const V4x32F& a, const V4x32F& b, const V4x32F& mask) {
+ return V4x32F(_mm_blendv_ps(a, b, mask));
+}
+
+HH_INLINE V2x64F Select(const V2x64F& a, const V2x64F& b, const V2x64F& mask) {
+ return V2x64F(_mm_blendv_pd(a, b, mask));
+}
+
+// Min/Max
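+// (Per-lane; the integer variants compare lanes as unsigned, per "epu".)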
+
+HH_INLINE V16x8U Min(const V16x8U& v0, const V16x8U& v1) {
+ return V16x8U(_mm_min_epu8(v0, v1));
+}
+
+HH_INLINE V16x8U Max(const V16x8U& v0, const V16x8U& v1) {
+ return V16x8U(_mm_max_epu8(v0, v1));
+}
+
+HH_INLINE V8x16U Min(const V8x16U& v0, const V8x16U& v1) {
+ return V8x16U(_mm_min_epu16(v0, v1));
+}
+
+HH_INLINE V8x16U Max(const V8x16U& v0, const V8x16U& v1) {
+ return V8x16U(_mm_max_epu16(v0, v1));
+}
+
+HH_INLINE V4x32U Min(const V4x32U& v0, const V4x32U& v1) {
+ return V4x32U(_mm_min_epu32(v0, v1));
+}
+
+HH_INLINE V4x32U Max(const V4x32U& v0, const V4x32U& v1) {
+ return V4x32U(_mm_max_epu32(v0, v1));
+}
+
+HH_INLINE V4x32F Min(const V4x32F& v0, const V4x32F& v1) {
+ return V4x32F(_mm_min_ps(v0, v1));
+}
+
+HH_INLINE V4x32F Max(const V4x32F& v0, const V4x32F& v1) {
+ return V4x32F(_mm_max_ps(v0, v1));
+}
+
+HH_INLINE V2x64F Min(const V2x64F& v0, const V2x64F& v1) {
+ return V2x64F(_mm_min_pd(v0, v1));
+}
+
+HH_INLINE V2x64F Max(const V2x64F& v0, const V2x64F& v1) {
+ return V2x64F(_mm_max_pd(v0, v1));
+}
+
+} // namespace HH_TARGET_NAME
+} // namespace highwayhash
+
+#endif // HH_DISABLE_TARGET_SPECIFIC
+#endif // HIGHWAYHASH_VECTOR128_H_
diff --git a/contrib/libs/highwayhash/highwayhash/vector256.h b/contrib/libs/highwayhash/highwayhash/vector256.h
index d1ccec49ef..29199ddf00 100644
--- a/contrib/libs/highwayhash/highwayhash/vector256.h
+++ b/contrib/libs/highwayhash/highwayhash/vector256.h
@@ -1,758 +1,758 @@
-// Copyright 2016 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_VECTOR256_H_
-#define HIGHWAYHASH_VECTOR256_H_
-
-// Defines SIMD vector classes ("V4x64U") with overloaded arithmetic operators:
-// const V4x64U masked_sum = (a + b) & m;
-// This is shorter and more readable than compiler intrinsics:
-// const __m256i masked_sum = _mm256_and_si256(_mm256_add_epi64(a, b), m);
-// There is typically no runtime cost for these abstractions.
-//
-// The naming convention is VNxBBT where N is the number of lanes, BB the
-// number of bits per lane and T is the lane type: unsigned integer (U),
-// signed integer (I), or floating-point (F).
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/compiler_specific.h"
-
-// For auto-dependency generation, we need to include all headers but not their
-// contents (otherwise compilation fails because -mavx2 is not specified).
-#ifndef HH_DISABLE_TARGET_SPECIFIC
-
-// (This include cannot be moved within a namespace due to conflicts with
-// other system headers; see the comment in hh_sse41.h.)
-#include <immintrin.h>
-
-namespace highwayhash {
-// To prevent ODR violations when including this from multiple translation
-// units (TU) that are compiled with different flags, the contents must reside
-// in a namespace whose name is unique to the TU. NOTE: this behavior is
-// incompatible with precompiled modules and requires textual inclusion instead.
-namespace HH_TARGET_NAME {
-
-// Primary template for 256-bit AVX2 vectors; only specializations are used.
-template <typename T>
-class V256 {};
-
-template <>
-class V256<uint8_t> {
- public:
- using Intrinsic = __m256i;
- using T = uint8_t;
- static constexpr size_t N = 32;
-
- // Leaves v_ uninitialized - typically used for output parameters.
- HH_INLINE V256() {}
-
- // Broadcasts i to all lanes.
- HH_INLINE explicit V256(T i)
- : v_(_mm256_broadcastb_epi8(_mm_cvtsi32_si128(i))) {}
-
- // Copy from other vector.
- HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
- template <typename U>
- HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
- HH_INLINE V256& operator=(const V256& other) {
- v_ = other.v_;
- return *this;
- }
-
- // Convert from/to intrinsics.
- HH_INLINE V256(const Intrinsic& v) : v_(v) {}
- HH_INLINE V256& operator=(const Intrinsic& v) {
- v_ = v;
- return *this;
- }
- HH_INLINE operator Intrinsic() const { return v_; }
-
- // There are no greater-than comparison instructions for unsigned T.
- HH_INLINE V256 operator==(const V256& other) const {
- return V256(_mm256_cmpeq_epi8(v_, other.v_));
- }
-
- HH_INLINE V256& operator+=(const V256& other) {
- v_ = _mm256_add_epi8(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator-=(const V256& other) {
- v_ = _mm256_sub_epi8(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V256& operator&=(const V256& other) {
- v_ = _mm256_and_si256(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator|=(const V256& other) {
- v_ = _mm256_or_si256(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator^=(const V256& other) {
- v_ = _mm256_xor_si256(v_, other.v_);
- return *this;
- }
-
- private:
- Intrinsic v_;
-};
-
-template <>
-class V256<uint16_t> {
- public:
- using Intrinsic = __m256i;
- using T = uint16_t;
- static constexpr size_t N = 16;
-
- // Leaves v_ uninitialized - typically used for output parameters.
- HH_INLINE V256() {}
-
- // Lane 0 (p_0) is the lowest.
- HH_INLINE V256(T p_F, T p_E, T p_D, T p_C, T p_B, T p_A, T p_9, T p_8, T p_7,
- T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0)
- : v_(_mm256_set_epi16(p_F, p_E, p_D, p_C, p_B, p_A, p_9, p_8, p_7, p_6,
- p_5, p_4, p_3, p_2, p_1, p_0)) {}
-
- // Broadcasts i to all lanes.
- HH_INLINE explicit V256(T i)
- : v_(_mm256_broadcastw_epi16(_mm_cvtsi32_si128(i))) {}
-
- // Copy from other vector.
- HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
- template <typename U>
- HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
- HH_INLINE V256& operator=(const V256& other) {
- v_ = other.v_;
- return *this;
- }
-
- // Convert from/to intrinsics.
- HH_INLINE V256(const Intrinsic& v) : v_(v) {}
- HH_INLINE V256& operator=(const Intrinsic& v) {
- v_ = v;
- return *this;
- }
- HH_INLINE operator Intrinsic() const { return v_; }
-
- // There are no greater-than comparison instructions for unsigned T.
- HH_INLINE V256 operator==(const V256& other) const {
- return V256(_mm256_cmpeq_epi16(v_, other.v_));
- }
-
- HH_INLINE V256& operator+=(const V256& other) {
- v_ = _mm256_add_epi16(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator-=(const V256& other) {
- v_ = _mm256_sub_epi16(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V256& operator&=(const V256& other) {
- v_ = _mm256_and_si256(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator|=(const V256& other) {
- v_ = _mm256_or_si256(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator^=(const V256& other) {
- v_ = _mm256_xor_si256(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V256& operator<<=(const int count) {
- v_ = _mm256_slli_epi16(v_, count);
- return *this;
- }
-
- HH_INLINE V256& operator>>=(const int count) {
- v_ = _mm256_srli_epi16(v_, count);
- return *this;
- }
-
- private:
- Intrinsic v_;
-};
-
-template <>
-class V256<uint32_t> {
- public:
- using Intrinsic = __m256i;
- using T = uint32_t;
- static constexpr size_t N = 8;
-
- // Leaves v_ uninitialized - typically used for output parameters.
- HH_INLINE V256() {}
-
- // Lane 0 (p_0) is the lowest.
- HH_INLINE V256(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0)
- : v_(_mm256_set_epi32(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {}
-
- // Broadcasts i to all lanes.
- HH_INLINE explicit V256(T i)
- : v_(_mm256_broadcastd_epi32(_mm_cvtsi32_si128(i))) {}
-
- // Copy from other vector.
- HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
- template <typename U>
- HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
- HH_INLINE V256& operator=(const V256& other) {
- v_ = other.v_;
- return *this;
- }
-
- // Convert from/to intrinsics.
- HH_INLINE V256(const Intrinsic& v) : v_(v) {}
- HH_INLINE V256& operator=(const Intrinsic& v) {
- v_ = v;
- return *this;
- }
- HH_INLINE operator Intrinsic() const { return v_; }
-
- // There are no greater-than comparison instructions for unsigned T.
- HH_INLINE V256 operator==(const V256& other) const {
- return V256(_mm256_cmpeq_epi32(v_, other.v_));
- }
-
- HH_INLINE V256& operator+=(const V256& other) {
- v_ = _mm256_add_epi32(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator-=(const V256& other) {
- v_ = _mm256_sub_epi32(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V256& operator&=(const V256& other) {
- v_ = _mm256_and_si256(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator|=(const V256& other) {
- v_ = _mm256_or_si256(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator^=(const V256& other) {
- v_ = _mm256_xor_si256(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V256& operator<<=(const int count) {
- v_ = _mm256_slli_epi32(v_, count);
- return *this;
- }
-
- HH_INLINE V256& operator>>=(const int count) {
- v_ = _mm256_srli_epi32(v_, count);
- return *this;
- }
-
- private:
- Intrinsic v_;
-};
-
-template <>
-class V256<uint64_t> {
- public:
- using Intrinsic = __m256i;
- using T = uint64_t;
- static constexpr size_t N = 4;
-
- // Leaves v_ uninitialized - typically used for output parameters.
- HH_INLINE V256() {}
-
- // Lane 0 (p_0) is the lowest.
- HH_INLINE V256(T p_3, T p_2, T p_1, T p_0)
- : v_(_mm256_set_epi64x(p_3, p_2, p_1, p_0)) {}
-
- // Broadcasts i to all lanes.
- HH_INLINE explicit V256(T i)
- : v_(_mm256_broadcastq_epi64(_mm_cvtsi64_si128(i))) {}
-
- // Copy from other vector.
- HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
- template <typename U>
- HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
- HH_INLINE V256& operator=(const V256& other) {
- v_ = other.v_;
- return *this;
- }
-
- // Convert from/to intrinsics.
- HH_INLINE V256(const Intrinsic& v) : v_(v) {}
- HH_INLINE V256& operator=(const Intrinsic& v) {
- v_ = v;
- return *this;
- }
- HH_INLINE operator Intrinsic() const { return v_; }
-
- // There are no greater-than comparison instructions for unsigned T.
- HH_INLINE V256 operator==(const V256& other) const {
- return V256(_mm256_cmpeq_epi64(v_, other.v_));
- }
-
- HH_INLINE V256& operator+=(const V256& other) {
- v_ = _mm256_add_epi64(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator-=(const V256& other) {
- v_ = _mm256_sub_epi64(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V256& operator&=(const V256& other) {
- v_ = _mm256_and_si256(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator|=(const V256& other) {
- v_ = _mm256_or_si256(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator^=(const V256& other) {
- v_ = _mm256_xor_si256(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V256& operator<<=(const int count) {
- v_ = _mm256_slli_epi64(v_, count);
- return *this;
- }
-
- HH_INLINE V256& operator>>=(const int count) {
- v_ = _mm256_srli_epi64(v_, count);
- return *this;
- }
-
- private:
- Intrinsic v_;
-};
-
-template <>
-class V256<float> {
- public:
- using Intrinsic = __m256;
- using T = float;
- static constexpr size_t N = 8;
-
- // Leaves v_ uninitialized - typically used for output parameters.
- HH_INLINE V256() {}
-
- // Lane 0 (p_0) is the lowest.
- HH_INLINE V256(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0)
- : v_(_mm256_set_ps(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {}
-
- // Broadcasts to all lanes.
- HH_INLINE explicit V256(T f) : v_(_mm256_set1_ps(f)) {}
-
- // Copy from other vector.
- HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
- template <typename U>
- HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
- HH_INLINE V256& operator=(const V256& other) {
- v_ = other.v_;
- return *this;
- }
-
- // Convert from/to intrinsics.
- HH_INLINE V256(const Intrinsic& v) : v_(v) {}
- HH_INLINE V256& operator=(const Intrinsic& v) {
- v_ = v;
- return *this;
- }
- HH_INLINE operator Intrinsic() const { return v_; }
-
- HH_INLINE V256 operator==(const V256& other) const {
- return V256(_mm256_cmp_ps(v_, other.v_, 0));
- }
- HH_INLINE V256 operator<(const V256& other) const {
- return V256(_mm256_cmp_ps(v_, other.v_, 1));
- }
- HH_INLINE V256 operator>(const V256& other) const {
- return V256(_mm256_cmp_ps(other.v_, v_, 1));
- }
-
- HH_INLINE V256& operator*=(const V256& other) {
- v_ = _mm256_mul_ps(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator/=(const V256& other) {
- v_ = _mm256_div_ps(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator+=(const V256& other) {
- v_ = _mm256_add_ps(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator-=(const V256& other) {
- v_ = _mm256_sub_ps(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V256& operator&=(const V256& other) {
- v_ = _mm256_and_ps(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator|=(const V256& other) {
- v_ = _mm256_or_ps(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator^=(const V256& other) {
- v_ = _mm256_xor_ps(v_, other.v_);
- return *this;
- }
-
- private:
- Intrinsic v_;
-};
-
-template <>
-class V256<double> {
- public:
- using Intrinsic = __m256d;
- using T = double;
- static constexpr size_t N = 4;
-
- // Leaves v_ uninitialized - typically used for output parameters.
- HH_INLINE V256() {}
-
- // Lane 0 (p_0) is the lowest.
- HH_INLINE V256(T p_3, T p_2, T p_1, T p_0)
- : v_(_mm256_set_pd(p_3, p_2, p_1, p_0)) {}
-
- // Broadcasts to all lanes.
- HH_INLINE explicit V256(T f) : v_(_mm256_set1_pd(f)) {}
-
- // Copy from other vector.
- HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
- template <typename U>
- HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
- HH_INLINE V256& operator=(const V256& other) {
- v_ = other.v_;
- return *this;
- }
-
- // Convert from/to intrinsics.
- HH_INLINE V256(const Intrinsic& v) : v_(v) {}
- HH_INLINE V256& operator=(const Intrinsic& v) {
- v_ = v;
- return *this;
- }
- HH_INLINE operator Intrinsic() const { return v_; }
-
- HH_INLINE V256 operator==(const V256& other) const {
- return V256(_mm256_cmp_pd(v_, other.v_, 0));
- }
- HH_INLINE V256 operator<(const V256& other) const {
- return V256(_mm256_cmp_pd(v_, other.v_, 1));
- }
- HH_INLINE V256 operator>(const V256& other) const {
- return V256(_mm256_cmp_pd(other.v_, v_, 1));
- }
-
- HH_INLINE V256& operator*=(const V256& other) {
- v_ = _mm256_mul_pd(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator/=(const V256& other) {
- v_ = _mm256_div_pd(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator+=(const V256& other) {
- v_ = _mm256_add_pd(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator-=(const V256& other) {
- v_ = _mm256_sub_pd(v_, other.v_);
- return *this;
- }
-
- HH_INLINE V256& operator&=(const V256& other) {
- v_ = _mm256_and_pd(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator|=(const V256& other) {
- v_ = _mm256_or_pd(v_, other.v_);
- return *this;
- }
- HH_INLINE V256& operator^=(const V256& other) {
- v_ = _mm256_xor_pd(v_, other.v_);
- return *this;
- }
-
- private:
- Intrinsic v_;
-};
-
-// Nonmember functions for any V256 via member functions.
-
-template <typename T>
-HH_INLINE V256<T> operator*(const V256<T>& left, const V256<T>& right) {
- V256<T> t(left);
- return t *= right;
-}
-
-template <typename T>
-HH_INLINE V256<T> operator/(const V256<T>& left, const V256<T>& right) {
- V256<T> t(left);
- return t /= right;
-}
-
-template <typename T>
-HH_INLINE V256<T> operator+(const V256<T>& left, const V256<T>& right) {
- V256<T> t(left);
- return t += right;
-}
-
-template <typename T>
-HH_INLINE V256<T> operator-(const V256<T>& left, const V256<T>& right) {
- V256<T> t(left);
- return t -= right;
-}
-
-template <typename T>
-HH_INLINE V256<T> operator&(const V256<T>& left, const V256<T>& right) {
- V256<T> t(left);
- return t &= right;
-}
-
-template <typename T>
-HH_INLINE V256<T> operator|(const V256<T>& left, const V256<T>& right) {
- V256<T> t(left);
- return t |= right;
-}
-
-template <typename T>
-HH_INLINE V256<T> operator^(const V256<T>& left, const V256<T>& right) {
- V256<T> t(left);
- return t ^= right;
-}
-
-template <typename T>
-HH_INLINE V256<T> operator<<(const V256<T>& v, const int count) {
- V256<T> t(v);
- return t <<= count;
-}
-
-template <typename T>
-HH_INLINE V256<T> operator>>(const V256<T>& v, const int count) {
- V256<T> t(v);
- return t >>= count;
-}
-
-// We do not provide operator<<(V, __m128i) because it has 4 cycle latency
-// (to broadcast the shift count). It is faster to use sllv_epi64 etc. instead.
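-// For example, _mm256_sllv_epi64(v, counts) shifts each 64-bit lane of v left
-// by the corresponding 64-bit lane of counts.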
-
-using V32x8U = V256<uint8_t>;
-using V16x16U = V256<uint16_t>;
-using V8x32U = V256<uint32_t>;
-using V4x64U = V256<uint64_t>;
-using V8x32F = V256<float>;
-using V4x64F = V256<double>;
-
-// Load/Store for any V256.
-
-// We differentiate between targets' vector types via template specialization.
-// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may
-// generate better code in unoptimized builds. Only declare the primary
-// templates to avoid needing mutual exclusion with vector128.
-
-template <class V>
-HH_INLINE V Load(const typename V::T* const HH_RESTRICT from);
-
-template <class V>
-HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from);
-
-template <>
-HH_INLINE V32x8U Load(const V32x8U::T* const HH_RESTRICT from) {
- const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
- return V32x8U(_mm256_load_si256(p));
-}
-template <>
-HH_INLINE V16x16U Load(const V16x16U::T* const HH_RESTRICT from) {
- const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
- return V16x16U(_mm256_load_si256(p));
-}
-template <>
-HH_INLINE V8x32U Load(const V8x32U::T* const HH_RESTRICT from) {
- const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
- return V8x32U(_mm256_load_si256(p));
-}
-template <>
-HH_INLINE V4x64U Load(const V4x64U::T* const HH_RESTRICT from) {
- const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
- return V4x64U(_mm256_load_si256(p));
-}
-template <>
-HH_INLINE V8x32F Load(const V8x32F::T* const HH_RESTRICT from) {
- return V8x32F(_mm256_load_ps(from));
-}
-template <>
-HH_INLINE V4x64F Load(const V4x64F::T* const HH_RESTRICT from) {
- return V4x64F(_mm256_load_pd(from));
-}
-
-template <>
-HH_INLINE V32x8U LoadUnaligned(const V32x8U::T* const HH_RESTRICT from) {
- const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
- return V32x8U(_mm256_loadu_si256(p));
-}
-template <>
-HH_INLINE V16x16U LoadUnaligned(const V16x16U::T* const HH_RESTRICT from) {
- const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
- return V16x16U(_mm256_loadu_si256(p));
-}
-template <>
-HH_INLINE V8x32U LoadUnaligned(const V8x32U::T* const HH_RESTRICT from) {
- const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
- return V8x32U(_mm256_loadu_si256(p));
-}
-template <>
-HH_INLINE V4x64U LoadUnaligned(const V4x64U::T* const HH_RESTRICT from) {
- const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
- return V4x64U(_mm256_loadu_si256(p));
-}
-template <>
-HH_INLINE V8x32F LoadUnaligned(const V8x32F::T* const HH_RESTRICT from) {
- return V8x32F(_mm256_loadu_ps(from));
-}
-template <>
-HH_INLINE V4x64F LoadUnaligned(const V4x64F::T* const HH_RESTRICT from) {
- return V4x64F(_mm256_loadu_pd(from));
-}
-
-// "to" must be vector-aligned.
-template <typename T>
-HH_INLINE void Store(const V256<T>& v, T* const HH_RESTRICT to) {
- _mm256_store_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v);
-}
-HH_INLINE void Store(const V256<float>& v, float* const HH_RESTRICT to) {
- _mm256_store_ps(to, v);
-}
-HH_INLINE void Store(const V256<double>& v, double* const HH_RESTRICT to) {
- _mm256_store_pd(to, v);
-}
-
-template <typename T>
-HH_INLINE void StoreUnaligned(const V256<T>& v, T* const HH_RESTRICT to) {
- _mm256_storeu_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v);
-}
-HH_INLINE void StoreUnaligned(const V256<float>& v,
- float* const HH_RESTRICT to) {
- _mm256_storeu_ps(to, v);
-}
-HH_INLINE void StoreUnaligned(const V256<double>& v,
- double* const HH_RESTRICT to) {
- _mm256_storeu_pd(to, v);
-}
-
-// Writes directly to (aligned) memory, bypassing the cache. This is useful for
-// data that will not be read again in the near future.
-template <typename T>
-HH_INLINE void Stream(const V256<T>& v, T* const HH_RESTRICT to) {
- _mm256_stream_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v);
-}
-HH_INLINE void Stream(const V256<float>& v, float* const HH_RESTRICT to) {
- _mm256_stream_ps(to, v);
-}
-HH_INLINE void Stream(const V256<double>& v, double* const HH_RESTRICT to) {
- _mm256_stream_pd(to, v);
-}
-
-// Miscellaneous functions.
-
-template <typename T>
-HH_INLINE V256<T> RotateLeft(const V256<T>& v, const int count) {
- constexpr size_t num_bits = sizeof(T) * 8;
- return (v << count) | (v >> (num_bits - count));
-}
-
-template <typename T>
-HH_INLINE V256<T> AndNot(const V256<T>& neg_mask, const V256<T>& values) {
- return V256<T>(_mm256_andnot_si256(neg_mask, values));
-}
-template <>
-HH_INLINE V256<float> AndNot(const V256<float>& neg_mask,
- const V256<float>& values) {
- return V256<float>(_mm256_andnot_ps(neg_mask, values));
-}
-template <>
-HH_INLINE V256<double> AndNot(const V256<double>& neg_mask,
- const V256<double>& values) {
- return V256<double>(_mm256_andnot_pd(neg_mask, values));
-}
-
-HH_INLINE V8x32F Select(const V8x32F& a, const V8x32F& b, const V8x32F& mask) {
- return V8x32F(_mm256_blendv_ps(a, b, mask));
-}
-
-HH_INLINE V4x64F Select(const V4x64F& a, const V4x64F& b, const V4x64F& mask) {
- return V4x64F(_mm256_blendv_pd(a, b, mask));
-}
-
-// Min/Max
-
-HH_INLINE V32x8U Min(const V32x8U& v0, const V32x8U& v1) {
- return V32x8U(_mm256_min_epu8(v0, v1));
-}
-
-HH_INLINE V32x8U Max(const V32x8U& v0, const V32x8U& v1) {
- return V32x8U(_mm256_max_epu8(v0, v1));
-}
-
-HH_INLINE V16x16U Min(const V16x16U& v0, const V16x16U& v1) {
- return V16x16U(_mm256_min_epu16(v0, v1));
-}
-
-HH_INLINE V16x16U Max(const V16x16U& v0, const V16x16U& v1) {
- return V16x16U(_mm256_max_epu16(v0, v1));
-}
-
-HH_INLINE V8x32U Min(const V8x32U& v0, const V8x32U& v1) {
- return V8x32U(_mm256_min_epu32(v0, v1));
-}
-
-HH_INLINE V8x32U Max(const V8x32U& v0, const V8x32U& v1) {
- return V8x32U(_mm256_max_epu32(v0, v1));
-}
-
-HH_INLINE V8x32F Min(const V8x32F& v0, const V8x32F& v1) {
- return V8x32F(_mm256_min_ps(v0, v1));
-}
-
-HH_INLINE V8x32F Max(const V8x32F& v0, const V8x32F& v1) {
- return V8x32F(_mm256_max_ps(v0, v1));
-}
-
-HH_INLINE V4x64F Min(const V4x64F& v0, const V4x64F& v1) {
- return V4x64F(_mm256_min_pd(v0, v1));
-}
-
-HH_INLINE V4x64F Max(const V4x64F& v0, const V4x64F& v1) {
- return V4x64F(_mm256_max_pd(v0, v1));
-}
-
-} // namespace HH_TARGET_NAME
-} // namespace highwayhash
-
-#endif // HH_DISABLE_TARGET_SPECIFIC
-#endif // HIGHWAYHASH_VECTOR256_H_
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_VECTOR256_H_
+#define HIGHWAYHASH_VECTOR256_H_
+
+// Defines SIMD vector classes ("V4x64U") with overloaded arithmetic operators:
+// const V4x64U masked_sum = (a + b) & m;
+// This is shorter and more readable than compiler intrinsics:
+// const __m256i masked_sum = _mm256_and_si256(_mm256_add_epi64(a, b), m);
+// There is typically no runtime cost for these abstractions.
+//
+// The naming convention is VNxBBT where N is the number of lanes, BB the
+// number of bits per lane and T is the lane type: unsigned integer (U),
+// signed integer (I), or floating-point (F).
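+//
+// For example, a per-lane masked sum (a sketch using the V4x64U type defined
+// later in this header):
+//   const V4x64U a(40, 30, 20, 10);  // lane 0 is the last argument (10)
+//   const V4x64U b(2);               // broadcasts 2 to all four lanes
+//   const V4x64U m(0xFFu);           // low-byte mask in every lane
+//   const V4x64U masked_sum = (a + b) & m;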
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents (otherwise compilation fails because -mavx2 is not specified).
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+// (This include cannot be moved within a namespace due to conflicts with
+// other system headers; see the comment in hh_sse41.h.)
+#include <immintrin.h>
+
+namespace highwayhash {
+// To prevent ODR violations when including this from multiple translation
+// units (TU) that are compiled with different flags, the contents must reside
+// in a namespace whose name is unique to the TU. NOTE: this behavior is
+// incompatible with precompiled modules and requires textual inclusion instead.
+namespace HH_TARGET_NAME {
+
+// Primary template for 256-bit AVX2 vectors; only specializations are used.
+template <typename T>
+class V256 {};
+
+template <>
+class V256<uint8_t> {
+ public:
+ using Intrinsic = __m256i;
+ using T = uint8_t;
+ static constexpr size_t N = 32;
+
+ // Leaves v_ uninitialized - typically used for output parameters.
+ HH_INLINE V256() {}
+
+ // Broadcasts i to all lanes.
+ HH_INLINE explicit V256(T i)
+ : v_(_mm256_broadcastb_epi8(_mm_cvtsi32_si128(i))) {}
+
+ // Copy from other vector.
+ HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
+ template <typename U>
+ HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
+ HH_INLINE V256& operator=(const V256& other) {
+ v_ = other.v_;
+ return *this;
+ }
+
+ // Convert from/to intrinsics.
+ HH_INLINE V256(const Intrinsic& v) : v_(v) {}
+ HH_INLINE V256& operator=(const Intrinsic& v) {
+ v_ = v;
+ return *this;
+ }
+ HH_INLINE operator Intrinsic() const { return v_; }
+
+ // There are no greater-than comparison instructions for unsigned T.
+ HH_INLINE V256 operator==(const V256& other) const {
+ return V256(_mm256_cmpeq_epi8(v_, other.v_));
+ }
+
+ HH_INLINE V256& operator+=(const V256& other) {
+ v_ = _mm256_add_epi8(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator-=(const V256& other) {
+ v_ = _mm256_sub_epi8(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V256& operator&=(const V256& other) {
+ v_ = _mm256_and_si256(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator|=(const V256& other) {
+ v_ = _mm256_or_si256(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator^=(const V256& other) {
+ v_ = _mm256_xor_si256(v_, other.v_);
+ return *this;
+ }
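+
+ // Note: there are no shift operators because AVX2 (like SSE) provides no
+ // 8-bit lane shift instructions.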
+
+ private:
+ Intrinsic v_;
+};
+
+template <>
+class V256<uint16_t> {
+ public:
+ using Intrinsic = __m256i;
+ using T = uint16_t;
+ static constexpr size_t N = 16;
+
+ // Leaves v_ uninitialized - typically used for output parameters.
+ HH_INLINE V256() {}
+
+ // Lane 0 (p_0) is the lowest.
+ HH_INLINE V256(T p_F, T p_E, T p_D, T p_C, T p_B, T p_A, T p_9, T p_8, T p_7,
+ T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0)
+ : v_(_mm256_set_epi16(p_F, p_E, p_D, p_C, p_B, p_A, p_9, p_8, p_7, p_6,
+ p_5, p_4, p_3, p_2, p_1, p_0)) {}
+
+ // Broadcasts i to all lanes.
+ HH_INLINE explicit V256(T i)
+ : v_(_mm256_broadcastw_epi16(_mm_cvtsi32_si128(i))) {}
+
+ // Copy from other vector.
+ HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
+ template <typename U>
+ HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
+ HH_INLINE V256& operator=(const V256& other) {
+ v_ = other.v_;
+ return *this;
+ }
+
+ // Convert from/to intrinsics.
+ HH_INLINE V256(const Intrinsic& v) : v_(v) {}
+ HH_INLINE V256& operator=(const Intrinsic& v) {
+ v_ = v;
+ return *this;
+ }
+ HH_INLINE operator Intrinsic() const { return v_; }
+
+ // There are no greater-than comparison instructions for unsigned T.
+ HH_INLINE V256 operator==(const V256& other) const {
+ return V256(_mm256_cmpeq_epi16(v_, other.v_));
+ }
+
+ HH_INLINE V256& operator+=(const V256& other) {
+ v_ = _mm256_add_epi16(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator-=(const V256& other) {
+ v_ = _mm256_sub_epi16(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V256& operator&=(const V256& other) {
+ v_ = _mm256_and_si256(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator|=(const V256& other) {
+ v_ = _mm256_or_si256(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator^=(const V256& other) {
+ v_ = _mm256_xor_si256(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V256& operator<<=(const int count) {
+ v_ = _mm256_slli_epi16(v_, count);
+ return *this;
+ }
+
+ HH_INLINE V256& operator>>=(const int count) {
+ v_ = _mm256_srli_epi16(v_, count);
+ return *this;
+ }
+
+ private:
+ Intrinsic v_;
+};
+
+template <>
+class V256<uint32_t> {
+ public:
+ using Intrinsic = __m256i;
+ using T = uint32_t;
+ static constexpr size_t N = 8;
+
+ // Leaves v_ uninitialized - typically used for output parameters.
+ HH_INLINE V256() {}
+
+ // Lane 0 (p_0) is the lowest.
+ HH_INLINE V256(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0)
+ : v_(_mm256_set_epi32(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {}
+
+ // Broadcasts i to all lanes.
+ HH_INLINE explicit V256(T i)
+ : v_(_mm256_broadcastd_epi32(_mm_cvtsi32_si128(i))) {}
+
+ // Copy from other vector.
+ HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
+ template <typename U>
+ HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
+ HH_INLINE V256& operator=(const V256& other) {
+ v_ = other.v_;
+ return *this;
+ }
+
+ // Convert from/to intrinsics.
+ HH_INLINE V256(const Intrinsic& v) : v_(v) {}
+ HH_INLINE V256& operator=(const Intrinsic& v) {
+ v_ = v;
+ return *this;
+ }
+ HH_INLINE operator Intrinsic() const { return v_; }
+
+ // There are no greater-than comparison instructions for unsigned T.
+ HH_INLINE V256 operator==(const V256& other) const {
+ return V256(_mm256_cmpeq_epi32(v_, other.v_));
+ }
+
+ HH_INLINE V256& operator+=(const V256& other) {
+ v_ = _mm256_add_epi32(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator-=(const V256& other) {
+ v_ = _mm256_sub_epi32(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V256& operator&=(const V256& other) {
+ v_ = _mm256_and_si256(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator|=(const V256& other) {
+ v_ = _mm256_or_si256(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator^=(const V256& other) {
+ v_ = _mm256_xor_si256(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V256& operator<<=(const int count) {
+ v_ = _mm256_slli_epi32(v_, count);
+ return *this;
+ }
+
+ HH_INLINE V256& operator>>=(const int count) {
+ v_ = _mm256_srli_epi32(v_, count);
+ return *this;
+ }
+
+ private:
+ Intrinsic v_;
+};
+
+template <>
+class V256<uint64_t> {
+ public:
+ using Intrinsic = __m256i;
+ using T = uint64_t;
+ static constexpr size_t N = 4;
+
+ // Leaves v_ uninitialized - typically used for output parameters.
+ HH_INLINE V256() {}
+
+ // Lane 0 (p_0) is the lowest.
+ HH_INLINE V256(T p_3, T p_2, T p_1, T p_0)
+ : v_(_mm256_set_epi64x(p_3, p_2, p_1, p_0)) {}
+
+ // Broadcasts i to all lanes.
+ HH_INLINE explicit V256(T i)
+ : v_(_mm256_broadcastq_epi64(_mm_cvtsi64_si128(i))) {}
+
+ // Copy from other vector.
+ HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
+ template <typename U>
+ HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
+ HH_INLINE V256& operator=(const V256& other) {
+ v_ = other.v_;
+ return *this;
+ }
+
+ // Convert from/to intrinsics.
+ HH_INLINE V256(const Intrinsic& v) : v_(v) {}
+ HH_INLINE V256& operator=(const Intrinsic& v) {
+ v_ = v;
+ return *this;
+ }
+ HH_INLINE operator Intrinsic() const { return v_; }
+
+ // There are no greater-than comparison instructions for unsigned T.
+ HH_INLINE V256 operator==(const V256& other) const {
+ return V256(_mm256_cmpeq_epi64(v_, other.v_));
+ }
+
+ HH_INLINE V256& operator+=(const V256& other) {
+ v_ = _mm256_add_epi64(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator-=(const V256& other) {
+ v_ = _mm256_sub_epi64(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V256& operator&=(const V256& other) {
+ v_ = _mm256_and_si256(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator|=(const V256& other) {
+ v_ = _mm256_or_si256(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator^=(const V256& other) {
+ v_ = _mm256_xor_si256(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V256& operator<<=(const int count) {
+ v_ = _mm256_slli_epi64(v_, count);
+ return *this;
+ }
+
+ HH_INLINE V256& operator>>=(const int count) {
+ v_ = _mm256_srli_epi64(v_, count);
+ return *this;
+ }
+
+ private:
+ Intrinsic v_;
+};
+
+template <>
+class V256<float> {
+ public:
+ using Intrinsic = __m256;
+ using T = float;
+ static constexpr size_t N = 8;
+
+ // Leaves v_ uninitialized - typically used for output parameters.
+ HH_INLINE V256() {}
+
+ // Lane 0 (p_0) is the lowest.
+ HH_INLINE V256(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0)
+ : v_(_mm256_set_ps(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {}
+
+ // Broadcasts to all lanes.
+ HH_INLINE explicit V256(T f) : v_(_mm256_set1_ps(f)) {}
+
+ // Copy from other vector.
+ HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
+ template <typename U>
+ HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
+ HH_INLINE V256& operator=(const V256& other) {
+ v_ = other.v_;
+ return *this;
+ }
+
+ // Convert from/to intrinsics.
+ HH_INLINE V256(const Intrinsic& v) : v_(v) {}
+ HH_INLINE V256& operator=(const Intrinsic& v) {
+ v_ = v;
+ return *this;
+ }
+ HH_INLINE operator Intrinsic() const { return v_; }
+
+ HH_INLINE V256 operator==(const V256& other) const {
+ return V256(_mm256_cmp_ps(v_, other.v_, _CMP_EQ_OQ));
+ }
+ HH_INLINE V256 operator<(const V256& other) const {
+ return V256(_mm256_cmp_ps(v_, other.v_, _CMP_LT_OS));
+ }
+ HH_INLINE V256 operator>(const V256& other) const {
+ return V256(_mm256_cmp_ps(other.v_, v_, _CMP_LT_OS));
+ }
+
+ HH_INLINE V256& operator*=(const V256& other) {
+ v_ = _mm256_mul_ps(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator/=(const V256& other) {
+ v_ = _mm256_div_ps(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator+=(const V256& other) {
+ v_ = _mm256_add_ps(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator-=(const V256& other) {
+ v_ = _mm256_sub_ps(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V256& operator&=(const V256& other) {
+ v_ = _mm256_and_ps(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator|=(const V256& other) {
+ v_ = _mm256_or_ps(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator^=(const V256& other) {
+ v_ = _mm256_xor_ps(v_, other.v_);
+ return *this;
+ }
+
+ private:
+ Intrinsic v_;
+};
+
+template <>
+class V256<double> {
+ public:
+ using Intrinsic = __m256d;
+ using T = double;
+ static constexpr size_t N = 4;
+
+ // Leaves v_ uninitialized - typically used for output parameters.
+ HH_INLINE V256() {}
+
+ // Lane 0 (p_0) is the lowest.
+ HH_INLINE V256(T p_3, T p_2, T p_1, T p_0)
+ : v_(_mm256_set_pd(p_3, p_2, p_1, p_0)) {}
+
+ // Broadcasts to all lanes.
+ HH_INLINE explicit V256(T f) : v_(_mm256_set1_pd(f)) {}
+
+ // Copy from other vector.
+ HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
+ template <typename U>
+ HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
+ HH_INLINE V256& operator=(const V256& other) {
+ v_ = other.v_;
+ return *this;
+ }
+
+ // Convert from/to intrinsics.
+ HH_INLINE V256(const Intrinsic& v) : v_(v) {}
+ HH_INLINE V256& operator=(const Intrinsic& v) {
+ v_ = v;
+ return *this;
+ }
+ HH_INLINE operator Intrinsic() const { return v_; }
+
+ HH_INLINE V256 operator==(const V256& other) const {
+ return V256(_mm256_cmp_pd(v_, other.v_, _CMP_EQ_OQ));
+ }
+ HH_INLINE V256 operator<(const V256& other) const {
+ return V256(_mm256_cmp_pd(v_, other.v_, _CMP_LT_OS));
+ }
+ HH_INLINE V256 operator>(const V256& other) const {
+ return V256(_mm256_cmp_pd(other.v_, v_, _CMP_LT_OS));
+ }
+
+ HH_INLINE V256& operator*=(const V256& other) {
+ v_ = _mm256_mul_pd(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator/=(const V256& other) {
+ v_ = _mm256_div_pd(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator+=(const V256& other) {
+ v_ = _mm256_add_pd(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator-=(const V256& other) {
+ v_ = _mm256_sub_pd(v_, other.v_);
+ return *this;
+ }
+
+ HH_INLINE V256& operator&=(const V256& other) {
+ v_ = _mm256_and_pd(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator|=(const V256& other) {
+ v_ = _mm256_or_pd(v_, other.v_);
+ return *this;
+ }
+ HH_INLINE V256& operator^=(const V256& other) {
+ v_ = _mm256_xor_pd(v_, other.v_);
+ return *this;
+ }
+
+ private:
+ Intrinsic v_;
+};
+
+// Nonmember operators for any V256, implemented via the member operators.
+
+template <typename T>
+HH_INLINE V256<T> operator*(const V256<T>& left, const V256<T>& right) {
+ V256<T> t(left);
+ return t *= right;
+}
+
+template <typename T>
+HH_INLINE V256<T> operator/(const V256<T>& left, const V256<T>& right) {
+ V256<T> t(left);
+ return t /= right;
+}
+
+template <typename T>
+HH_INLINE V256<T> operator+(const V256<T>& left, const V256<T>& right) {
+ V256<T> t(left);
+ return t += right;
+}
+
+template <typename T>
+HH_INLINE V256<T> operator-(const V256<T>& left, const V256<T>& right) {
+ V256<T> t(left);
+ return t -= right;
+}
+
+template <typename T>
+HH_INLINE V256<T> operator&(const V256<T>& left, const V256<T>& right) {
+ V256<T> t(left);
+ return t &= right;
+}
+
+template <typename T>
+HH_INLINE V256<T> operator|(const V256<T>& left, const V256<T>& right) {
+ V256<T> t(left);
+ return t |= right;
+}
+
+template <typename T>
+HH_INLINE V256<T> operator^(const V256<T>& left, const V256<T>& right) {
+ V256<T> t(left);
+ return t ^= right;
+}
+
+template <typename T>
+HH_INLINE V256<T> operator<<(const V256<T>& v, const int count) {
+ V256<T> t(v);
+ return t <<= count;
+}
+
+template <typename T>
+HH_INLINE V256<T> operator>>(const V256<T>& v, const int count) {
+ V256<T> t(v);
+ return t >>= count;
+}
+
+// We do not provide operator<<(V, __m128i) because it has 4 cycle latency
+// (to broadcast the shift count). It is faster to use sllv_epi64 etc. instead.
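+// For reference, a per-lane variable shift looks like this (sketch, for some
+// vector "v" of type V256<uint64_t>):
+//   const __m256i counts = _mm256_set_epi64x(3, 2, 1, 0);
+//   const V256<uint64_t> shifted(_mm256_sllv_epi64(v, counts));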
+
+using V32x8U = V256<uint8_t>;
+using V16x16U = V256<uint16_t>;
+using V8x32U = V256<uint32_t>;
+using V4x64U = V256<uint64_t>;
+using V8x32F = V256<float>;
+using V4x64F = V256<double>;
+
+// Load/Store for any V256.
+
+// We differentiate between targets' vector types via template specialization.
+// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may
+// generate better code in unoptimized builds. Only declare the primary
+// templates to avoid needing mutual exclusion with vector128.
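+//
+// Example round trip (sketch; Load/Store require 32-byte alignment,
+// LoadUnaligned/StoreUnaligned do not):
+//   float in[V8x32F::N] HH_ALIGNAS(32);
+//   // ... fill "in" ...
+//   const V8x32F v = Load<V8x32F>(in);
+//   Store(v * v, in);  // element-wise square, written back aligned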
+
+template <class V>
+HH_INLINE V Load(const typename V::T* const HH_RESTRICT from);
+
+template <class V>
+HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from);
+
+template <>
+HH_INLINE V32x8U Load(const V32x8U::T* const HH_RESTRICT from) {
+ const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
+ return V32x8U(_mm256_load_si256(p));
+}
+template <>
+HH_INLINE V16x16U Load(const V16x16U::T* const HH_RESTRICT from) {
+ const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
+ return V16x16U(_mm256_load_si256(p));
+}
+template <>
+HH_INLINE V8x32U Load(const V8x32U::T* const HH_RESTRICT from) {
+ const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
+ return V8x32U(_mm256_load_si256(p));
+}
+template <>
+HH_INLINE V4x64U Load(const V4x64U::T* const HH_RESTRICT from) {
+ const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
+ return V4x64U(_mm256_load_si256(p));
+}
+template <>
+HH_INLINE V8x32F Load(const V8x32F::T* const HH_RESTRICT from) {
+ return V8x32F(_mm256_load_ps(from));
+}
+template <>
+HH_INLINE V4x64F Load(const V4x64F::T* const HH_RESTRICT from) {
+ return V4x64F(_mm256_load_pd(from));
+}
+
+template <>
+HH_INLINE V32x8U LoadUnaligned(const V32x8U::T* const HH_RESTRICT from) {
+ const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
+ return V32x8U(_mm256_loadu_si256(p));
+}
+template <>
+HH_INLINE V16x16U LoadUnaligned(const V16x16U::T* const HH_RESTRICT from) {
+ const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
+ return V16x16U(_mm256_loadu_si256(p));
+}
+template <>
+HH_INLINE V8x32U LoadUnaligned(const V8x32U::T* const HH_RESTRICT from) {
+ const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
+ return V8x32U(_mm256_loadu_si256(p));
+}
+template <>
+HH_INLINE V4x64U LoadUnaligned(const V4x64U::T* const HH_RESTRICT from) {
+ const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
+ return V4x64U(_mm256_loadu_si256(p));
+}
+template <>
+HH_INLINE V8x32F LoadUnaligned(const V8x32F::T* const HH_RESTRICT from) {
+ return V8x32F(_mm256_loadu_ps(from));
+}
+template <>
+HH_INLINE V4x64F LoadUnaligned(const V4x64F::T* const HH_RESTRICT from) {
+ return V4x64F(_mm256_loadu_pd(from));
+}
+
+// "to" must be vector-aligned.
+template <typename T>
+HH_INLINE void Store(const V256<T>& v, T* const HH_RESTRICT to) {
+ _mm256_store_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v);
+}
+HH_INLINE void Store(const V256<float>& v, float* const HH_RESTRICT to) {
+ _mm256_store_ps(to, v);
+}
+HH_INLINE void Store(const V256<double>& v, double* const HH_RESTRICT to) {
+ _mm256_store_pd(to, v);
+}
+
+template <typename T>
+HH_INLINE void StoreUnaligned(const V256<T>& v, T* const HH_RESTRICT to) {
+ _mm256_storeu_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v);
+}
+HH_INLINE void StoreUnaligned(const V256<float>& v,
+ float* const HH_RESTRICT to) {
+ _mm256_storeu_ps(to, v);
+}
+HH_INLINE void StoreUnaligned(const V256<double>& v,
+ double* const HH_RESTRICT to) {
+ _mm256_storeu_pd(to, v);
+}
+
+// Writes directly to (aligned) memory, bypassing the cache. This is useful for
+// data that will not be read again in the near future.
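+// Note: streaming stores are weakly ordered; an _mm_sfence() is typically
+// needed before other threads may safely read the data.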
+template <typename T>
+HH_INLINE void Stream(const V256<T>& v, T* const HH_RESTRICT to) {
+ _mm256_stream_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v);
+}
+HH_INLINE void Stream(const V256<float>& v, float* const HH_RESTRICT to) {
+ _mm256_stream_ps(to, v);
+}
+HH_INLINE void Stream(const V256<double>& v, double* const HH_RESTRICT to) {
+ _mm256_stream_pd(to, v);
+}
+
+// Miscellaneous functions.
+
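+// Rotates each lane left by "count" bits; counts in [0, lane bits] are safe
+// because SIMD shifts by >= the lane width yield zero.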
+template <typename T>
+HH_INLINE V256<T> RotateLeft(const V256<T>& v, const int count) {
+ constexpr size_t num_bits = sizeof(T) * 8;
+ return (v << count) | (v >> (num_bits - count));
+}
+
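+// Returns ~neg_mask & values, i.e. "values" with the bits of "neg_mask"
+// cleared.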
+template <typename T>
+HH_INLINE V256<T> AndNot(const V256<T>& neg_mask, const V256<T>& values) {
+ return V256<T>(_mm256_andnot_si256(neg_mask, values));
+}
+template <>
+HH_INLINE V256<float> AndNot(const V256<float>& neg_mask,
+ const V256<float>& values) {
+ return V256<float>(_mm256_andnot_ps(neg_mask, values));
+}
+template <>
+HH_INLINE V256<double> AndNot(const V256<double>& neg_mask,
+ const V256<double>& values) {
+ return V256<double>(_mm256_andnot_pd(neg_mask, values));
+}
+
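+// Returns each lane of "a" whose corresponding "mask" sign bit is clear, and
+// each lane of "b" where the mask sign bit is set (blendv semantics).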
+HH_INLINE V8x32F Select(const V8x32F& a, const V8x32F& b, const V8x32F& mask) {
+ return V8x32F(_mm256_blendv_ps(a, b, mask));
+}
+
+HH_INLINE V4x64F Select(const V4x64F& a, const V4x64F& b, const V4x64F& mask) {
+ return V4x64F(_mm256_blendv_pd(a, b, mask));
+}
+
+// Min/Max
+
+HH_INLINE V32x8U Min(const V32x8U& v0, const V32x8U& v1) {
+ return V32x8U(_mm256_min_epu8(v0, v1));
+}
+
+HH_INLINE V32x8U Max(const V32x8U& v0, const V32x8U& v1) {
+ return V32x8U(_mm256_max_epu8(v0, v1));
+}
+
+HH_INLINE V16x16U Min(const V16x16U& v0, const V16x16U& v1) {
+ return V16x16U(_mm256_min_epu16(v0, v1));
+}
+
+HH_INLINE V16x16U Max(const V16x16U& v0, const V16x16U& v1) {
+ return V16x16U(_mm256_max_epu16(v0, v1));
+}
+
+HH_INLINE V8x32U Min(const V8x32U& v0, const V8x32U& v1) {
+ return V8x32U(_mm256_min_epu32(v0, v1));
+}
+
+HH_INLINE V8x32U Max(const V8x32U& v0, const V8x32U& v1) {
+ return V8x32U(_mm256_max_epu32(v0, v1));
+}
+
+HH_INLINE V8x32F Min(const V8x32F& v0, const V8x32F& v1) {
+ return V8x32F(_mm256_min_ps(v0, v1));
+}
+
+HH_INLINE V8x32F Max(const V8x32F& v0, const V8x32F& v1) {
+ return V8x32F(_mm256_max_ps(v0, v1));
+}
+
+HH_INLINE V4x64F Min(const V4x64F& v0, const V4x64F& v1) {
+ return V4x64F(_mm256_min_pd(v0, v1));
+}
+
+HH_INLINE V4x64F Max(const V4x64F& v0, const V4x64F& v1) {
+ return V4x64F(_mm256_max_pd(v0, v1));
+}
+
+} // namespace HH_TARGET_NAME
+} // namespace highwayhash
+
+#endif // HH_DISABLE_TARGET_SPECIFIC
+#endif // HIGHWAYHASH_VECTOR256_H_
diff --git a/contrib/libs/highwayhash/highwayhash/vector_test.cc b/contrib/libs/highwayhash/highwayhash/vector_test.cc
index d9f02567be..a8bdfacac2 100644
--- a/contrib/libs/highwayhash/highwayhash/vector_test.cc
+++ b/contrib/libs/highwayhash/highwayhash/vector_test.cc
@@ -1,59 +1,59 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stdio.h>
-
-#ifdef HH_GOOGLETEST
-#include "testing/base/public/gmock.h"
-#include "testing/base/public/gunit.h"
-#endif
-
-#include "highwayhash/instruction_sets.h"
-#include "highwayhash/vector_test_target.h"
-
-namespace highwayhash {
-namespace {
-
-void NotifyFailure(const char* target, const size_t size) {
- const size_t lane_bits = (size & 0xFF) * 8;
- const size_t lane_index = size >> 8;
-#ifdef HH_GOOGLETEST
- EXPECT_TRUE(false) << "VectorTest failed for " << target << " T=" << lane_bits
- << ", lane " << lane_index;
-#else
- printf("VectorTest failed for %10s T=%zu, lane=%zu\n", target, lane_bits,
- lane_index);
-#endif
-}
-
-void RunTests() {
- const TargetBits tested = InstructionSets::RunAll<VectorTest>(&NotifyFailure);
- HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) {
- printf("%10s: done\n", TargetName(target));
- });
-}
-
-#ifdef HH_GOOGLETEST
-TEST(VectorTest, Run) { RunTests(); }
-#endif
-
-} // namespace
-} // namespace highwayhash
-
-#ifndef HH_GOOGLETEST
-int main(int argc, char* argv[]) {
- highwayhash::RunTests();
- return 0;
-}
-#endif
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+
+#ifdef HH_GOOGLETEST
+#include "testing/base/public/gmock.h"
+#include "testing/base/public/gunit.h"
+#endif
+
+#include "highwayhash/instruction_sets.h"
+#include "highwayhash/vector_test_target.h"
+
+namespace highwayhash {
+namespace {
+
+void NotifyFailure(const char* target, const size_t size) {
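+ // "size" packs the lane index into the high bits and sizeof(T) in bytes
+ // into the low byte (see NotifyIfUnequal in vector_test_target.cc).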
+ const size_t lane_bits = (size & 0xFF) * 8;
+ const size_t lane_index = size >> 8;
+#ifdef HH_GOOGLETEST
+ EXPECT_TRUE(false) << "VectorTest failed for " << target << " T=" << lane_bits
+ << ", lane " << lane_index;
+#else
+ printf("VectorTest failed for %10s T=%zu, lane=%zu\n", target, lane_bits,
+ lane_index);
+#endif
+}
+
+void RunTests() {
+ const TargetBits tested = InstructionSets::RunAll<VectorTest>(&NotifyFailure);
+ HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) {
+ printf("%10s: done\n", TargetName(target));
+ });
+}
+
+#ifdef HH_GOOGLETEST
+TEST(VectorTest, Run) { RunTests(); }
+#endif
+
+} // namespace
+} // namespace highwayhash
+
+#ifndef HH_GOOGLETEST
+int main(int argc, char* argv[]) {
+ highwayhash::RunTests();
+ return 0;
+}
+#endif
diff --git a/contrib/libs/highwayhash/highwayhash/vector_test_avx2.cc b/contrib/libs/highwayhash/highwayhash/vector_test_avx2.cc
index 30ce2c992c..86a017f7f5 100644
--- a/contrib/libs/highwayhash/highwayhash/vector_test_avx2.cc
+++ b/contrib/libs/highwayhash/highwayhash/vector_test_avx2.cc
@@ -1,19 +1,19 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// WARNING: this is a "restricted" source file; avoid including any headers
-// unless they are also restricted. See arch_specific.h for details.
-
-#define HH_TARGET_NAME AVX2
-#include "highwayhash/vector_test_target.cc"
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME AVX2
+#include "highwayhash/vector_test_target.cc"
diff --git a/contrib/libs/highwayhash/highwayhash/vector_test_portable.cc b/contrib/libs/highwayhash/highwayhash/vector_test_portable.cc
index a742b4be80..df23c28070 100644
--- a/contrib/libs/highwayhash/highwayhash/vector_test_portable.cc
+++ b/contrib/libs/highwayhash/highwayhash/vector_test_portable.cc
@@ -1,19 +1,19 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// WARNING: this is a "restricted" source file; avoid including any headers
-// unless they are also restricted. See arch_specific.h for details.
-
-#define HH_TARGET_NAME Portable
-#include "highwayhash/vector_test_target.cc"
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME Portable
+#include "highwayhash/vector_test_target.cc"
diff --git a/contrib/libs/highwayhash/highwayhash/vector_test_sse41.cc b/contrib/libs/highwayhash/highwayhash/vector_test_sse41.cc
index 80e11b5d9c..4d6fbee2b4 100644
--- a/contrib/libs/highwayhash/highwayhash/vector_test_sse41.cc
+++ b/contrib/libs/highwayhash/highwayhash/vector_test_sse41.cc
@@ -1,19 +1,19 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// WARNING: this is a "restricted" source file; avoid including any headers
-// unless they are also restricted. See arch_specific.h for details.
-
-#define HH_TARGET_NAME SSE41
-#include "highwayhash/vector_test_target.cc"
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME SSE41
+#include "highwayhash/vector_test_target.cc"
diff --git a/contrib/libs/highwayhash/highwayhash/vector_test_target.cc b/contrib/libs/highwayhash/highwayhash/vector_test_target.cc
index f9eed7f59a..16d6ef1825 100644
--- a/contrib/libs/highwayhash/highwayhash/vector_test_target.cc
+++ b/contrib/libs/highwayhash/highwayhash/vector_test_target.cc
@@ -1,220 +1,220 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// WARNING: this is a "restricted" source file; avoid including any headers
-// unless they are also restricted. See arch_specific.h for details.
-
-#include "highwayhash/vector_test_target.h"
-
-#include "highwayhash/arch_specific.h"
-
-#if HH_TARGET == HH_TARGET_AVX2
-#include "highwayhash/vector256.h"
-#elif HH_TARGET == HH_TARGET_SSE41
-#include "highwayhash/vector128.h"
-#elif HH_TARGET == HH_TARGET_Portable
-#include "highwayhash/scalar.h"
-#else
-#error "Unknown target, add its include here."
-#endif
-
-#ifndef HH_DISABLE_TARGET_SPECIFIC
-namespace highwayhash {
-namespace HH_TARGET_NAME {
-namespace {
-
-#if HH_TARGET == HH_TARGET_AVX2
-template <typename T>
-using V = V256<T>;
-#elif HH_TARGET == HH_TARGET_SSE41
-template <typename T>
-using V = V128<T>;
-#elif HH_TARGET == HH_TARGET_Portable
-template <typename T>
-using V = Scalar<T>;
-#else
-#error "Unknown target, add its vector typedef here."
-#endif
-
-template <class T>
-void NotifyIfUnequal(const V<T>& v, const T expected, const HHNotify notify) {
- T lanes[V<T>::N] HH_ALIGNAS(32);
- Store(v, lanes);
- for (size_t i = 0; i < V<T>::N; ++i) {
- if (lanes[i] != expected) {
- notify(TargetName(HH_TARGET), (i << 8) | sizeof(T));
- }
- }
-}
-
-template <class T>
-void NotifyIfUnequal(const T& t, const T expected, const HHNotify notify) {
- if (t != expected) {
- notify(TargetName(HH_TARGET), sizeof(T));
- }
-}
-
-// MaxValue<T>()() replaces std::numeric_limits<T>::max().
-template <typename T>
-struct MaxValue;
-template <>
-struct MaxValue<uint8_t> {
- constexpr uint8_t operator()() const { return 0xFFu; }
-};
-template <>
-struct MaxValue<uint16_t> {
- constexpr uint16_t operator()() const { return 0xFFFFu; }
-};
-template <>
-struct MaxValue<uint32_t> {
- constexpr uint32_t operator()() const { return 0xFFFFFFFFu; }
-};
-template <>
-struct MaxValue<uint64_t> {
- constexpr uint64_t operator()() const { return 0xFFFFFFFFFFFFFFFFull; }
-};
-
-template <typename T>
-void TestMembersAndBinaryOperatorsExceptShifts(const HHNotify notify) {
- // uninitialized
- V<T> v;
-
- // broadcast
- const V<T> v2(2);
- NotifyIfUnequal(v2, T(2), notify);
-
- // assign from V
- const V<T> v3(3);
- V<T> v3b;
- v3b = v3;
- NotifyIfUnequal(v3b, T(3), notify);
-
- // equal
- const V<T> veq(v3 == v3b);
- NotifyIfUnequal(veq, MaxValue<T>()(), notify);
-
- // Copying to, and constructing from intrinsic yields same result.
- typename V<T>::Intrinsic nv2 = v2;
- V<T> v2b(nv2);
- NotifyIfUnequal(v2b, T(2), notify);
-
- // .. assignment also works.
- V<T> v2c;
- v2c = nv2;
- NotifyIfUnequal(v2c, T(2), notify);
-
- const V<T> add = v2 + v3;
- NotifyIfUnequal(add, T(5), notify);
-
- const V<T> sub = v3 - v2;
- NotifyIfUnequal(sub, T(1), notify);
-
- const V<T> vand = v3 & v2;
- NotifyIfUnequal(vand, T(2), notify);
-
- const V<T> vor = add | v2;
- NotifyIfUnequal(vor, T(7), notify);
-
- const V<T> vxor = v3 ^ v2;
- NotifyIfUnequal(vxor, T(1), notify);
-}
-
-// SSE does not allow shifting uint8_t, so instantiate for all other types.
-template <class T>
-void TestShifts(const HHNotify notify) {
- const V<T> v1(1);
- // Shifting out of right side => zero
- NotifyIfUnequal(v1 >> 1, T(0), notify);
-
- // Simple left shift
- NotifyIfUnequal(v1 << 1, T(2), notify);
-
- // Sign bit
- constexpr int kSign = (sizeof(T) * 8) - 1;
- constexpr T max = MaxValue<T>()();
- constexpr T sign = ~(max >> 1);
- NotifyIfUnequal(v1 << kSign, sign, notify);
-
- // Shifting out of left side => zero
- NotifyIfUnequal(v1 << (kSign + 1), T(0), notify);
-}
-
-template <class T>
-void TestLoadStore(const HHNotify notify) {
- const size_t n = V<T>::N;
- T lanes[2 * n] HH_ALIGNAS(32);
- for (size_t i = 0; i < n; ++i) {
- lanes[i] = 4;
- }
- for (size_t i = n; i < 2 * n; ++i) {
- lanes[i] = 5;
- }
- // Aligned load
- const V<T> v4 = Load<V<T>>(lanes);
- NotifyIfUnequal(v4, T(4), notify);
-
- // Aligned store
- T lanes4[n] HH_ALIGNAS(32);
- Store(v4, lanes4);
- NotifyIfUnequal(Load<V<T>>(lanes4), T(4), notify);
-
- // Unaligned load
- const V<T> vu = LoadUnaligned<V<T>>(lanes + 1);
- Store(vu, lanes4);
- NotifyIfUnequal(lanes4[n - 1], T(5), notify);
- for (size_t i = 1; i < n - 1; ++i) {
- NotifyIfUnequal(lanes4[i], T(4), notify);
- }
-
- // Unaligned store
- StoreUnaligned(v4, lanes + n / 2);
- size_t i;
- for (i = 0; i < 3 * n / 2; ++i) {
- NotifyIfUnequal(lanes[i], T(4), notify);
- }
- // Subsequent values remain unchanged.
- for (; i < 2 * n; ++i) {
- NotifyIfUnequal(lanes[i], T(5), notify);
- }
-}
-
-void TestAll(const HHNotify notify) {
- TestMembersAndBinaryOperatorsExceptShifts<uint8_t>(notify);
- TestMembersAndBinaryOperatorsExceptShifts<uint16_t>(notify);
- TestMembersAndBinaryOperatorsExceptShifts<uint32_t>(notify);
- TestMembersAndBinaryOperatorsExceptShifts<uint64_t>(notify);
-
- TestShifts<uint16_t>(notify);
- TestShifts<uint32_t>(notify);
- TestShifts<uint64_t>(notify);
-
- TestLoadStore<uint8_t>(notify);
- TestLoadStore<uint16_t>(notify);
- TestLoadStore<uint32_t>(notify);
- TestLoadStore<uint64_t>(notify);
-}
-
-} // namespace
-} // namespace HH_TARGET_NAME
-
-template <TargetBits Target>
-void VectorTest<Target>::operator()(const HHNotify notify) const {
- HH_TARGET_NAME::TestAll(notify);
-}
-
-// Instantiate for the current target.
-template struct VectorTest<HH_TARGET>;
-
-} // namespace highwayhash
-#endif // HH_DISABLE_TARGET_SPECIFIC
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#include "highwayhash/vector_test_target.h"
+
+#include "highwayhash/arch_specific.h"
+
+#if HH_TARGET == HH_TARGET_AVX2
+#include "highwayhash/vector256.h"
+#elif HH_TARGET == HH_TARGET_SSE41
+#include "highwayhash/vector128.h"
+#elif HH_TARGET == HH_TARGET_Portable
+#include "highwayhash/scalar.h"
+#else
+#error "Unknown target, add its include here."
+#endif
+
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+namespace highwayhash {
+namespace HH_TARGET_NAME {
+namespace {
+
+#if HH_TARGET == HH_TARGET_AVX2
+template <typename T>
+using V = V256<T>;
+#elif HH_TARGET == HH_TARGET_SSE41
+template <typename T>
+using V = V128<T>;
+#elif HH_TARGET == HH_TARGET_Portable
+template <typename T>
+using V = Scalar<T>;
+#else
+#error "Unknown target, add its vector typedef here."
+#endif
+
+template <class T>
+void NotifyIfUnequal(const V<T>& v, const T expected, const HHNotify notify) {
+ T lanes[V<T>::N] HH_ALIGNAS(32);
+ Store(v, lanes);
+ for (size_t i = 0; i < V<T>::N; ++i) {
+ if (lanes[i] != expected) {
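+ // Pack the lane index into the high bits, sizeof(T) into the low byte.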
+ notify(TargetName(HH_TARGET), (i << 8) | sizeof(T));
+ }
+ }
+}
+
+template <class T>
+void NotifyIfUnequal(const T& t, const T expected, const HHNotify notify) {
+ if (t != expected) {
+ notify(TargetName(HH_TARGET), sizeof(T));
+ }
+}
+
+// MaxValue<T>()() replaces std::numeric_limits<T>::max().
+template <typename T>
+struct MaxValue;
+template <>
+struct MaxValue<uint8_t> {
+ constexpr uint8_t operator()() const { return 0xFFu; }
+};
+template <>
+struct MaxValue<uint16_t> {
+ constexpr uint16_t operator()() const { return 0xFFFFu; }
+};
+template <>
+struct MaxValue<uint32_t> {
+ constexpr uint32_t operator()() const { return 0xFFFFFFFFu; }
+};
+template <>
+struct MaxValue<uint64_t> {
+ constexpr uint64_t operator()() const { return 0xFFFFFFFFFFFFFFFFull; }
+};
+
+template <typename T>
+void TestMembersAndBinaryOperatorsExceptShifts(const HHNotify notify) {
+ // uninitialized
+ V<T> v;
+
+ // broadcast
+ const V<T> v2(2);
+ NotifyIfUnequal(v2, T(2), notify);
+
+ // assign from V
+ const V<T> v3(3);
+ V<T> v3b;
+ v3b = v3;
+ NotifyIfUnequal(v3b, T(3), notify);
+
+ // equal
+ const V<T> veq(v3 == v3b);
+ NotifyIfUnequal(veq, MaxValue<T>()(), notify);
+
+ // Copying to an intrinsic and constructing from it yields the same result.
+ typename V<T>::Intrinsic nv2 = v2;
+ V<T> v2b(nv2);
+ NotifyIfUnequal(v2b, T(2), notify);
+
+ // ... and assignment from an intrinsic also works.
+ V<T> v2c;
+ v2c = nv2;
+ NotifyIfUnequal(v2c, T(2), notify);
+
+ const V<T> add = v2 + v3;
+ NotifyIfUnequal(add, T(5), notify);
+
+ const V<T> sub = v3 - v2;
+ NotifyIfUnequal(sub, T(1), notify);
+
+ const V<T> vand = v3 & v2;
+ NotifyIfUnequal(vand, T(2), notify);
+
+ const V<T> vor = add | v2;
+ NotifyIfUnequal(vor, T(7), notify);
+
+ const V<T> vxor = v3 ^ v2;
+ NotifyIfUnequal(vxor, T(1), notify);
+}
+
+// SSE does not allow shifting uint8_t, so instantiate for all other types.
+template <class T>
+void TestShifts(const HHNotify notify) {
+ const V<T> v1(1);
+ // Shifting out of right side => zero
+ NotifyIfUnequal(v1 >> 1, T(0), notify);
+
+ // Simple left shift
+ NotifyIfUnequal(v1 << 1, T(2), notify);
+
+ // Sign bit
+ constexpr int kSign = (sizeof(T) * 8) - 1;
+ constexpr T max = MaxValue<T>()();
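+ // ~(max >> 1) leaves only the most significant (sign) bit set.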
+ constexpr T sign = ~(max >> 1);
+ NotifyIfUnequal(v1 << kSign, sign, notify);
+
+ // Shifting out of left side => zero
+ NotifyIfUnequal(v1 << (kSign + 1), T(0), notify);
+}
+
+template <class T>
+void TestLoadStore(const HHNotify notify) {
+ const size_t n = V<T>::N;
+ T lanes[2 * n] HH_ALIGNAS(32);
+ for (size_t i = 0; i < n; ++i) {
+ lanes[i] = 4;
+ }
+ for (size_t i = n; i < 2 * n; ++i) {
+ lanes[i] = 5;
+ }
+ // Aligned load
+ const V<T> v4 = Load<V<T>>(lanes);
+ NotifyIfUnequal(v4, T(4), notify);
+
+ // Aligned store
+ T lanes4[n] HH_ALIGNAS(32);
+ Store(v4, lanes4);
+ NotifyIfUnequal(Load<V<T>>(lanes4), T(4), notify);
+
+ // Unaligned load
+ const V<T> vu = LoadUnaligned<V<T>>(lanes + 1);
+ Store(vu, lanes4);
+ NotifyIfUnequal(lanes4[n - 1], T(5), notify);
+ for (size_t i = 1; i < n - 1; ++i) {
+ NotifyIfUnequal(lanes4[i], T(4), notify);
+ }
+
+ // Unaligned store
+ StoreUnaligned(v4, lanes + n / 2);
+ size_t i;
+ for (i = 0; i < 3 * n / 2; ++i) {
+ NotifyIfUnequal(lanes[i], T(4), notify);
+ }
+ // Subsequent values remain unchanged.
+ for (; i < 2 * n; ++i) {
+ NotifyIfUnequal(lanes[i], T(5), notify);
+ }
+}
+
+void TestAll(const HHNotify notify) {
+ TestMembersAndBinaryOperatorsExceptShifts<uint8_t>(notify);
+ TestMembersAndBinaryOperatorsExceptShifts<uint16_t>(notify);
+ TestMembersAndBinaryOperatorsExceptShifts<uint32_t>(notify);
+ TestMembersAndBinaryOperatorsExceptShifts<uint64_t>(notify);
+
+ TestShifts<uint16_t>(notify);
+ TestShifts<uint32_t>(notify);
+ TestShifts<uint64_t>(notify);
+
+ TestLoadStore<uint8_t>(notify);
+ TestLoadStore<uint16_t>(notify);
+ TestLoadStore<uint32_t>(notify);
+ TestLoadStore<uint64_t>(notify);
+}
+
+} // namespace
+} // namespace HH_TARGET_NAME
+
+template <TargetBits Target>
+void VectorTest<Target>::operator()(const HHNotify notify) const {
+ HH_TARGET_NAME::TestAll(notify);
+}
+
+// Instantiate for the current target.
+template struct VectorTest<HH_TARGET>;
+
+} // namespace highwayhash
+#endif // HH_DISABLE_TARGET_SPECIFIC
diff --git a/contrib/libs/highwayhash/highwayhash/vector_test_target.h b/contrib/libs/highwayhash/highwayhash/vector_test_target.h
index f1ff6382dc..c26f876912 100644
--- a/contrib/libs/highwayhash/highwayhash/vector_test_target.h
+++ b/contrib/libs/highwayhash/highwayhash/vector_test_target.h
@@ -1,37 +1,37 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAYHASH_VECTOR_TEST_TARGET_H_
-#define HIGHWAYHASH_VECTOR_TEST_TARGET_H_
-
-// WARNING: this is a "restricted" header because it is included from
-// translation units compiled with different flags. This header and its
-// dependencies must not define any function unless it is static inline and/or
-// within namespace HH_TARGET_NAME. See arch_specific.h for details.
-
-#include "highwayhash/arch_specific.h"
-#include "highwayhash/hh_types.h"
-
-namespace highwayhash {
-
-// Usage: InstructionSets::RunAll<VectorTest>(). Calls "notify" for each test
-// failure.
-template <TargetBits Target>
-struct VectorTest {
- void operator()(const HHNotify notify) const;
-};
-
-} // namespace highwayhash
-
-#endif // HIGHWAYHASH_VECTOR_TEST_TARGET_H_
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_VECTOR_TEST_TARGET_H_
+#define HIGHWAYHASH_VECTOR_TEST_TARGET_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/hh_types.h"
+
+namespace highwayhash {
+
+// Usage: InstructionSets::RunAll<VectorTest>(). Calls "notify" for each test
+// failure.
+template <TargetBits Target>
+struct VectorTest {
+ void operator()(const HHNotify notify) const;
+};
+
+} // namespace highwayhash
+
+#endif // HIGHWAYHASH_VECTOR_TEST_TARGET_H_
diff --git a/contrib/libs/highwayhash/ya.make b/contrib/libs/highwayhash/ya.make
index 4f6dad6193..aec086bdcb 100644
--- a/contrib/libs/highwayhash/ya.make
+++ b/contrib/libs/highwayhash/ya.make
@@ -1,46 +1,46 @@
-LIBRARY()
-
-LICENSE(Apache-2.0)
-
+LIBRARY()
+
+LICENSE(Apache-2.0)
+
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
VERSION(2017-05-08-2b666ae078292b01024453d01480f3b362a2a012)
OWNER(somov)
-
-NO_COMPILER_WARNINGS()
-
+
+NO_COMPILER_WARNINGS()
+
ADDINCL(GLOBAL contrib/libs/highwayhash)
-
-SRCDIR(contrib/libs/highwayhash/highwayhash)
-
-SRCS(
- # Dispatcher
- arch_specific.cc
- instruction_sets.cc
- nanobenchmark.cc
- os_specific.cc
- # SipHash
- sip_hash.cc
- scalar_sip_tree_hash.cc
- # sip_tree_hash.cc with AVX2 if available
- # HighwayHash
- hh_portable.cc
- # hh_avx2.cc with AVX2
- # hh_sse41.cc with SSE4.1
- # Library
- c_bindings.cc
-)
-
-IF (ARCH_X86_64)
- PEERDIR(
- contrib/libs/highwayhash/arch/avx2
- contrib/libs/highwayhash/arch/sse41
- )
-ELSE()
- SRCS(
- sip_tree_hash.cc
- )
-ENDIF()
-
-END()
+
+SRCDIR(contrib/libs/highwayhash/highwayhash)
+
+SRCS(
+ # Dispatcher
+ arch_specific.cc
+ instruction_sets.cc
+ nanobenchmark.cc
+ os_specific.cc
+ # SipHash
+ sip_hash.cc
+ scalar_sip_tree_hash.cc
+ # sip_tree_hash.cc with AVX2 if available
+ # HighwayHash
+ hh_portable.cc
+ # hh_avx2.cc with AVX2
+ # hh_sse41.cc with SSE4.1
+ # Library
+ c_bindings.cc
+)
+
+IF (ARCH_X86_64)
+ PEERDIR(
+ contrib/libs/highwayhash/arch/avx2
+ contrib/libs/highwayhash/arch/sse41
+ )
+ELSE()
+ SRCS(
+ sip_tree_hash.cc
+ )
+ENDIF()
+
+END()