author     e-sidorov <e-sidorov@yandex-team.ru>    2022-02-10 16:46:05 +0300
committer  Daniil Cherednik <dcherednik@yandex-team.ru>    2022-02-10 16:46:05 +0300
commit     1ec091f8998d76a211c6015ba6865a73b29d676a (patch)
tree       6c72f0309888be2dd18d007d19c490ed87740d66 /library/cpp/digest/argonish/internal
parent     3b241dd57cf58f20bbbd63fa6a0a758dbec09b68 (diff)
download   ydb-1ec091f8998d76a211c6015ba6865a73b29d676a.tar.gz
Restoring authorship annotation for <e-sidorov@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/digest/argonish/internal')
-rw-r--r--  library/cpp/digest/argonish/internal/argon2/argon2_avx2.h    232
-rw-r--r--  library/cpp/digest/argonish/internal/argon2/argon2_base.h    748
-rw-r--r--  library/cpp/digest/argonish/internal/argon2/argon2_ref.h     174
-rw-r--r--  library/cpp/digest/argonish/internal/argon2/argon2_sse2.h    200
-rw-r--r--  library/cpp/digest/argonish/internal/argon2/argon2_sse41.h   200
-rw-r--r--  library/cpp/digest/argonish/internal/argon2/argon2_ssse3.h   202
-rw-r--r--  library/cpp/digest/argonish/internal/argon2/ya.make          16
-rw-r--r--  library/cpp/digest/argonish/internal/blake2b/blake2b.h       368
-rw-r--r--  library/cpp/digest/argonish/internal/blake2b/blake2b_avx2.h  206
-rw-r--r--  library/cpp/digest/argonish/internal/blake2b/blake2b_ref.h   164
-rw-r--r--  library/cpp/digest/argonish/internal/blake2b/blake2b_sse2.h  324
-rw-r--r--  library/cpp/digest/argonish/internal/blake2b/blake2b_sse41.h 342
-rw-r--r--  library/cpp/digest/argonish/internal/blake2b/blake2b_ssse3.h 340
-rw-r--r--  library/cpp/digest/argonish/internal/blake2b/load_sse41.h    602
-rw-r--r--  library/cpp/digest/argonish/internal/blake2b/ya.make         12
-rw-r--r--  library/cpp/digest/argonish/internal/blamka/blamka_avx2.h    270
-rw-r--r--  library/cpp/digest/argonish/internal/blamka/blamka_sse2.h    188
-rw-r--r--  library/cpp/digest/argonish/internal/blamka/blamka_ssse3.h   204
-rw-r--r--  library/cpp/digest/argonish/internal/blamka/ya.make          12
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/avx2/proxy_avx2.cpp    28
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/avx2/proxy_avx2.h      16
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/avx2/ya.make           36
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/macro/proxy_macros.h   382
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/macro/ya.make          10
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/ref/proxy_ref.cpp      32
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/ref/proxy_ref.h        16
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/ref/ya.make            28
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/sse2/proxy_sse2.cpp    28
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/sse2/proxy_sse2.h      16
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/sse2/ya.make           36
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/sse41/proxy_sse41.cpp  28
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/sse41/proxy_sse41.h    16
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/sse41/ya.make          36
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/ssse3/proxy_ssse3.cpp  28
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/ssse3/proxy_ssse3.h    16
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/ssse3/ya.make          38
-rw-r--r--  library/cpp/digest/argonish/internal/proxies/ya.make                16
-rw-r--r--  library/cpp/digest/argonish/internal/rotations/rotations_avx2.h     60
-rw-r--r--  library/cpp/digest/argonish/internal/rotations/rotations_ref.h      14
-rw-r--r--  library/cpp/digest/argonish/internal/rotations/rotations_sse2.h     54
-rw-r--r--  library/cpp/digest/argonish/internal/rotations/rotations_ssse3.h    56
-rw-r--r--  library/cpp/digest/argonish/internal/rotations/ya.make              10
-rw-r--r--  library/cpp/digest/argonish/internal/ya.make                        14
43 files changed, 2909 insertions, 2909 deletions
diff --git a/library/cpp/digest/argonish/internal/argon2/argon2_avx2.h b/library/cpp/digest/argonish/internal/argon2/argon2_avx2.h
index 8bf5367817..4ce2712e85 100644
--- a/library/cpp/digest/argonish/internal/argon2/argon2_avx2.h
+++ b/library/cpp/digest/argonish/internal/argon2/argon2_avx2.h
@@ -1,117 +1,117 @@
-#pragma once
-
-#include <immintrin.h>
-#include "argon2_base.h"
+#pragma once
+
+#include <immintrin.h>
+#include "argon2_base.h"
#include <library/cpp/digest/argonish/internal/blamka/blamka_avx2.h>
-
-namespace NArgonish {
- template <ui32 mcost, ui32 threads>
- class TArgon2AVX2 final: public TArgon2<EInstructionSet::AVX2, mcost, threads> {
- public:
- TArgon2AVX2(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
- : TArgon2<EInstructionSet::AVX2, mcost, threads>(atype, tcost, key, keylen)
- {
- }
-
- protected:
- virtual void XorBlock_(TBlock* dst, const TBlock* src) const override {
- __m256i* mdst = (__m256i*)dst;
- __m256i* msrc = (__m256i*)src;
-
- for (ui32 i = 0; i < ARGON2_HWORDS_IN_BLOCK; ++i)
- XorValues(mdst + i, mdst + i, msrc + i);
- }
-
- virtual void CopyBlock_(TBlock* dst, const TBlock* src) const override {
- memcpy(dst->V, src->V, sizeof(ui64) * ARGON2_QWORDS_IN_BLOCK);
- }
-
- virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock, TBlock* nextBlock, bool with_xor) const override {
- __m256i blockxy[ARGON2_HWORDS_IN_BLOCK];
- __m256i state[ARGON2_HWORDS_IN_BLOCK];
-
- memcpy(state, prevBlock, ARGON2_BLOCK_SIZE);
-
- if (with_xor) {
- for (ui32 i = 0; i < ARGON2_HWORDS_IN_BLOCK; ++i) {
- state[i] = _mm256_xor_si256(state[i], _mm256_loadu_si256((const __m256i*)refBlock->V + i));
- blockxy[i] = _mm256_xor_si256(state[i], _mm256_loadu_si256((const __m256i*)nextBlock->V + i));
- }
- } else {
- for (ui32 i = 0; i < ARGON2_HWORDS_IN_BLOCK; ++i) {
- blockxy[i] = state[i] = _mm256_xor_si256(
- state[i], _mm256_loadu_si256((const __m256i*)refBlock->V + i));
- }
- }
-
- /**
- * state[ 8*i + 0 ] = ( v0_0, v1_0, v2_0, v3_0)
- * state[ 8*i + 1 ] = ( v4_0, v5_0, v6_0, v7_0)
- * state[ 8*i + 2 ] = ( v8_0, v9_0, v10_0, v11_0)
- * state[ 8*i + 3 ] = (v12_0, v13_0, v14_0, v15_0)
- * state[ 8*i + 4 ] = ( v0_1, v1_1, v2_1, v3_1)
- * state[ 8*i + 5 ] = ( v4_1, v5_1, v6_1, v7_1)
- * state[ 8*i + 6 ] = ( v8_1, v9_1, v10_1, v11_1)
- * state[ 8*i + 7 ] = (v12_1, v13_1, v14_1, v15_1)
- */
- for (ui32 i = 0; i < 4; ++i) {
- BlamkaG1AVX2(
- state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
- state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
- BlamkaG2AVX2(
- state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
- state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
- DiagonalizeAVX21(
- state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- BlamkaG1AVX2(
- state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
- state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
- BlamkaG2AVX2(
- state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
- state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
- UndiagonalizeAVX21(
- state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- }
-
- /**
- * state[ 0 + i] = ( v0_0, v1_0, v0_1, v1_1)
- * state[ 4 + i] = ( v2_0, v3_0, v2_1, v3_1)
- * state[ 8 + i] = ( v4_0, v5_0, v4_1, v5_1)
- * state[12 + i] = ( v6_0, v7_0, v6_1, v7_1)
- * state[16 + i] = ( v8_0, v9_0, v8_1, v9_1)
- * state[20 + i] = (v10_0, v11_0, v10_1, v11_1)
- * state[24 + i] = (v12_0, v13_0, v12_1, v13_1)
- * state[28 + i] = (v14_0, v15_0, v14_1, v15_1)
- */
- for (ui32 i = 0; i < 4; ++i) {
- BlamkaG1AVX2(
- state[0 + i], state[4 + i], state[8 + i], state[12 + i],
- state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
- BlamkaG2AVX2(
- state[0 + i], state[4 + i], state[8 + i], state[12 + i],
- state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
- DiagonalizeAVX22(
- state[8 + i], state[12 + i],
- state[16 + i], state[20 + i],
- state[24 + i], state[28 + i]);
- BlamkaG1AVX2(
- state[0 + i], state[4 + i], state[8 + i], state[12 + i],
- state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
- BlamkaG2AVX2(
- state[0 + i], state[4 + i], state[8 + i], state[12 + i],
- state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
- UndiagonalizeAVX22(
- state[8 + i], state[12 + i],
- state[16 + i], state[20 + i],
- state[24 + i], state[28 + i]);
- }
-
- for (ui32 i = 0; i < ARGON2_HWORDS_IN_BLOCK; ++i) {
- state[i] = _mm256_xor_si256(state[i], blockxy[i]);
- _mm256_storeu_si256((__m256i*)nextBlock->V + i, state[i]);
- }
- }
- };
-}
+
+namespace NArgonish {
+ template <ui32 mcost, ui32 threads>
+ class TArgon2AVX2 final: public TArgon2<EInstructionSet::AVX2, mcost, threads> {
+ public:
+ TArgon2AVX2(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
+ : TArgon2<EInstructionSet::AVX2, mcost, threads>(atype, tcost, key, keylen)
+ {
+ }
+
+ protected:
+ virtual void XorBlock_(TBlock* dst, const TBlock* src) const override {
+ __m256i* mdst = (__m256i*)dst;
+ __m256i* msrc = (__m256i*)src;
+
+ for (ui32 i = 0; i < ARGON2_HWORDS_IN_BLOCK; ++i)
+ XorValues(mdst + i, mdst + i, msrc + i);
+ }
+
+ virtual void CopyBlock_(TBlock* dst, const TBlock* src) const override {
+ memcpy(dst->V, src->V, sizeof(ui64) * ARGON2_QWORDS_IN_BLOCK);
+ }
+
+ virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock, TBlock* nextBlock, bool with_xor) const override {
+ __m256i blockxy[ARGON2_HWORDS_IN_BLOCK];
+ __m256i state[ARGON2_HWORDS_IN_BLOCK];
+
+ memcpy(state, prevBlock, ARGON2_BLOCK_SIZE);
+
+ if (with_xor) {
+ for (ui32 i = 0; i < ARGON2_HWORDS_IN_BLOCK; ++i) {
+ state[i] = _mm256_xor_si256(state[i], _mm256_loadu_si256((const __m256i*)refBlock->V + i));
+ blockxy[i] = _mm256_xor_si256(state[i], _mm256_loadu_si256((const __m256i*)nextBlock->V + i));
+ }
+ } else {
+ for (ui32 i = 0; i < ARGON2_HWORDS_IN_BLOCK; ++i) {
+ blockxy[i] = state[i] = _mm256_xor_si256(
+ state[i], _mm256_loadu_si256((const __m256i*)refBlock->V + i));
+ }
+ }
+
+ /**
+ * state[ 8*i + 0 ] = ( v0_0, v1_0, v2_0, v3_0)
+ * state[ 8*i + 1 ] = ( v4_0, v5_0, v6_0, v7_0)
+ * state[ 8*i + 2 ] = ( v8_0, v9_0, v10_0, v11_0)
+ * state[ 8*i + 3 ] = (v12_0, v13_0, v14_0, v15_0)
+ * state[ 8*i + 4 ] = ( v0_1, v1_1, v2_1, v3_1)
+ * state[ 8*i + 5 ] = ( v4_1, v5_1, v6_1, v7_1)
+ * state[ 8*i + 6 ] = ( v8_1, v9_1, v10_1, v11_1)
+ * state[ 8*i + 7 ] = (v12_1, v13_1, v14_1, v15_1)
+ */
+ for (ui32 i = 0; i < 4; ++i) {
+ BlamkaG1AVX2(
+ state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
+ state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
+ BlamkaG2AVX2(
+ state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
+ state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
+ DiagonalizeAVX21(
+ state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ BlamkaG1AVX2(
+ state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
+ state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
+ BlamkaG2AVX2(
+ state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
+ state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
+ UndiagonalizeAVX21(
+ state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ }
+
+ /**
+ * state[ 0 + i] = ( v0_0, v1_0, v0_1, v1_1)
+ * state[ 4 + i] = ( v2_0, v3_0, v2_1, v3_1)
+ * state[ 8 + i] = ( v4_0, v5_0, v4_1, v5_1)
+ * state[12 + i] = ( v6_0, v7_0, v6_1, v7_1)
+ * state[16 + i] = ( v8_0, v9_0, v8_1, v9_1)
+ * state[20 + i] = (v10_0, v11_0, v10_1, v11_1)
+ * state[24 + i] = (v12_0, v13_0, v12_1, v13_1)
+ * state[28 + i] = (v14_0, v15_0, v14_1, v15_1)
+ */
+ for (ui32 i = 0; i < 4; ++i) {
+ BlamkaG1AVX2(
+ state[0 + i], state[4 + i], state[8 + i], state[12 + i],
+ state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
+ BlamkaG2AVX2(
+ state[0 + i], state[4 + i], state[8 + i], state[12 + i],
+ state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
+ DiagonalizeAVX22(
+ state[8 + i], state[12 + i],
+ state[16 + i], state[20 + i],
+ state[24 + i], state[28 + i]);
+ BlamkaG1AVX2(
+ state[0 + i], state[4 + i], state[8 + i], state[12 + i],
+ state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
+ BlamkaG2AVX2(
+ state[0 + i], state[4 + i], state[8 + i], state[12 + i],
+ state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
+ UndiagonalizeAVX22(
+ state[8 + i], state[12 + i],
+ state[16 + i], state[20 + i],
+ state[24 + i], state[28 + i]);
+ }
+
+ for (ui32 i = 0; i < ARGON2_HWORDS_IN_BLOCK; ++i) {
+ state[i] = _mm256_xor_si256(state[i], blockxy[i]);
+ _mm256_storeu_si256((__m256i*)nextBlock->V + i, state[i]);
+ }
+ }
+ };
+}
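
Note: the AVX2 XorBlock_ above is the vectorized form of XOR-ing all 128 64-bit words of a 1024-byte block; ARGON2_HWORDS_IN_BLOCK (= 1024/32) counts __m256i lanes. A minimal scalar sketch of the same operation, with a standalone stand-in for TBlock (illustrative only, not part of the patch):

    #include <cstdint>

    struct TBlockSketch {
        uint64_t V[128]; // ARGON2_QWORDS_IN_BLOCK = 1024 / 8
    };

    // Scalar equivalent of the AVX2 XorBlock_: the intrinsic path performs
    // this XOR four 64-bit words at a time via _mm256_xor_si256.
    void XorBlockScalar(TBlockSketch* dst, const TBlockSketch* src) {
        for (int i = 0; i < 128; ++i)
            dst->V[i] ^= src->V[i];
    }
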
diff --git a/library/cpp/digest/argonish/internal/argon2/argon2_base.h b/library/cpp/digest/argonish/internal/argon2/argon2_base.h
index 2385cc947c..8de5b6bb42 100644
--- a/library/cpp/digest/argonish/internal/argon2/argon2_base.h
+++ b/library/cpp/digest/argonish/internal/argon2/argon2_base.h
@@ -1,388 +1,388 @@
-#pragma once
-
-#include <util/generic/yexception.h>
+#pragma once
+
+#include <util/generic/yexception.h>
#include <library/cpp/digest/argonish/argon2.h>
#include <library/cpp/digest/argonish/internal/blake2b/blake2b.h>
#include <library/cpp/threading/poor_man_openmp/thread_helper.h>
-
-namespace NArgonish {
- const ui32 ARGON2_PREHASH_DIGEST_LENGTH = 64;
+
+namespace NArgonish {
+ const ui32 ARGON2_PREHASH_DIGEST_LENGTH = 64;
const ui32 ARGON2_SECRET_MAX_LENGTH = 64;
- const ui32 ARGON2_PREHASH_SEED_LENGTH = 72;
- const ui32 ARGON2_BLOCK_SIZE = 1024;
- const ui32 ARGON2_QWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8;
- const ui32 ARGON2_OWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 16;
- const ui32 ARGON2_HWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 32;
- const ui32 ARGON2_ADDRESSES_IN_BLOCK = 128;
- const ui32 ARGON2_SYNC_POINTS = 4;
- const ui32 ARGON2_SALT_MIN_LEN = 8;
- const ui32 ARGON2_MIN_OUTLEN = 4;
-
- struct TBlock {
- ui64 V[ARGON2_QWORDS_IN_BLOCK];
- };
-
- template <EInstructionSet instructionSet, ui32 mcost, ui32 threads>
- class TArgon2: public IArgon2Base {
- public:
- TArgon2(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
- : SecretLen_(keylen)
- , Tcost_(tcost)
- , Atype_(atype)
- {
- if (SecretLen_)
- memcpy(Secret_, key, keylen);
- }
-
- virtual ~TArgon2() override {
- if (SecretLen_) {
- SecureZeroMemory_(Secret_, SecretLen_);
- SecretLen_ = 0;
- }
- }
-
- virtual void Hash(const ui8* pwd, ui32 pwdlen, const ui8* salt, ui32 saltlen,
- ui8* out, ui32 outlen, const ui8* aad = nullptr, ui32 aadlen = 0) const override {
- TArrayHolder<TBlock> buffer(new TBlock[MemoryBlocks_]);
- InternalHash_(buffer.Get(), pwd, pwdlen, salt, saltlen, out, outlen, aad, aadlen);
- }
-
- virtual bool Verify(const ui8* pwd, ui32 pwdlen, const ui8* salt, ui32 saltlen,
- const ui8* hash, ui32 hashlen, const ui8* aad = nullptr, ui32 aadlen = 0) const override {
- TArrayHolder<ui8> hashResult(new ui8[hashlen]);
- Hash(pwd, pwdlen, salt, saltlen, hashResult.Get(), hashlen, aad, aadlen);
-
- return SecureCompare_(hash, hashResult.Get(), hashlen);
- }
-
- virtual void HashWithCustomMemory(ui8* memory, size_t mlen, const ui8* pwd, ui32 pwdlen,
- const ui8* salt, ui32 saltlen, ui8* out, ui32 outlen,
- const ui8* aad = nullptr, ui32 aadlen = 0) const override {
- if (memory == nullptr || mlen < sizeof(TBlock) * MemoryBlocks_)
- ythrow yexception() << "memory is null or its size is not enough";
-
- InternalHash_((TBlock*)memory, pwd, pwdlen, salt, saltlen, out, outlen, aad, aadlen);
- }
-
- virtual bool VerifyWithCustomMemory(ui8* memory, size_t mlen, const ui8* pwd, ui32 pwdlen,
- const ui8* salt, ui32 saltlen, const ui8* hash, ui32 hashlen,
- const ui8* aad = nullptr, ui32 aadlen = 0) const override {
- TArrayHolder<ui8> hashResult(new ui8[hashlen]);
- HashWithCustomMemory(memory, mlen, pwd, pwdlen, salt, saltlen, hashResult.Get(), hashlen, aad, aadlen);
-
- return SecureCompare_(hashResult.Get(), hash, hashlen);
- }
-
- virtual size_t GetMemorySize() const override {
- return MemoryBlocks_ * sizeof(TBlock);
- }
-
- protected: /* Constants */
- ui8 Secret_[ARGON2_SECRET_MAX_LENGTH] = {0};
- ui32 SecretLen_ = 0;
- ui32 Tcost_;
- EArgon2Type Atype_;
-
- static constexpr ui32 Lanes_ = threads;
- static constexpr ui32 MemoryBlocks_ = (mcost >= 2 * ARGON2_SYNC_POINTS * Lanes_) ? (mcost - mcost % (Lanes_ * ARGON2_SYNC_POINTS)) : 2 * ARGON2_SYNC_POINTS * Lanes_;
- static constexpr ui32 SegmentLength_ = MemoryBlocks_ / (Lanes_ * ARGON2_SYNC_POINTS);
- static constexpr ui32 LaneLength_ = SegmentLength_ * ARGON2_SYNC_POINTS;
-
- protected: /* Prototypes */
- virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock,
- TBlock* nextBlock, bool withXor) const = 0;
-
- virtual void CopyBlock_(TBlock* dst, const TBlock* src) const = 0;
- virtual void XorBlock_(TBlock* dst, const TBlock* src) const = 0;
-
- protected: /* Static functions */
- static bool SecureCompare_(const ui8* buffer1, const ui8* buffer2, ui32 len) {
- bool result = true;
- for (ui32 i = 0; i < len; ++i) {
- result &= (buffer1[i] == buffer2[i]);
- }
- return result;
- }
-
- static void SecureZeroMemory_(void* src, size_t len) {
- static void* (*const volatile memset_v)(void*, int, size_t) = &memset;
- memset_v(src, 0, len);
- }
-
- static void Store32_(ui32 value, void* mem) {
- *((ui32*)mem) = value;
- }
-
- static void Blake2BHash64_(ui8 out[BLAKE2B_OUTBYTES], const ui8 in[BLAKE2B_OUTBYTES]) {
- TBlake2B<instructionSet> hash(BLAKE2B_OUTBYTES);
- hash.Update(in, BLAKE2B_OUTBYTES);
- hash.Final(out, BLAKE2B_OUTBYTES);
- }
-
- static void ExpandBlockhash_(ui8 expanded[ARGON2_BLOCK_SIZE], const ui8 blockhash[ARGON2_PREHASH_SEED_LENGTH]) {
- ui8 out_buffer[BLAKE2B_OUTBYTES];
- ui8 in_buffer[BLAKE2B_OUTBYTES];
- const ui32 HALF_OUT_BYTES = BLAKE2B_OUTBYTES / 2;
- const ui32 HASH_BLOCKS_COUNT = ((ARGON2_BLOCK_SIZE / HALF_OUT_BYTES));
-
- TBlake2B<instructionSet> hash(BLAKE2B_OUTBYTES);
- hash.Update(ARGON2_BLOCK_SIZE);
- hash.Update(blockhash, ARGON2_PREHASH_SEED_LENGTH);
- hash.Final(out_buffer, BLAKE2B_OUTBYTES);
-
- memcpy(expanded, out_buffer, HALF_OUT_BYTES);
-
- for (ui32 i = 1; i < HASH_BLOCKS_COUNT - 2; ++i) {
- memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
- Blake2BHash64_(out_buffer, in_buffer);
- memcpy(expanded + (i * HALF_OUT_BYTES), out_buffer, HALF_OUT_BYTES);
- }
-
- Blake2BHash64_(in_buffer, out_buffer);
- memcpy(expanded + HALF_OUT_BYTES * (HASH_BLOCKS_COUNT - 2), in_buffer, BLAKE2B_OUTBYTES);
- }
-
- static void Blake2BLong_(ui8* out, ui32 outlen, const ui8* in, ui32 inlen) {
- if (outlen < BLAKE2B_OUTBYTES) {
- TBlake2B<instructionSet> hash(outlen);
- hash.Update(outlen);
- hash.Update(in, inlen);
- hash.Final(out, outlen);
- } else {
- ui8 out_buffer[BLAKE2B_OUTBYTES];
- ui8 in_buffer[BLAKE2B_OUTBYTES];
- ui32 toproduce = outlen - BLAKE2B_OUTBYTES / 2;
-
+ const ui32 ARGON2_PREHASH_SEED_LENGTH = 72;
+ const ui32 ARGON2_BLOCK_SIZE = 1024;
+ const ui32 ARGON2_QWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8;
+ const ui32 ARGON2_OWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 16;
+ const ui32 ARGON2_HWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 32;
+ const ui32 ARGON2_ADDRESSES_IN_BLOCK = 128;
+ const ui32 ARGON2_SYNC_POINTS = 4;
+ const ui32 ARGON2_SALT_MIN_LEN = 8;
+ const ui32 ARGON2_MIN_OUTLEN = 4;
+
+ struct TBlock {
+ ui64 V[ARGON2_QWORDS_IN_BLOCK];
+ };
+
+ template <EInstructionSet instructionSet, ui32 mcost, ui32 threads>
+ class TArgon2: public IArgon2Base {
+ public:
+ TArgon2(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
+ : SecretLen_(keylen)
+ , Tcost_(tcost)
+ , Atype_(atype)
+ {
+ if (SecretLen_)
+ memcpy(Secret_, key, keylen);
+ }
+
+ virtual ~TArgon2() override {
+ if (SecretLen_) {
+ SecureZeroMemory_(Secret_, SecretLen_);
+ SecretLen_ = 0;
+ }
+ }
+
+ virtual void Hash(const ui8* pwd, ui32 pwdlen, const ui8* salt, ui32 saltlen,
+ ui8* out, ui32 outlen, const ui8* aad = nullptr, ui32 aadlen = 0) const override {
+ TArrayHolder<TBlock> buffer(new TBlock[MemoryBlocks_]);
+ InternalHash_(buffer.Get(), pwd, pwdlen, salt, saltlen, out, outlen, aad, aadlen);
+ }
+
+ virtual bool Verify(const ui8* pwd, ui32 pwdlen, const ui8* salt, ui32 saltlen,
+ const ui8* hash, ui32 hashlen, const ui8* aad = nullptr, ui32 aadlen = 0) const override {
+ TArrayHolder<ui8> hashResult(new ui8[hashlen]);
+ Hash(pwd, pwdlen, salt, saltlen, hashResult.Get(), hashlen, aad, aadlen);
+
+ return SecureCompare_(hash, hashResult.Get(), hashlen);
+ }
+
+ virtual void HashWithCustomMemory(ui8* memory, size_t mlen, const ui8* pwd, ui32 pwdlen,
+ const ui8* salt, ui32 saltlen, ui8* out, ui32 outlen,
+ const ui8* aad = nullptr, ui32 aadlen = 0) const override {
+ if (memory == nullptr || mlen < sizeof(TBlock) * MemoryBlocks_)
+ ythrow yexception() << "memory is null or its size is not enough";
+
+ InternalHash_((TBlock*)memory, pwd, pwdlen, salt, saltlen, out, outlen, aad, aadlen);
+ }
+
+ virtual bool VerifyWithCustomMemory(ui8* memory, size_t mlen, const ui8* pwd, ui32 pwdlen,
+ const ui8* salt, ui32 saltlen, const ui8* hash, ui32 hashlen,
+ const ui8* aad = nullptr, ui32 aadlen = 0) const override {
+ TArrayHolder<ui8> hashResult(new ui8[hashlen]);
+ HashWithCustomMemory(memory, mlen, pwd, pwdlen, salt, saltlen, hashResult.Get(), hashlen, aad, aadlen);
+
+ return SecureCompare_(hashResult.Get(), hash, hashlen);
+ }
+
+ virtual size_t GetMemorySize() const override {
+ return MemoryBlocks_ * sizeof(TBlock);
+ }
+
+ protected: /* Constants */
+ ui8 Secret_[ARGON2_SECRET_MAX_LENGTH] = {0};
+ ui32 SecretLen_ = 0;
+ ui32 Tcost_;
+ EArgon2Type Atype_;
+
+ static constexpr ui32 Lanes_ = threads;
+ static constexpr ui32 MemoryBlocks_ = (mcost >= 2 * ARGON2_SYNC_POINTS * Lanes_) ? (mcost - mcost % (Lanes_ * ARGON2_SYNC_POINTS)) : 2 * ARGON2_SYNC_POINTS * Lanes_;
+ static constexpr ui32 SegmentLength_ = MemoryBlocks_ / (Lanes_ * ARGON2_SYNC_POINTS);
+ static constexpr ui32 LaneLength_ = SegmentLength_ * ARGON2_SYNC_POINTS;
+
+ protected: /* Prototypes */
+ virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock,
+ TBlock* nextBlock, bool withXor) const = 0;
+
+ virtual void CopyBlock_(TBlock* dst, const TBlock* src) const = 0;
+ virtual void XorBlock_(TBlock* dst, const TBlock* src) const = 0;
+
+ protected: /* Static functions */
+ static bool SecureCompare_(const ui8* buffer1, const ui8* buffer2, ui32 len) {
+ bool result = true;
+ for (ui32 i = 0; i < len; ++i) {
+ result &= (buffer1[i] == buffer2[i]);
+ }
+ return result;
+ }
+
+ static void SecureZeroMemory_(void* src, size_t len) {
+ static void* (*const volatile memset_v)(void*, int, size_t) = &memset;
+ memset_v(src, 0, len);
+ }
+
+ static void Store32_(ui32 value, void* mem) {
+ *((ui32*)mem) = value;
+ }
+
+ static void Blake2BHash64_(ui8 out[BLAKE2B_OUTBYTES], const ui8 in[BLAKE2B_OUTBYTES]) {
+ TBlake2B<instructionSet> hash(BLAKE2B_OUTBYTES);
+ hash.Update(in, BLAKE2B_OUTBYTES);
+ hash.Final(out, BLAKE2B_OUTBYTES);
+ }
+
+ static void ExpandBlockhash_(ui8 expanded[ARGON2_BLOCK_SIZE], const ui8 blockhash[ARGON2_PREHASH_SEED_LENGTH]) {
+ ui8 out_buffer[BLAKE2B_OUTBYTES];
+ ui8 in_buffer[BLAKE2B_OUTBYTES];
+ const ui32 HALF_OUT_BYTES = BLAKE2B_OUTBYTES / 2;
+ const ui32 HASH_BLOCKS_COUNT = ((ARGON2_BLOCK_SIZE / HALF_OUT_BYTES));
+
+ TBlake2B<instructionSet> hash(BLAKE2B_OUTBYTES);
+ hash.Update(ARGON2_BLOCK_SIZE);
+ hash.Update(blockhash, ARGON2_PREHASH_SEED_LENGTH);
+ hash.Final(out_buffer, BLAKE2B_OUTBYTES);
+
+ memcpy(expanded, out_buffer, HALF_OUT_BYTES);
+
+ for (ui32 i = 1; i < HASH_BLOCKS_COUNT - 2; ++i) {
+ memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+ Blake2BHash64_(out_buffer, in_buffer);
+ memcpy(expanded + (i * HALF_OUT_BYTES), out_buffer, HALF_OUT_BYTES);
+ }
+
+ Blake2BHash64_(in_buffer, out_buffer);
+ memcpy(expanded + HALF_OUT_BYTES * (HASH_BLOCKS_COUNT - 2), in_buffer, BLAKE2B_OUTBYTES);
+ }
+
+ static void Blake2BLong_(ui8* out, ui32 outlen, const ui8* in, ui32 inlen) {
+ if (outlen < BLAKE2B_OUTBYTES) {
+ TBlake2B<instructionSet> hash(outlen);
+ hash.Update(outlen);
+ hash.Update(in, inlen);
+ hash.Final(out, outlen);
+ } else {
+ ui8 out_buffer[BLAKE2B_OUTBYTES];
+ ui8 in_buffer[BLAKE2B_OUTBYTES];
+ ui32 toproduce = outlen - BLAKE2B_OUTBYTES / 2;
+
TBlake2B<instructionSet> hash1(BLAKE2B_OUTBYTES);
hash1.Update(outlen);
hash1.Update(in, inlen);
hash1.Final(out_buffer, BLAKE2B_OUTBYTES);
-
- memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
- out += BLAKE2B_OUTBYTES / 2;
-
- while (toproduce > BLAKE2B_OUTBYTES) {
- memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+
+ memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+ out += BLAKE2B_OUTBYTES / 2;
+
+ while (toproduce > BLAKE2B_OUTBYTES) {
+ memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
TBlake2B<instructionSet> hash2(BLAKE2B_OUTBYTES);
hash2.Update(in_buffer, BLAKE2B_OUTBYTES);
hash2.Final(out_buffer, BLAKE2B_OUTBYTES);
- memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
- out += BLAKE2B_OUTBYTES / 2;
- toproduce -= BLAKE2B_OUTBYTES / 2;
- }
-
- memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
- {
+ memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+ out += BLAKE2B_OUTBYTES / 2;
+ toproduce -= BLAKE2B_OUTBYTES / 2;
+ }
+
+ memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+ {
TBlake2B<instructionSet> hash3(toproduce);
hash3.Update(in_buffer, BLAKE2B_OUTBYTES);
hash3.Final(out_buffer, toproduce);
- memcpy(out, out_buffer, toproduce);
- }
- }
- }
-
- static void InitBlockValue_(TBlock* b, ui8 in) {
- memset(b->V, in, sizeof(b->V));
- }
-
- protected: /* Functions */
- void InternalHash_(TBlock* memory, const ui8* pwd, ui32 pwdlen,
- const ui8* salt, ui32 saltlen, ui8* out, ui32 outlen,
- const ui8* aad, ui32 aadlen) const {
- /*
- * all parameters checks are in proxy objects
- */
-
- Initialize_(memory, outlen, pwd, pwdlen, salt, saltlen, aad, aadlen);
- FillMemoryBlocks_(memory);
- Finalize_(memory, out, outlen);
- }
-
- void InitialHash_(ui8 blockhash[ARGON2_PREHASH_DIGEST_LENGTH],
- ui32 outlen, const ui8* pwd, ui32 pwdlen,
- const ui8* salt, ui32 saltlen, const ui8* aad, ui32 aadlen) const {
- TBlake2B<instructionSet> hash(ARGON2_PREHASH_DIGEST_LENGTH);
- /* lanes, but lanes == threads */
- hash.Update(Lanes_);
- /* outlen */
- hash.Update(outlen);
- /* m_cost */
- hash.Update(mcost);
- /* t_cost */
- hash.Update(Tcost_);
- /* version */
- hash.Update(0x00000013);
- /* Argon2 type */
- hash.Update((ui32)Atype_);
- /* pwdlen */
- hash.Update(pwdlen);
- /* pwd */
- hash.Update(pwd, pwdlen);
- /* saltlen */
- hash.Update(saltlen);
- /* salt */
- if (saltlen)
- hash.Update(salt, saltlen);
- /* secret */
- hash.Update(SecretLen_);
- if (SecretLen_)
- hash.Update((void*)Secret_, SecretLen_);
- /* aadlen */
- hash.Update(aadlen);
- if (aadlen)
- hash.Update((void*)aad, aadlen);
- hash.Final(blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
- }
-
- void FillFirstBlocks_(TBlock* blocks, ui8* blockhash) const {
- for (ui32 l = 0; l < Lanes_; l++) {
- /* fill the first block of the lane */
- Store32_(l, blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4);
- Store32_(0, blockhash + ARGON2_PREHASH_DIGEST_LENGTH);
- ExpandBlockhash_((ui8*)&(blocks[l * LaneLength_]), blockhash);
-
- /* fill the second block of the lane */
- Store32_(1, blockhash + ARGON2_PREHASH_DIGEST_LENGTH);
- ExpandBlockhash_((ui8*)&(blocks[l * LaneLength_ + 1]), blockhash);
- }
- }
-
- /* The 'if' will be optimized out as the number of threads is known at the compile time */
- void FillMemoryBlocks_(TBlock* memory) const {
- for (ui32 t = 0; t < Tcost_; ++t) {
- for (ui32 s = 0; s < ARGON2_SYNC_POINTS; ++s) {
- if (Lanes_ == 1)
- FillSegment_(memory, t, 0, s);
- else {
- NYmp::SetThreadCount(Lanes_);
- NYmp::ParallelForStaticAutoChunk<ui32>(0, Lanes_, [this, &memory, s, t](int k) {
- this->FillSegment_(memory, t, k, s);
- });
- }
- }
- }
- }
-
- void Initialize_(TBlock* memory, ui32 outlen, const ui8* pwd, ui32 pwdlen,
- const ui8* salt, ui32 saltlen, const ui8* aad, ui32 aadlen) const {
- ui8 blockhash[ARGON2_PREHASH_SEED_LENGTH];
- InitialHash_(blockhash, outlen, pwd, pwdlen, salt, saltlen, aad, aadlen);
- FillFirstBlocks_(memory, blockhash);
- }
-
- ui32 ComputeReferenceArea_(ui32 pass, ui32 slice, ui32 index, bool sameLane) const {
- ui32 passVal = pass == 0 ? (slice * SegmentLength_) : (LaneLength_ - SegmentLength_);
- return sameLane ? passVal + (index - 1) : passVal + (index == 0 ? -1 : 0);
- }
-
- ui32 IndexAlpha_(ui32 pass, ui32 slice, ui32 index, ui32 pseudoRand, bool sameLane) const {
- ui32 referenceAreaSize = ComputeReferenceArea_(pass, slice, index, sameLane);
-
- ui64 relativePosition = pseudoRand;
- relativePosition = relativePosition * relativePosition >> 32;
- relativePosition = referenceAreaSize - 1 - (referenceAreaSize * relativePosition >> 32);
-
- ui32 startPosition = 0;
- if (pass != 0)
- startPosition = (slice == ARGON2_SYNC_POINTS - 1) ? 0 : (slice + 1) * SegmentLength_;
-
- return (ui32)((startPosition + relativePosition) % LaneLength_);
- }
-
- void NextAddresses_(TBlock* addressBlock, TBlock* inputBlock, const TBlock* zeroBlock) const {
- inputBlock->V[6]++;
- FillBlock_(zeroBlock, inputBlock, addressBlock, false);
- FillBlock_(zeroBlock, addressBlock, addressBlock, false);
- }
-
- void Finalize_(const TBlock* memory, ui8* out, ui32 outlen) const {
- TBlock blockhash;
- CopyBlock_(&blockhash, memory + LaneLength_ - 1);
-
- /* XOR the last blocks */
- for (ui32 l = 1; l < Lanes_; ++l) {
- ui32 lastBlockInLane = l * LaneLength_ + (LaneLength_ - 1);
- XorBlock_(&blockhash, memory + lastBlockInLane);
- }
-
- Blake2BLong_(out, outlen, (ui8*)blockhash.V, ARGON2_BLOCK_SIZE);
- }
-
- /* The switch will be optimized out by the compiler as the type is known at the compile time */
- void FillSegment_(TBlock* memory, ui32 pass, ui32 lane, ui32 slice) const {
- switch (Atype_) {
- case EArgon2Type::Argon2d:
- FillSegmentD_(memory, pass, lane, slice);
- return;
- case EArgon2Type::Argon2i:
- FillSegmentI_(memory, pass, lane, slice, EArgon2Type::Argon2i);
- return;
- case EArgon2Type::Argon2id:
- if (pass == 0 && slice < ARGON2_SYNC_POINTS / 2)
- FillSegmentI_(memory, pass, lane, slice, EArgon2Type::Argon2id);
- else
- FillSegmentD_(memory, pass, lane, slice);
- return;
- }
- }
-
- void FillSegmentD_(TBlock* memory, ui32 pass, ui32 lane, ui32 slice) const {
- ui32 startingIndex = (pass == 0 && slice == 0) ? 2 : 0;
- ui32 currOffset = lane * LaneLength_ + slice * SegmentLength_ + startingIndex;
- ui32 prevOffset = currOffset + ((currOffset % LaneLength_ == 0) ? LaneLength_ : 0) - 1;
-
- for (ui32 i = startingIndex; i < SegmentLength_; ++i, ++currOffset, ++prevOffset) {
- if (currOffset % LaneLength_ == 1) {
- prevOffset = currOffset - 1;
- }
-
- ui64 pseudoRand = memory[prevOffset].V[0];
- ui64 refLane = (pass == 0 && slice == 0) ? lane : (((pseudoRand >> 32)) % Lanes_);
- ui64 refIndex = IndexAlpha_(pass, slice, i, (ui32)(pseudoRand & 0xFFFFFFFF), refLane == lane);
-
- TBlock* refBlock = memory + LaneLength_ * refLane + refIndex;
- FillBlock_(memory + prevOffset, refBlock, memory + currOffset, pass != 0);
- }
- }
-
- void FillSegmentI_(TBlock* memory, ui32 pass, ui32 lane, ui32 slice, EArgon2Type atp) const {
- TBlock addressBlock, inputBlock, zeroBlock;
- InitBlockValue_(&zeroBlock, 0);
- InitBlockValue_(&inputBlock, 0);
-
- inputBlock.V[0] = pass;
- inputBlock.V[1] = lane;
- inputBlock.V[2] = slice;
- inputBlock.V[3] = MemoryBlocks_;
- inputBlock.V[4] = Tcost_;
- inputBlock.V[5] = (ui64)atp;
-
- ui32 startingIndex = 0;
-
- if (pass == 0 && slice == 0) {
- startingIndex = 2;
- NextAddresses_(&addressBlock, &inputBlock, &zeroBlock);
- }
-
- ui32 currOffset = lane * LaneLength_ + slice * SegmentLength_ + startingIndex;
- ui32 prevOffset = currOffset + ((currOffset % LaneLength_ == 0) ? LaneLength_ : 0) - 1;
-
- for (ui32 i = startingIndex; i < SegmentLength_; ++i, ++currOffset, ++prevOffset) {
- if (currOffset % LaneLength_ == 1) {
- prevOffset = currOffset - 1;
- }
-
- if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
- NextAddresses_(&addressBlock, &inputBlock, &zeroBlock);
- }
-
- ui64 pseudoRand = addressBlock.V[i % ARGON2_ADDRESSES_IN_BLOCK];
- ui64 refLane = (pass == 0 && slice == 0) ? lane : (((pseudoRand >> 32)) % Lanes_);
- ui64 refIndex = IndexAlpha_(pass, slice, i, (ui32)(pseudoRand & 0xFFFFFFFF), refLane == lane);
-
- TBlock* refBlock = memory + LaneLength_ * refLane + refIndex;
- FillBlock_(memory + prevOffset, refBlock, memory + currOffset, pass != 0);
- }
- }
- };
-}
+ memcpy(out, out_buffer, toproduce);
+ }
+ }
+ }
+
+ static void InitBlockValue_(TBlock* b, ui8 in) {
+ memset(b->V, in, sizeof(b->V));
+ }
+
+ protected: /* Functions */
+ void InternalHash_(TBlock* memory, const ui8* pwd, ui32 pwdlen,
+ const ui8* salt, ui32 saltlen, ui8* out, ui32 outlen,
+ const ui8* aad, ui32 aadlen) const {
+ /*
+ * all parameters checks are in proxy objects
+ */
+
+ Initialize_(memory, outlen, pwd, pwdlen, salt, saltlen, aad, aadlen);
+ FillMemoryBlocks_(memory);
+ Finalize_(memory, out, outlen);
+ }
+
+ void InitialHash_(ui8 blockhash[ARGON2_PREHASH_DIGEST_LENGTH],
+ ui32 outlen, const ui8* pwd, ui32 pwdlen,
+ const ui8* salt, ui32 saltlen, const ui8* aad, ui32 aadlen) const {
+ TBlake2B<instructionSet> hash(ARGON2_PREHASH_DIGEST_LENGTH);
+ /* lanes, but lanes == threads */
+ hash.Update(Lanes_);
+ /* outlen */
+ hash.Update(outlen);
+ /* m_cost */
+ hash.Update(mcost);
+ /* t_cost */
+ hash.Update(Tcost_);
+ /* version */
+ hash.Update(0x00000013);
+ /* Argon2 type */
+ hash.Update((ui32)Atype_);
+ /* pwdlen */
+ hash.Update(pwdlen);
+ /* pwd */
+ hash.Update(pwd, pwdlen);
+ /* saltlen */
+ hash.Update(saltlen);
+ /* salt */
+ if (saltlen)
+ hash.Update(salt, saltlen);
+ /* secret */
+ hash.Update(SecretLen_);
+ if (SecretLen_)
+ hash.Update((void*)Secret_, SecretLen_);
+ /* aadlen */
+ hash.Update(aadlen);
+ if (aadlen)
+ hash.Update((void*)aad, aadlen);
+ hash.Final(blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
+ }
+
+ void FillFirstBlocks_(TBlock* blocks, ui8* blockhash) const {
+ for (ui32 l = 0; l < Lanes_; l++) {
+ /* fill the first block of the lane */
+ Store32_(l, blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4);
+ Store32_(0, blockhash + ARGON2_PREHASH_DIGEST_LENGTH);
+ ExpandBlockhash_((ui8*)&(blocks[l * LaneLength_]), blockhash);
+
+ /* fill the second block of the lane */
+ Store32_(1, blockhash + ARGON2_PREHASH_DIGEST_LENGTH);
+ ExpandBlockhash_((ui8*)&(blocks[l * LaneLength_ + 1]), blockhash);
+ }
+ }
+
+ /* The 'if' will be optimized out as the number of threads is known at the compile time */
+ void FillMemoryBlocks_(TBlock* memory) const {
+ for (ui32 t = 0; t < Tcost_; ++t) {
+ for (ui32 s = 0; s < ARGON2_SYNC_POINTS; ++s) {
+ if (Lanes_ == 1)
+ FillSegment_(memory, t, 0, s);
+ else {
+ NYmp::SetThreadCount(Lanes_);
+ NYmp::ParallelForStaticAutoChunk<ui32>(0, Lanes_, [this, &memory, s, t](int k) {
+ this->FillSegment_(memory, t, k, s);
+ });
+ }
+ }
+ }
+ }
+
+ void Initialize_(TBlock* memory, ui32 outlen, const ui8* pwd, ui32 pwdlen,
+ const ui8* salt, ui32 saltlen, const ui8* aad, ui32 aadlen) const {
+ ui8 blockhash[ARGON2_PREHASH_SEED_LENGTH];
+ InitialHash_(blockhash, outlen, pwd, pwdlen, salt, saltlen, aad, aadlen);
+ FillFirstBlocks_(memory, blockhash);
+ }
+
+ ui32 ComputeReferenceArea_(ui32 pass, ui32 slice, ui32 index, bool sameLane) const {
+ ui32 passVal = pass == 0 ? (slice * SegmentLength_) : (LaneLength_ - SegmentLength_);
+ return sameLane ? passVal + (index - 1) : passVal + (index == 0 ? -1 : 0);
+ }
+
+ ui32 IndexAlpha_(ui32 pass, ui32 slice, ui32 index, ui32 pseudoRand, bool sameLane) const {
+ ui32 referenceAreaSize = ComputeReferenceArea_(pass, slice, index, sameLane);
+
+ ui64 relativePosition = pseudoRand;
+ relativePosition = relativePosition * relativePosition >> 32;
+ relativePosition = referenceAreaSize - 1 - (referenceAreaSize * relativePosition >> 32);
+
+ ui32 startPosition = 0;
+ if (pass != 0)
+ startPosition = (slice == ARGON2_SYNC_POINTS - 1) ? 0 : (slice + 1) * SegmentLength_;
+
+ return (ui32)((startPosition + relativePosition) % LaneLength_);
+ }
+
+ void NextAddresses_(TBlock* addressBlock, TBlock* inputBlock, const TBlock* zeroBlock) const {
+ inputBlock->V[6]++;
+ FillBlock_(zeroBlock, inputBlock, addressBlock, false);
+ FillBlock_(zeroBlock, addressBlock, addressBlock, false);
+ }
+
+ void Finalize_(const TBlock* memory, ui8* out, ui32 outlen) const {
+ TBlock blockhash;
+ CopyBlock_(&blockhash, memory + LaneLength_ - 1);
+
+ /* XOR the last blocks */
+ for (ui32 l = 1; l < Lanes_; ++l) {
+ ui32 lastBlockInLane = l * LaneLength_ + (LaneLength_ - 1);
+ XorBlock_(&blockhash, memory + lastBlockInLane);
+ }
+
+ Blake2BLong_(out, outlen, (ui8*)blockhash.V, ARGON2_BLOCK_SIZE);
+ }
+
+ /* The switch will be optimized out by the compiler as the type is known at the compile time */
+ void FillSegment_(TBlock* memory, ui32 pass, ui32 lane, ui32 slice) const {
+ switch (Atype_) {
+ case EArgon2Type::Argon2d:
+ FillSegmentD_(memory, pass, lane, slice);
+ return;
+ case EArgon2Type::Argon2i:
+ FillSegmentI_(memory, pass, lane, slice, EArgon2Type::Argon2i);
+ return;
+ case EArgon2Type::Argon2id:
+ if (pass == 0 && slice < ARGON2_SYNC_POINTS / 2)
+ FillSegmentI_(memory, pass, lane, slice, EArgon2Type::Argon2id);
+ else
+ FillSegmentD_(memory, pass, lane, slice);
+ return;
+ }
+ }
+
+ void FillSegmentD_(TBlock* memory, ui32 pass, ui32 lane, ui32 slice) const {
+ ui32 startingIndex = (pass == 0 && slice == 0) ? 2 : 0;
+ ui32 currOffset = lane * LaneLength_ + slice * SegmentLength_ + startingIndex;
+ ui32 prevOffset = currOffset + ((currOffset % LaneLength_ == 0) ? LaneLength_ : 0) - 1;
+
+ for (ui32 i = startingIndex; i < SegmentLength_; ++i, ++currOffset, ++prevOffset) {
+ if (currOffset % LaneLength_ == 1) {
+ prevOffset = currOffset - 1;
+ }
+
+ ui64 pseudoRand = memory[prevOffset].V[0];
+ ui64 refLane = (pass == 0 && slice == 0) ? lane : (((pseudoRand >> 32)) % Lanes_);
+ ui64 refIndex = IndexAlpha_(pass, slice, i, (ui32)(pseudoRand & 0xFFFFFFFF), refLane == lane);
+
+ TBlock* refBlock = memory + LaneLength_ * refLane + refIndex;
+ FillBlock_(memory + prevOffset, refBlock, memory + currOffset, pass != 0);
+ }
+ }
+
+ void FillSegmentI_(TBlock* memory, ui32 pass, ui32 lane, ui32 slice, EArgon2Type atp) const {
+ TBlock addressBlock, inputBlock, zeroBlock;
+ InitBlockValue_(&zeroBlock, 0);
+ InitBlockValue_(&inputBlock, 0);
+
+ inputBlock.V[0] = pass;
+ inputBlock.V[1] = lane;
+ inputBlock.V[2] = slice;
+ inputBlock.V[3] = MemoryBlocks_;
+ inputBlock.V[4] = Tcost_;
+ inputBlock.V[5] = (ui64)atp;
+
+ ui32 startingIndex = 0;
+
+ if (pass == 0 && slice == 0) {
+ startingIndex = 2;
+ NextAddresses_(&addressBlock, &inputBlock, &zeroBlock);
+ }
+
+ ui32 currOffset = lane * LaneLength_ + slice * SegmentLength_ + startingIndex;
+ ui32 prevOffset = currOffset + ((currOffset % LaneLength_ == 0) ? LaneLength_ : 0) - 1;
+
+ for (ui32 i = startingIndex; i < SegmentLength_; ++i, ++currOffset, ++prevOffset) {
+ if (currOffset % LaneLength_ == 1) {
+ prevOffset = currOffset - 1;
+ }
+
+ if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
+ NextAddresses_(&addressBlock, &inputBlock, &zeroBlock);
+ }
+
+ ui64 pseudoRand = addressBlock.V[i % ARGON2_ADDRESSES_IN_BLOCK];
+ ui64 refLane = (pass == 0 && slice == 0) ? lane : (((pseudoRand >> 32)) % Lanes_);
+ ui64 refIndex = IndexAlpha_(pass, slice, i, (ui32)(pseudoRand & 0xFFFFFFFF), refLane == lane);
+
+ TBlock* refBlock = memory + LaneLength_ * refLane + refIndex;
+ FillBlock_(memory + prevOffset, refBlock, memory + currOffset, pass != 0);
+ }
+ }
+ };
+}
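
Note: the memory geometry in TArgon2 is fixed at compile time: mcost is clamped to at least 2 * ARGON2_SYNC_POINTS * Lanes_ blocks, rounded down to a multiple of Lanes_ * ARGON2_SYNC_POINTS, and each lane is split into four sync-point segments. A small sketch reproducing that arithmetic for concrete values (names are hypothetical; the formulas mirror the constants above):

    #include <cstdint>
    #include <cstdio>

    constexpr uint32_t SyncPoints = 4; // ARGON2_SYNC_POINTS

    // Mirrors MemoryBlocks_ from TArgon2 for a given mcost (in 1 KiB blocks)
    // and lane count.
    constexpr uint32_t MemoryBlocks(uint32_t mcost, uint32_t lanes) {
        return mcost >= 2 * SyncPoints * lanes
                   ? mcost - mcost % (lanes * SyncPoints) // round down to a multiple
                   : 2 * SyncPoints * lanes;              // enforce the minimum
    }

    int main() {
        constexpr uint32_t lanes = 2;
        constexpr uint32_t m = MemoryBlocks(1u << 10, lanes);  // 1024 blocks = 1 MiB
        constexpr uint32_t segment = m / (lanes * SyncPoints); // 128 blocks per segment
        constexpr uint32_t lane = segment * SyncPoints;        // 512 blocks per lane
        std::printf("blocks=%u segment=%u lane=%u\n", m, segment, lane);
    }
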
diff --git a/library/cpp/digest/argonish/internal/argon2/argon2_ref.h b/library/cpp/digest/argonish/internal/argon2/argon2_ref.h
index 8e5e3fa971..d0635b71ee 100644
--- a/library/cpp/digest/argonish/internal/argon2/argon2_ref.h
+++ b/library/cpp/digest/argonish/internal/argon2/argon2_ref.h
@@ -1,88 +1,88 @@
-#pragma once
-
-#include "argon2_base.h"
+#pragma once
+
+#include "argon2_base.h"
#include <library/cpp/digest/argonish/internal/rotations/rotations_ref.h>
-
-namespace NArgonish {
- static inline ui64 FBlaMka(ui64 x, ui64 y) {
- const ui64 m = 0xFFFFFFFF;
- const ui64 xy = (x & m) * (y & m);
- return x + y + 2 * xy;
- }
-
- static inline void BlamkaGRef(ui64& a, ui64& b, ui64& c, ui64& d) {
- a = FBlaMka(a, b);
- d = Rotr(d ^ a, 32);
- c = FBlaMka(c, d);
- b = Rotr(b ^ c, 24);
- a = FBlaMka(a, b);
- d = Rotr(d ^ a, 16);
- c = FBlaMka(c, d);
- b = Rotr(b ^ c, 63);
- }
-
- static inline void BlamkaRoundRef(
- ui64& v0, ui64& v1, ui64& v2, ui64& v3,
- ui64& v4, ui64& v5, ui64& v6, ui64& v7,
- ui64& v8, ui64& v9, ui64& v10, ui64& v11,
- ui64& v12, ui64& v13, ui64& v14, ui64& v15) {
- BlamkaGRef(v0, v4, v8, v12);
- BlamkaGRef(v1, v5, v9, v13);
- BlamkaGRef(v2, v6, v10, v14);
- BlamkaGRef(v3, v7, v11, v15);
- BlamkaGRef(v0, v5, v10, v15);
- BlamkaGRef(v1, v6, v11, v12);
- BlamkaGRef(v2, v7, v8, v13);
- BlamkaGRef(v3, v4, v9, v14);
- }
-
- template <ui32 mcost, ui32 threads>
- class TArgon2REF final: public TArgon2<EInstructionSet::REF, mcost, threads> {
- public:
- TArgon2REF(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
- : TArgon2<EInstructionSet::REF, mcost, threads>(atype, tcost, key, keylen)
- {
- }
-
- protected:
- virtual void XorBlock_(TBlock* dst, const TBlock* src) const override {
- for (ui32 i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
- dst->V[i] ^= src->V[i];
- }
- }
-
- virtual void CopyBlock_(TBlock* dst, const TBlock* src) const override {
- memcpy(dst->V, src->V, sizeof(ui64) * ARGON2_QWORDS_IN_BLOCK);
- }
-
- virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock, TBlock* nextBlock, bool withXor) const override {
- TBlock blockR, blockTmp;
- CopyBlock_(&blockR, refBlock);
- XorBlock_(&blockR, prevBlock);
- CopyBlock_(&blockTmp, &blockR);
-
- if (withXor) {
- XorBlock_(&blockTmp, nextBlock);
- }
-
- for (ui32 i = 0; i < 8; ++i) {
- BlamkaRoundRef(
- blockR.V[16 * i + 0], blockR.V[16 * i + 1], blockR.V[16 * i + 2], blockR.V[16 * i + 3],
- blockR.V[16 * i + 4], blockR.V[16 * i + 5], blockR.V[16 * i + 6], blockR.V[16 * i + 7],
- blockR.V[16 * i + 8], blockR.V[16 * i + 9], blockR.V[16 * i + 10], blockR.V[16 * i + 11],
- blockR.V[16 * i + 12], blockR.V[16 * i + 13], blockR.V[16 * i + 14], blockR.V[16 * i + 15]);
- }
-
- for (ui32 i = 0; i < 8; ++i) {
- BlamkaRoundRef(
- blockR.V[2 * i + 0], blockR.V[2 * i + 1], blockR.V[2 * i + 16], blockR.V[2 * i + 17],
- blockR.V[2 * i + 32], blockR.V[2 * i + 33], blockR.V[2 * i + 48], blockR.V[2 * i + 49],
- blockR.V[2 * i + 64], blockR.V[2 * i + 65], blockR.V[2 * i + 80], blockR.V[2 * i + 81],
- blockR.V[2 * i + 96], blockR.V[2 * i + 97], blockR.V[2 * i + 112], blockR.V[2 * i + 113]);
- }
-
- CopyBlock_(nextBlock, &blockTmp);
- XorBlock_(nextBlock, &blockR);
- }
- };
-}
+
+namespace NArgonish {
+ static inline ui64 FBlaMka(ui64 x, ui64 y) {
+ const ui64 m = 0xFFFFFFFF;
+ const ui64 xy = (x & m) * (y & m);
+ return x + y + 2 * xy;
+ }
+
+ static inline void BlamkaGRef(ui64& a, ui64& b, ui64& c, ui64& d) {
+ a = FBlaMka(a, b);
+ d = Rotr(d ^ a, 32);
+ c = FBlaMka(c, d);
+ b = Rotr(b ^ c, 24);
+ a = FBlaMka(a, b);
+ d = Rotr(d ^ a, 16);
+ c = FBlaMka(c, d);
+ b = Rotr(b ^ c, 63);
+ }
+
+ static inline void BlamkaRoundRef(
+ ui64& v0, ui64& v1, ui64& v2, ui64& v3,
+ ui64& v4, ui64& v5, ui64& v6, ui64& v7,
+ ui64& v8, ui64& v9, ui64& v10, ui64& v11,
+ ui64& v12, ui64& v13, ui64& v14, ui64& v15) {
+ BlamkaGRef(v0, v4, v8, v12);
+ BlamkaGRef(v1, v5, v9, v13);
+ BlamkaGRef(v2, v6, v10, v14);
+ BlamkaGRef(v3, v7, v11, v15);
+ BlamkaGRef(v0, v5, v10, v15);
+ BlamkaGRef(v1, v6, v11, v12);
+ BlamkaGRef(v2, v7, v8, v13);
+ BlamkaGRef(v3, v4, v9, v14);
+ }
+
+ template <ui32 mcost, ui32 threads>
+ class TArgon2REF final: public TArgon2<EInstructionSet::REF, mcost, threads> {
+ public:
+ TArgon2REF(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
+ : TArgon2<EInstructionSet::REF, mcost, threads>(atype, tcost, key, keylen)
+ {
+ }
+
+ protected:
+ virtual void XorBlock_(TBlock* dst, const TBlock* src) const override {
+ for (ui32 i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
+ dst->V[i] ^= src->V[i];
+ }
+ }
+
+ virtual void CopyBlock_(TBlock* dst, const TBlock* src) const override {
+ memcpy(dst->V, src->V, sizeof(ui64) * ARGON2_QWORDS_IN_BLOCK);
+ }
+
+ virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock, TBlock* nextBlock, bool withXor) const override {
+ TBlock blockR, blockTmp;
+ CopyBlock_(&blockR, refBlock);
+ XorBlock_(&blockR, prevBlock);
+ CopyBlock_(&blockTmp, &blockR);
+
+ if (withXor) {
+ XorBlock_(&blockTmp, nextBlock);
+ }
+
+ for (ui32 i = 0; i < 8; ++i) {
+ BlamkaRoundRef(
+ blockR.V[16 * i + 0], blockR.V[16 * i + 1], blockR.V[16 * i + 2], blockR.V[16 * i + 3],
+ blockR.V[16 * i + 4], blockR.V[16 * i + 5], blockR.V[16 * i + 6], blockR.V[16 * i + 7],
+ blockR.V[16 * i + 8], blockR.V[16 * i + 9], blockR.V[16 * i + 10], blockR.V[16 * i + 11],
+ blockR.V[16 * i + 12], blockR.V[16 * i + 13], blockR.V[16 * i + 14], blockR.V[16 * i + 15]);
+ }
+
+ for (ui32 i = 0; i < 8; ++i) {
+ BlamkaRoundRef(
+ blockR.V[2 * i + 0], blockR.V[2 * i + 1], blockR.V[2 * i + 16], blockR.V[2 * i + 17],
+ blockR.V[2 * i + 32], blockR.V[2 * i + 33], blockR.V[2 * i + 48], blockR.V[2 * i + 49],
+ blockR.V[2 * i + 64], blockR.V[2 * i + 65], blockR.V[2 * i + 80], blockR.V[2 * i + 81],
+ blockR.V[2 * i + 96], blockR.V[2 * i + 97], blockR.V[2 * i + 112], blockR.V[2 * i + 113]);
+ }
+
+ CopyBlock_(nextBlock, &blockTmp);
+ XorBlock_(nextBlock, &blockR);
+ }
+ };
+}
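
Note: FBlaMka above is BlaMka's modified addition from the Argon2 specification: x + y + 2 * lo32(x) * lo32(y), where only the low 32 bits of each operand enter the multiplication and all arithmetic wraps modulo 2^64. A self-contained check of the formula (illustrative only):

    #include <cassert>
    #include <cstdint>

    // Same formula as FBlaMka in argon2_ref.h.
    static inline uint64_t FBlaMkaSketch(uint64_t x, uint64_t y) {
        const uint64_t m = 0xFFFFFFFFull;
        const uint64_t xy = (x & m) * (y & m); // 32x32 -> 64-bit product
        return x + y + 2 * xy;                 // unsigned wraparound is well defined
    }

    int main() {
        // High halves never reach the product: lo32 values here are 2 and 3.
        assert(FBlaMkaSketch(0xFFFFFFFF00000002ull, 3) ==
               0xFFFFFFFF00000002ull + 3 + 2 * (2ull * 3ull));
    }
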
diff --git a/library/cpp/digest/argonish/internal/argon2/argon2_sse2.h b/library/cpp/digest/argonish/internal/argon2/argon2_sse2.h
index 1d2230a657..04fc70c56f 100644
--- a/library/cpp/digest/argonish/internal/argon2/argon2_sse2.h
+++ b/library/cpp/digest/argonish/internal/argon2/argon2_sse2.h
@@ -1,101 +1,101 @@
-#pragma once
-
-#include <emmintrin.h>
-#include "argon2_base.h"
+#pragma once
+
+#include <emmintrin.h>
+#include "argon2_base.h"
#include <library/cpp/digest/argonish/internal/blamka/blamka_sse2.h>
-
-namespace NArgonish {
- template <ui32 mcost, ui32 threads>
- class TArgon2SSE2 final: public TArgon2<EInstructionSet::SSE2, mcost, threads> {
- public:
- TArgon2SSE2(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
- : TArgon2<EInstructionSet::SSE2, mcost, threads>(atype, tcost, key, keylen)
- {
- }
-
- protected:
- virtual void XorBlock_(TBlock* dst, const TBlock* src) const override {
- __m128i* mdst = (__m128i*)dst->V;
- __m128i* msrc = (__m128i*)src->V;
-
- for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i)
- XorValues(mdst + i, msrc + i, mdst + i);
- }
-
- virtual void CopyBlock_(TBlock* dst, const TBlock* src) const override {
- memcpy(dst->V, src->V, sizeof(ui64) * ARGON2_QWORDS_IN_BLOCK);
- }
-
- virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock, TBlock* nextBlock, bool withXor) const override {
- __m128i blockxy[ARGON2_OWORDS_IN_BLOCK];
- __m128i state[ARGON2_OWORDS_IN_BLOCK];
-
- memcpy(state, prevBlock, ARGON2_BLOCK_SIZE);
-
- if (withXor) {
- for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
- state[i] = _mm_xor_si128(
- state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
- blockxy[i] = _mm_xor_si128(
- state[i], _mm_loadu_si128((const __m128i*)nextBlock->V + i));
- }
- } else {
- for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
- blockxy[i] = state[i] = _mm_xor_si128(
- state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
- }
- }
-
- for (ui32 i = 0; i < 8; ++i) {
- BlamkaG1SSE2(
- state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- BlamkaG2SSE2(
- state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- DiagonalizeSSE2(
- state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5],
- state[8 * i + 6], state[8 * i + 7]);
- BlamkaG1SSE2(
- state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- BlamkaG2SSE2(
- state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- UndiagonalizeSSE2(
- state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5],
- state[8 * i + 6], state[8 * i + 7]);
- }
-
- for (ui32 i = 0; i < 8; ++i) {
- BlamkaG1SSE2(
- state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
- BlamkaG2SSE2(
- state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
- DiagonalizeSSE2(
- state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i],
- state[8 * 6 + i], state[8 * 7 + i]);
- BlamkaG1SSE2(
- state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
- BlamkaG2SSE2(
- state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
- UndiagonalizeSSE2(
- state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i],
- state[8 * 6 + i], state[8 * 7 + i]);
- }
-
- for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
- state[i] = _mm_xor_si128(state[i], blockxy[i]);
- _mm_storeu_si128((__m128i*)nextBlock->V + i, state[i]);
- }
- }
- };
-}
+
+namespace NArgonish {
+ template <ui32 mcost, ui32 threads>
+ class TArgon2SSE2 final: public TArgon2<EInstructionSet::SSE2, mcost, threads> {
+ public:
+ TArgon2SSE2(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
+ : TArgon2<EInstructionSet::SSE2, mcost, threads>(atype, tcost, key, keylen)
+ {
+ }
+
+ protected:
+ virtual void XorBlock_(TBlock* dst, const TBlock* src) const override {
+ __m128i* mdst = (__m128i*)dst->V;
+ __m128i* msrc = (__m128i*)src->V;
+
+ for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i)
+ XorValues(mdst + i, msrc + i, mdst + i);
+ }
+
+ virtual void CopyBlock_(TBlock* dst, const TBlock* src) const override {
+ memcpy(dst->V, src->V, sizeof(ui64) * ARGON2_QWORDS_IN_BLOCK);
+ }
+
+ virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock, TBlock* nextBlock, bool withXor) const override {
+ __m128i blockxy[ARGON2_OWORDS_IN_BLOCK];
+ __m128i state[ARGON2_OWORDS_IN_BLOCK];
+
+ memcpy(state, prevBlock, ARGON2_BLOCK_SIZE);
+
+ if (withXor) {
+ for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
+ state[i] = _mm_xor_si128(
+ state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
+ blockxy[i] = _mm_xor_si128(
+ state[i], _mm_loadu_si128((const __m128i*)nextBlock->V + i));
+ }
+ } else {
+ for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
+ blockxy[i] = state[i] = _mm_xor_si128(
+ state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
+ }
+ }
+
+ for (ui32 i = 0; i < 8; ++i) {
+ BlamkaG1SSE2(
+ state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ BlamkaG2SSE2(
+ state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ DiagonalizeSSE2(
+ state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5],
+ state[8 * i + 6], state[8 * i + 7]);
+ BlamkaG1SSE2(
+ state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ BlamkaG2SSE2(
+ state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ UndiagonalizeSSE2(
+ state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5],
+ state[8 * i + 6], state[8 * i + 7]);
+ }
+
+ for (ui32 i = 0; i < 8; ++i) {
+ BlamkaG1SSE2(
+ state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
+ BlamkaG2SSE2(
+ state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
+ DiagonalizeSSE2(
+ state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i],
+ state[8 * 6 + i], state[8 * 7 + i]);
+ BlamkaG1SSE2(
+ state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
+ BlamkaG2SSE2(
+ state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
+ UndiagonalizeSSE2(
+ state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i],
+ state[8 * 6 + i], state[8 * 7 + i]);
+ }
+
+ for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
+ state[i] = _mm_xor_si128(state[i], blockxy[i]);
+ _mm_storeu_si128((__m128i*)nextBlock->V + i, state[i]);
+ }
+ }
+ };
+}
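
Note: both Blamka passes in the SSE2 FillBlock_ above walk the 64 __m128i words of a block as an 8x8 matrix: the first loop rounds each row (indices 8*i + 0..7), the second each column (indices 8*j + i for fixed i), analogous to BLAKE2's column/diagonal round structure. A tiny sketch printing that traversal order (illustrative only):

    #include <cstdio>

    int main() {
        for (int i = 0; i < 8; ++i) {      // row pass, as in the first loop
            std::printf("row %d:", i);
            for (int j = 0; j < 8; ++j)
                std::printf(" %2d", 8 * i + j);
            std::printf("\n");
        }
        for (int i = 0; i < 8; ++i) {      // column pass, as in the second loop
            std::printf("col %d:", i);
            for (int j = 0; j < 8; ++j)
                std::printf(" %2d", 8 * j + i);
            std::printf("\n");
        }
    }
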
diff --git a/library/cpp/digest/argonish/internal/argon2/argon2_sse41.h b/library/cpp/digest/argonish/internal/argon2/argon2_sse41.h
index 1ad35048ea..c9b01915de 100644
--- a/library/cpp/digest/argonish/internal/argon2/argon2_sse41.h
+++ b/library/cpp/digest/argonish/internal/argon2/argon2_sse41.h
@@ -1,101 +1,101 @@
-#pragma once
-
-#include <smmintrin.h>
-#include "argon2_base.h"
+#pragma once
+
+#include <smmintrin.h>
+#include "argon2_base.h"
#include <library/cpp/digest/argonish/internal/blamka/blamka_ssse3.h>
-
-namespace NArgonish {
- template <ui32 mcost, ui32 threads>
- class TArgon2SSE41 final: public TArgon2<EInstructionSet::SSE41, mcost, threads> {
- public:
- TArgon2SSE41(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
- : TArgon2<EInstructionSet::SSE41, mcost, threads>(atype, tcost, key, keylen)
- {
- }
-
- protected:
- virtual void XorBlock_(TBlock* dst, const TBlock* src) const override {
- __m128i* mdst = (__m128i*)dst->V;
- __m128i* msrc = (__m128i*)src->V;
-
- for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i)
- XorValues(mdst + i, msrc + i, mdst + i);
- }
-
- virtual void CopyBlock_(TBlock* dst, const TBlock* src) const override {
- memcpy(dst->V, src->V, sizeof(ui64) * ARGON2_QWORDS_IN_BLOCK);
- }
-
- virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock, TBlock* nextBlock, bool withXor) const override {
- __m128i blockxy[ARGON2_OWORDS_IN_BLOCK];
- __m128i state[ARGON2_OWORDS_IN_BLOCK];
-
- memcpy(state, prevBlock, ARGON2_BLOCK_SIZE);
-
- if (withXor) {
- for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
- state[i] = _mm_xor_si128(
- state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
- blockxy[i] = _mm_xor_si128(
- state[i], _mm_loadu_si128((const __m128i*)nextBlock->V + i));
- }
- } else {
- for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
- blockxy[i] = state[i] = _mm_xor_si128(
- state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
- }
- }
-
- for (ui32 i = 0; i < 8; ++i) {
- BlamkaG1SSSE3(
- state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- BlamkaG2SSSE3(
- state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- DiagonalizeSSSE3(
- state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5],
- state[8 * i + 6], state[8 * i + 7]);
- BlamkaG1SSSE3(
- state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- BlamkaG2SSSE3(
- state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- UndiagonalizeSSSE3(
- state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5],
- state[8 * i + 6], state[8 * i + 7]);
- }
-
- for (ui32 i = 0; i < 8; ++i) {
- BlamkaG1SSSE3(
- state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
- BlamkaG2SSSE3(
- state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
- DiagonalizeSSSE3(
- state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i],
- state[8 * 6 + i], state[8 * 7 + i]);
- BlamkaG1SSSE3(
- state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
- BlamkaG2SSSE3(
- state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
- UndiagonalizeSSSE3(
- state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i],
- state[8 * 6 + i], state[8 * 7 + i]);
- }
-
- for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
- state[i] = _mm_xor_si128(state[i], blockxy[i]);
- _mm_storeu_si128((__m128i*)nextBlock->V + i, state[i]);
- }
- }
- };
-}
+
+namespace NArgonish {
+ template <ui32 mcost, ui32 threads>
+ class TArgon2SSE41 final: public TArgon2<EInstructionSet::SSE41, mcost, threads> {
+ public:
+ TArgon2SSE41(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
+ : TArgon2<EInstructionSet::SSE41, mcost, threads>(atype, tcost, key, keylen)
+ {
+ }
+
+ protected:
+ virtual void XorBlock_(TBlock* dst, const TBlock* src) const override {
+ __m128i* mdst = (__m128i*)dst->V;
+ __m128i* msrc = (__m128i*)src->V;
+
+ for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i)
+ XorValues(mdst + i, msrc + i, mdst + i);
+ }
+
+ virtual void CopyBlock_(TBlock* dst, const TBlock* src) const override {
+ memcpy(dst->V, src->V, sizeof(ui64) * ARGON2_QWORDS_IN_BLOCK);
+ }
+
+ virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock, TBlock* nextBlock, bool withXor) const override {
+ __m128i blockxy[ARGON2_OWORDS_IN_BLOCK];
+ __m128i state[ARGON2_OWORDS_IN_BLOCK];
+
+ memcpy(state, prevBlock, ARGON2_BLOCK_SIZE);
+
+ if (withXor) {
+ for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
+ state[i] = _mm_xor_si128(
+ state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
+ blockxy[i] = _mm_xor_si128(
+ state[i], _mm_loadu_si128((const __m128i*)nextBlock->V + i));
+ }
+ } else {
+ for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
+ blockxy[i] = state[i] = _mm_xor_si128(
+ state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
+ }
+ }
+
+ for (ui32 i = 0; i < 8; ++i) {
+ BlamkaG1SSSE3(
+ state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ BlamkaG2SSSE3(
+ state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ DiagonalizeSSSE3(
+ state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5],
+ state[8 * i + 6], state[8 * i + 7]);
+ BlamkaG1SSSE3(
+ state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ BlamkaG2SSSE3(
+ state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ UndiagonalizeSSSE3(
+ state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5],
+ state[8 * i + 6], state[8 * i + 7]);
+ }
+
+ for (ui32 i = 0; i < 8; ++i) {
+ BlamkaG1SSSE3(
+ state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
+ BlamkaG2SSSE3(
+ state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
+ DiagonalizeSSSE3(
+ state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i],
+ state[8 * 6 + i], state[8 * 7 + i]);
+ BlamkaG1SSSE3(
+ state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
+ BlamkaG2SSSE3(
+ state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
+ UndiagonalizeSSSE3(
+ state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i],
+ state[8 * 6 + i], state[8 * 7 + i]);
+ }
+
+ for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
+ state[i] = _mm_xor_si128(state[i], blockxy[i]);
+ _mm_storeu_si128((__m128i*)nextBlock->V + i, state[i]);
+ }
+ }
+ };
+}
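
Note on the mixing step above: BlamkaG1/BlamkaG2 implement Argon2's BlaMka variant of the BLAKE2b G function, where each plain 64-bit addition is strengthened with a 32x32->64-bit multiply. A minimal scalar sketch of that primitive (the SIMD header above applies the same arithmetic to packed lanes):

    #include <cstdint>

    // BlaMka primitive: x + y is replaced by x + y + 2 * lo32(x) * lo32(y),
    // all arithmetic modulo 2^64.
    static inline uint64_t FBlaMka(uint64_t x, uint64_t y) {
        const uint64_t mask = 0xFFFFFFFFull;
        return x + y + 2 * ((x & mask) * (y & mask));
    }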
diff --git a/library/cpp/digest/argonish/internal/argon2/argon2_ssse3.h b/library/cpp/digest/argonish/internal/argon2/argon2_ssse3.h
index a25a416834..714197a90f 100644
--- a/library/cpp/digest/argonish/internal/argon2/argon2_ssse3.h
+++ b/library/cpp/digest/argonish/internal/argon2/argon2_ssse3.h
@@ -1,102 +1,102 @@
-#pragma once
-
-#include <emmintrin.h>
-#include <tmmintrin.h>
-#include "argon2_base.h"
+#pragma once
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include "argon2_base.h"
#include <library/cpp/digest/argonish/internal/blamka/blamka_ssse3.h>
-
-namespace NArgonish {
- template <ui32 mcost, ui32 threads>
- class TArgon2SSSE3 final: public TArgon2<EInstructionSet::SSSE3, mcost, threads> {
- public:
- TArgon2SSSE3(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
- : TArgon2<EInstructionSet::SSSE3, mcost, threads>(atype, tcost, key, keylen)
- {
- }
-
- protected:
- virtual void XorBlock_(TBlock* dst, const TBlock* src) const override {
- __m128i* mdst = (__m128i*)dst->V;
- __m128i* msrc = (__m128i*)src->V;
-
- for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i)
- XorValues(mdst + i, msrc + i, mdst + i);
- }
-
- virtual void CopyBlock_(TBlock* dst, const TBlock* src) const override {
- memcpy(dst->V, src->V, sizeof(ui64) * ARGON2_QWORDS_IN_BLOCK);
- }
-
- virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock, TBlock* nextBlock, bool withXor) const override {
- __m128i blockxy[ARGON2_OWORDS_IN_BLOCK];
- __m128i state[ARGON2_OWORDS_IN_BLOCK];
-
- memcpy(state, prevBlock, ARGON2_BLOCK_SIZE);
-
- if (withXor) {
- for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
- state[i] = _mm_xor_si128(
- state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
- blockxy[i] = _mm_xor_si128(
- state[i], _mm_loadu_si128((const __m128i*)nextBlock->V + i));
- }
- } else {
- for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
- blockxy[i] = state[i] = _mm_xor_si128(
- state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
- }
- }
-
- for (ui32 i = 0; i < 8; ++i) {
- BlamkaG1SSSE3(
- state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- BlamkaG2SSSE3(
- state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- DiagonalizeSSSE3(
- state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5],
- state[8 * i + 6], state[8 * i + 7]);
- BlamkaG1SSSE3(
- state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- BlamkaG2SSSE3(
- state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
- UndiagonalizeSSSE3(
- state[8 * i + 2], state[8 * i + 3],
- state[8 * i + 4], state[8 * i + 5],
- state[8 * i + 6], state[8 * i + 7]);
- }
-
- for (ui32 i = 0; i < 8; ++i) {
- BlamkaG1SSSE3(
- state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
- BlamkaG2SSSE3(
- state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
- DiagonalizeSSSE3(
- state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i],
- state[8 * 6 + i], state[8 * 7 + i]);
- BlamkaG1SSSE3(
- state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
- BlamkaG2SSSE3(
- state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
- UndiagonalizeSSSE3(
- state[8 * 2 + i], state[8 * 3 + i],
- state[8 * 4 + i], state[8 * 5 + i],
- state[8 * 6 + i], state[8 * 7 + i]);
- }
-
- for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
- state[i] = _mm_xor_si128(state[i], blockxy[i]);
- _mm_storeu_si128((__m128i*)nextBlock->V + i, state[i]);
- }
- }
- };
-}
+
+namespace NArgonish {
+ template <ui32 mcost, ui32 threads>
+ class TArgon2SSSE3 final: public TArgon2<EInstructionSet::SSSE3, mcost, threads> {
+ public:
+ TArgon2SSSE3(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
+ : TArgon2<EInstructionSet::SSSE3, mcost, threads>(atype, tcost, key, keylen)
+ {
+ }
+
+ protected:
+ virtual void XorBlock_(TBlock* dst, const TBlock* src) const override {
+ __m128i* mdst = (__m128i*)dst->V;
+ __m128i* msrc = (__m128i*)src->V;
+
+ for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i)
+ XorValues(mdst + i, msrc + i, mdst + i);
+ }
+
+ virtual void CopyBlock_(TBlock* dst, const TBlock* src) const override {
+ memcpy(dst->V, src->V, sizeof(ui64) * ARGON2_QWORDS_IN_BLOCK);
+ }
+
+ virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock, TBlock* nextBlock, bool withXor) const override {
+ __m128i blockxy[ARGON2_OWORDS_IN_BLOCK];
+ __m128i state[ARGON2_OWORDS_IN_BLOCK];
+
+ memcpy(state, prevBlock, ARGON2_BLOCK_SIZE);
+
+ if (withXor) {
+ for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
+ state[i] = _mm_xor_si128(
+ state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
+ blockxy[i] = _mm_xor_si128(
+ state[i], _mm_loadu_si128((const __m128i*)nextBlock->V + i));
+ }
+ } else {
+ for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
+ blockxy[i] = state[i] = _mm_xor_si128(
+ state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
+ }
+ }
+
+ for (ui32 i = 0; i < 8; ++i) {
+ BlamkaG1SSSE3(
+ state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ BlamkaG2SSSE3(
+ state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ DiagonalizeSSSE3(
+ state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5],
+ state[8 * i + 6], state[8 * i + 7]);
+ BlamkaG1SSSE3(
+ state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ BlamkaG2SSSE3(
+ state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+ UndiagonalizeSSSE3(
+ state[8 * i + 2], state[8 * i + 3],
+ state[8 * i + 4], state[8 * i + 5],
+ state[8 * i + 6], state[8 * i + 7]);
+ }
+
+ for (ui32 i = 0; i < 8; ++i) {
+ BlamkaG1SSSE3(
+ state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
+ BlamkaG2SSSE3(
+ state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
+ DiagonalizeSSSE3(
+ state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i],
+ state[8 * 6 + i], state[8 * 7 + i]);
+ BlamkaG1SSSE3(
+ state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
+ BlamkaG2SSSE3(
+ state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
+ UndiagonalizeSSSE3(
+ state[8 * 2 + i], state[8 * 3 + i],
+ state[8 * 4 + i], state[8 * 5 + i],
+ state[8 * 6 + i], state[8 * 7 + i]);
+ }
+
+ for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
+ state[i] = _mm_xor_si128(state[i], blockxy[i]);
+ _mm_storeu_si128((__m128i*)nextBlock->V + i, state[i]);
+ }
+ }
+ };
+}
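
The two loops in FillBlock_ treat the 1 KiB block as an 8x8 matrix of 16-byte lanes: the first pass permutes rows (lanes 8*i + j), the second permutes columns (lanes 8*j + i). An index-only sketch of that traversal, with ApplyP as a hypothetical stand-in for one BlamkaG1/G2 plus (un)diagonalize round:

    // Access pattern of the two permutation passes in FillBlock_.
    void ApplyPermutationPasses(void (*ApplyP)(int lanes[8])) {
        int lanes[8];
        for (int i = 0; i < 8; ++i) {    // pass 1: rows, consecutive lanes
            for (int j = 0; j < 8; ++j)
                lanes[j] = 8 * i + j;
            ApplyP(lanes);
        }
        for (int i = 0; i < 8; ++i) {    // pass 2: columns, stride-8 lanes
            for (int j = 0; j < 8; ++j)
                lanes[j] = 8 * j + i;
            ApplyP(lanes);
        }
    }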
diff --git a/library/cpp/digest/argonish/internal/argon2/ya.make b/library/cpp/digest/argonish/internal/argon2/ya.make
index 85459865ba..10002edb17 100644
--- a/library/cpp/digest/argonish/internal/argon2/ya.make
+++ b/library/cpp/digest/argonish/internal/argon2/ya.make
@@ -1,10 +1,10 @@
-LIBRARY()
-
-OWNER(e-sidorov)
-
-PEERDIR(
+LIBRARY()
+
+OWNER(e-sidorov)
+
+PEERDIR(
library/cpp/digest/argonish/internal/blamka
library/cpp/digest/argonish/internal/blake2b
-)
-
-END()
+)
+
+END()
diff --git a/library/cpp/digest/argonish/internal/blake2b/blake2b.h b/library/cpp/digest/argonish/internal/blake2b/blake2b.h
index 3dcfc3fc48..4dc696c972 100644
--- a/library/cpp/digest/argonish/internal/blake2b/blake2b.h
+++ b/library/cpp/digest/argonish/internal/blake2b/blake2b.h
@@ -1,187 +1,187 @@
-#pragma once
-
-#include <util/generic/yexception.h>
-#include <util/system/compiler.h>
+#pragma once
+
+#include <util/generic/yexception.h>
+#include <util/system/compiler.h>
#include <library/cpp/digest/argonish/blake2b.h>
-
-namespace NArgonish {
- const ui32 BLAKE2B_BLOCKBYTES = 128;
- const ui32 BLAKE2B_BLOCKQWORDS = BLAKE2B_BLOCKBYTES / 8;
- const ui32 BLAKE2B_OUTBYTES = 64;
- const ui32 BLAKE2B_KEYBYTES = 64;
- const ui32 BLAKE2B_SALTBYTES = 16;
- const ui32 BLAKE2B_PERSONALBYTES = 16;
-
- template <NArgonish::EInstructionSet instructionSet>
- class TBlake2B final: public IBlake2Base {
- public:
- virtual ~TBlake2B<instructionSet>() {
- SecureZeroMemory_((void*)&State_, sizeof(State_));
- SecureZeroMemory_((void*)&Param_, sizeof(Param_));
- }
-
- EInstructionSet GetInstructionSet() {
- return instructionSet;
- }
-
- protected:
- struct TBlake2BState {
- ui64 H[8];
- ui64 T[2];
- ui64 F[2];
- ui64 Buf[BLAKE2B_BLOCKQWORDS];
- size_t BufLen;
- size_t OutLen;
- ui8 LastNode;
- };
-
+
+namespace NArgonish {
+ const ui32 BLAKE2B_BLOCKBYTES = 128;
+ const ui32 BLAKE2B_BLOCKQWORDS = BLAKE2B_BLOCKBYTES / 8;
+ const ui32 BLAKE2B_OUTBYTES = 64;
+ const ui32 BLAKE2B_KEYBYTES = 64;
+ const ui32 BLAKE2B_SALTBYTES = 16;
+ const ui32 BLAKE2B_PERSONALBYTES = 16;
+
+ template <NArgonish::EInstructionSet instructionSet>
+ class TBlake2B final: public IBlake2Base {
+ public:
+ virtual ~TBlake2B<instructionSet>() {
+ SecureZeroMemory_((void*)&State_, sizeof(State_));
+ SecureZeroMemory_((void*)&Param_, sizeof(Param_));
+ }
+
+ EInstructionSet GetInstructionSet() {
+ return instructionSet;
+ }
+
+ protected:
+ struct TBlake2BState {
+ ui64 H[8];
+ ui64 T[2];
+ ui64 F[2];
+ ui64 Buf[BLAKE2B_BLOCKQWORDS];
+ size_t BufLen;
+ size_t OutLen;
+ ui8 LastNode;
+ };
+
struct TBlake2BParam {
- ui8 DigestLen; /* 1 */
- ui8 KeyLen; /* 2 */
- ui8 Fanout; /* 3 */
- ui8 Depth; /* 4 */
- ui32 LeafLength; /* 8 */
- ui32 NodeOffset; /* 12 */
- ui32 XofLength; /* 16 */
- ui8 NodeDepth; /* 17 */
- ui8 InnerLength; /* 18 */
- ui8 Reserved[14]; /* 32 */
- ui8 Salt[BLAKE2B_SALTBYTES]; /* 48 */
- ui8 Personal[BLAKE2B_PERSONALBYTES]; /* 64 */
+ ui8 DigestLen; /* 1 */
+ ui8 KeyLen; /* 2 */
+ ui8 Fanout; /* 3 */
+ ui8 Depth; /* 4 */
+ ui32 LeafLength; /* 8 */
+ ui32 NodeOffset; /* 12 */
+ ui32 XofLength; /* 16 */
+ ui8 NodeDepth; /* 17 */
+ ui8 InnerLength; /* 18 */
+ ui8 Reserved[14]; /* 32 */
+ ui8 Salt[BLAKE2B_SALTBYTES]; /* 48 */
+ ui8 Personal[BLAKE2B_PERSONALBYTES]; /* 64 */
} Y_PACKED;
-
- TBlake2BState State_;
- TBlake2BParam Param_;
-
- protected:
- void Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]);
- void InitialXor_(ui8* h, const ui8* p);
- void* GetIV_() const;
-
- static void SecureZeroMemory_(void* src, size_t len) {
- static void* (*const volatile memsetv)(void*, int, size_t) = &memset;
- memsetv(src, 0, len);
- }
-
- void InitParam_() {
- memset(&State_, 0, sizeof(State_));
- InitialXor_((ui8*)(State_.H), (const ui8*)(&Param_));
- State_.OutLen = Param_.DigestLen;
- }
-
- void IncrementCounter_(const ui64 inc) {
- State_.T[0] += inc;
- State_.T[1] += (State_.T[0] < inc) ? 1 : 0;
- }
-
- bool IsLastBlock_() {
- return State_.F[0] != 0;
- }
-
- void SetLastNode_() {
- State_.F[1] = (ui64)-1;
- }
-
- void SetLastBlock_() {
- if (State_.LastNode)
- SetLastNode_();
-
- State_.F[0] = (ui64)-1;
- }
-
- public:
- TBlake2B(size_t outlen) {
- /*
- * Note that outlen check was moved to proxy class
- */
-
- Param_.DigestLen = (ui8)outlen;
- Param_.KeyLen = 0;
- Param_.Fanout = 1;
- Param_.Depth = 1;
- Param_.LeafLength = 0;
- Param_.NodeOffset = 0;
- Param_.XofLength = 0;
- Param_.NodeDepth = 0;
- Param_.InnerLength = 0;
-
- memset(Param_.Reserved, 0, sizeof(Param_.Reserved));
- memset(Param_.Salt, 0, sizeof(Param_.Salt));
- memset(Param_.Personal, 0, sizeof(Param_.Personal));
-
- InitParam_();
- }
-
- TBlake2B(size_t outlen, const void* key, size_t keylen) {
- /**
- * Note that key and outlen checks were moved to proxy classes
- */
- Param_.DigestLen = (ui8)outlen;
- Param_.KeyLen = (ui8)keylen;
- Param_.Fanout = 1;
- Param_.Depth = 1;
-
- Param_.LeafLength = 0;
- Param_.NodeOffset = 0;
- Param_.XofLength = 0;
- Param_.NodeDepth = 0;
- Param_.InnerLength = 0;
-
- memset(Param_.Reserved, 0, sizeof(Param_.Reserved));
- memset(Param_.Salt, 0, sizeof(Param_.Salt));
- memset(Param_.Personal, 0, sizeof(Param_.Personal));
-
- InitParam_();
- ui8 block[BLAKE2B_BLOCKBYTES] = {0};
- memcpy(block, key, keylen);
- Update(block, BLAKE2B_BLOCKBYTES);
- SecureZeroMemory_(block, BLAKE2B_BLOCKBYTES);
- }
-
- void Update(ui32 in) override {
- Update((const void*)&in, sizeof(in));
- }
-
- void Update(const void* pin, size_t inlen) override {
- const ui8* in = (ui8*)pin;
- if (inlen > 0) {
- size_t left = State_.BufLen;
- size_t fill = BLAKE2B_BLOCKBYTES - left;
- if (inlen > fill) {
- State_.BufLen = 0;
- memcpy((ui8*)State_.Buf + left, in, fill); /* Fill buffer */
- IncrementCounter_(BLAKE2B_BLOCKBYTES);
- Compress_(State_.Buf); /* Compress */
- in += fill;
- inlen -= fill;
- while (inlen > BLAKE2B_BLOCKBYTES) {
- /* to fix ubsan's unaligned report */
- ui64 tmpbuf[BLAKE2B_BLOCKQWORDS];
- memcpy(tmpbuf, in, BLAKE2B_BLOCKBYTES);
-
- IncrementCounter_(BLAKE2B_BLOCKBYTES);
- Compress_(tmpbuf);
- in += BLAKE2B_BLOCKBYTES;
- inlen -= BLAKE2B_BLOCKBYTES;
- }
- }
- memcpy((ui8*)State_.Buf + State_.BufLen, in, inlen);
- State_.BufLen += inlen;
- }
- }
-
- void Final(void* out, size_t outlen) override {
- if (out == nullptr || outlen < State_.OutLen)
-                ythrow yexception() << "out is null or outlen is too small";
-
- if (IsLastBlock_())
- ythrow yexception() << "Final can't be called several times";
-
- IncrementCounter_(State_.BufLen);
- SetLastBlock_();
- memset((ui8*)State_.Buf + State_.BufLen, 0, BLAKE2B_BLOCKBYTES - State_.BufLen);
- Compress_(State_.Buf);
- memcpy(out, (void*)&State_.H[0], outlen);
- }
- };
-}
+
+ TBlake2BState State_;
+ TBlake2BParam Param_;
+
+ protected:
+ void Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]);
+ void InitialXor_(ui8* h, const ui8* p);
+ void* GetIV_() const;
+
+ static void SecureZeroMemory_(void* src, size_t len) {
+ static void* (*const volatile memsetv)(void*, int, size_t) = &memset;
+ memsetv(src, 0, len);
+ }
+
+ void InitParam_() {
+ memset(&State_, 0, sizeof(State_));
+ InitialXor_((ui8*)(State_.H), (const ui8*)(&Param_));
+ State_.OutLen = Param_.DigestLen;
+ }
+
+ void IncrementCounter_(const ui64 inc) {
+ State_.T[0] += inc;
+ State_.T[1] += (State_.T[0] < inc) ? 1 : 0;
+ }
+
+ bool IsLastBlock_() {
+ return State_.F[0] != 0;
+ }
+
+ void SetLastNode_() {
+ State_.F[1] = (ui64)-1;
+ }
+
+ void SetLastBlock_() {
+ if (State_.LastNode)
+ SetLastNode_();
+
+ State_.F[0] = (ui64)-1;
+ }
+
+ public:
+ TBlake2B(size_t outlen) {
+ /*
+ * Note that outlen check was moved to proxy class
+ */
+
+ Param_.DigestLen = (ui8)outlen;
+ Param_.KeyLen = 0;
+ Param_.Fanout = 1;
+ Param_.Depth = 1;
+ Param_.LeafLength = 0;
+ Param_.NodeOffset = 0;
+ Param_.XofLength = 0;
+ Param_.NodeDepth = 0;
+ Param_.InnerLength = 0;
+
+ memset(Param_.Reserved, 0, sizeof(Param_.Reserved));
+ memset(Param_.Salt, 0, sizeof(Param_.Salt));
+ memset(Param_.Personal, 0, sizeof(Param_.Personal));
+
+ InitParam_();
+ }
+
+ TBlake2B(size_t outlen, const void* key, size_t keylen) {
+ /**
+ * Note that key and outlen checks were moved to proxy classes
+ */
+ Param_.DigestLen = (ui8)outlen;
+ Param_.KeyLen = (ui8)keylen;
+ Param_.Fanout = 1;
+ Param_.Depth = 1;
+
+ Param_.LeafLength = 0;
+ Param_.NodeOffset = 0;
+ Param_.XofLength = 0;
+ Param_.NodeDepth = 0;
+ Param_.InnerLength = 0;
+
+ memset(Param_.Reserved, 0, sizeof(Param_.Reserved));
+ memset(Param_.Salt, 0, sizeof(Param_.Salt));
+ memset(Param_.Personal, 0, sizeof(Param_.Personal));
+
+ InitParam_();
+ ui8 block[BLAKE2B_BLOCKBYTES] = {0};
+ memcpy(block, key, keylen);
+ Update(block, BLAKE2B_BLOCKBYTES);
+ SecureZeroMemory_(block, BLAKE2B_BLOCKBYTES);
+ }
+
+ void Update(ui32 in) override {
+ Update((const void*)&in, sizeof(in));
+ }
+
+ void Update(const void* pin, size_t inlen) override {
+ const ui8* in = (ui8*)pin;
+ if (inlen > 0) {
+ size_t left = State_.BufLen;
+ size_t fill = BLAKE2B_BLOCKBYTES - left;
+ if (inlen > fill) {
+ State_.BufLen = 0;
+ memcpy((ui8*)State_.Buf + left, in, fill); /* Fill buffer */
+ IncrementCounter_(BLAKE2B_BLOCKBYTES);
+ Compress_(State_.Buf); /* Compress */
+ in += fill;
+ inlen -= fill;
+ while (inlen > BLAKE2B_BLOCKBYTES) {
+ /* to fix ubsan's unaligned report */
+ ui64 tmpbuf[BLAKE2B_BLOCKQWORDS];
+ memcpy(tmpbuf, in, BLAKE2B_BLOCKBYTES);
+
+ IncrementCounter_(BLAKE2B_BLOCKBYTES);
+ Compress_(tmpbuf);
+ in += BLAKE2B_BLOCKBYTES;
+ inlen -= BLAKE2B_BLOCKBYTES;
+ }
+ }
+ memcpy((ui8*)State_.Buf + State_.BufLen, in, inlen);
+ State_.BufLen += inlen;
+ }
+ }
+
+ void Final(void* out, size_t outlen) override {
+ if (out == nullptr || outlen < State_.OutLen)
+                ythrow yexception() << "out is null or outlen is too small";
+
+ if (IsLastBlock_())
+ ythrow yexception() << "Final can't be called several times";
+
+ IncrementCounter_(State_.BufLen);
+ SetLastBlock_();
+ memset((ui8*)State_.Buf + State_.BufLen, 0, BLAKE2B_BLOCKBYTES - State_.BufLen);
+ Compress_(State_.Buf);
+ memcpy(out, (void*)&State_.H[0], outlen);
+ }
+ };
+}
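
IncrementCounter_ above maintains BLAKE2b's 128-bit byte counter split across T[0] (low word) and T[1] (high word); a carry is detected by the low word wrapping past the addend. The same logic as a self-contained sketch:

    #include <cstdint>

    // 128-bit counter: if Lo ends up below the addend, it wrapped around,
    // so the overflow is carried into Hi.
    struct TCounterSketch {
        uint64_t Lo = 0;
        uint64_t Hi = 0;
        void Add(uint64_t inc) {
            Lo += inc;
            Hi += (Lo < inc) ? 1 : 0;
        }
    };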
diff --git a/library/cpp/digest/argonish/internal/blake2b/blake2b_avx2.h b/library/cpp/digest/argonish/internal/blake2b/blake2b_avx2.h
index 359ca90ebb..76eec8cd5a 100644
--- a/library/cpp/digest/argonish/internal/blake2b/blake2b_avx2.h
+++ b/library/cpp/digest/argonish/internal/blake2b/blake2b_avx2.h
@@ -1,104 +1,104 @@
-#pragma once
-
-#include <immintrin.h>
-#include "blake2b.h"
+#pragma once
+
+#include <immintrin.h>
+#include "blake2b.h"
#include <library/cpp/digest/argonish/internal/rotations/rotations_avx2.h>
-
-namespace NArgonish {
- template <>
- void* TBlake2B<EInstructionSet::AVX2>::GetIV_() const {
- static const __m256i Iv[2] = {
- _mm256_set_epi64x(0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL, 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL),
- _mm256_set_epi64x(0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL, 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL)};
- return (void*)Iv;
- }
-
- template <>
- void TBlake2B<EInstructionSet::AVX2>::InitialXor_(ui8* h, const ui8* p) {
- __m256i* iv = (__m256i*)GetIV_();
- __m256i* m_res = (__m256i*)h;
- const __m256i* m_second = (__m256i*)p;
- _mm256_storeu_si256(m_res, _mm256_xor_si256(iv[0], _mm256_loadu_si256(m_second)));
- _mm256_storeu_si256(m_res + 1, _mm256_xor_si256(iv[1], _mm256_loadu_si256(m_second + 1)));
- }
-
- /*
- * a = v0, v1, v2, v3
- * b = v4, v5, v6, v7
- * c = v8, v9, v10, v11
- * d = v12, v13, v14, v15
- */
- static inline void G1AVX2(ui32 r, __m256i& a, __m256i& b, __m256i& c, __m256i& d, const ui64* blk, const __m128i vindex[12][4]) {
- a = _mm256_add_epi64(a, _mm256_add_epi64(b, _mm256_i32gather_epi64((const long long int*)blk, vindex[r][0], 8)));
- d = Rotr32(_mm256_xor_si256(a, d));
- c = _mm256_add_epi64(c, d);
- b = Rotr24(_mm256_xor_si256(b, c));
-
- a = _mm256_add_epi64(a, _mm256_add_epi64(b, _mm256_i32gather_epi64((const long long int*)blk, vindex[r][1], 8)));
- d = Rotr16(_mm256_xor_si256(a, d));
- c = _mm256_add_epi64(c, d);
- b = Rotr63(_mm256_xor_si256(b, c));
- }
-
- static inline void G2AVX2(ui32 r, __m256i& a, __m256i& b, __m256i& c, __m256i& d, const ui64* blk, const __m128i vindex[12][4]) {
- a = _mm256_add_epi64(a, _mm256_add_epi64(b, _mm256_i32gather_epi64((const long long int*)blk, vindex[r][2], 8)));
- d = Rotr32(_mm256_xor_si256(a, d));
- c = _mm256_add_epi64(c, d);
- b = Rotr24(_mm256_xor_si256(b, c));
-
- a = _mm256_add_epi64(a, _mm256_add_epi64(b, _mm256_i32gather_epi64((const long long int*)blk, vindex[r][3], 8)));
- d = Rotr16(_mm256_xor_si256(a, d));
- c = _mm256_add_epi64(c, d);
- b = Rotr63(_mm256_xor_si256(b, c));
- }
-
- static inline void Diagonalize(__m256i& b, __m256i& c, __m256i& d) {
- b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1));
- c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2));
- d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3));
- }
-
- static inline void Undiagonalize(__m256i& b, __m256i& c, __m256i& d) {
- b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3));
- c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2));
- d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1));
- }
-
- template <>
- void TBlake2B<EInstructionSet::AVX2>::Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]) {
- static const __m128i VIndex[12][4] = {
- {_mm_set_epi32(6, 4, 2, 0), _mm_set_epi32(7, 5, 3, 1), _mm_set_epi32(14, 12, 10, 8), _mm_set_epi32(15, 13, 11, 9)},
- {_mm_set_epi32(13, 9, 4, 14), _mm_set_epi32(6, 15, 8, 10), _mm_set_epi32(5, 11, 0, 1), _mm_set_epi32(3, 7, 2, 12)},
- {_mm_set_epi32(15, 5, 12, 11), _mm_set_epi32(13, 2, 0, 8), _mm_set_epi32(9, 7, 3, 10), _mm_set_epi32(4, 1, 6, 14)},
- {_mm_set_epi32(11, 13, 3, 7), _mm_set_epi32(14, 12, 1, 9), _mm_set_epi32(15, 4, 5, 2), _mm_set_epi32(8, 0, 10, 6)},
- {_mm_set_epi32(10, 2, 5, 9), _mm_set_epi32(15, 4, 7, 0), _mm_set_epi32(3, 6, 11, 14), _mm_set_epi32(13, 8, 12, 1)},
- {_mm_set_epi32(8, 0, 6, 2), _mm_set_epi32(3, 11, 10, 12), _mm_set_epi32(1, 15, 7, 4), _mm_set_epi32(9, 14, 5, 13)},
- {_mm_set_epi32(4, 14, 1, 12), _mm_set_epi32(10, 13, 15, 5), _mm_set_epi32(8, 9, 6, 0), _mm_set_epi32(11, 2, 3, 7)},
- {_mm_set_epi32(3, 12, 7, 13), _mm_set_epi32(9, 1, 14, 11), _mm_set_epi32(2, 8, 15, 5), _mm_set_epi32(10, 6, 4, 0)},
- {_mm_set_epi32(0, 11, 14, 6), _mm_set_epi32(8, 3, 9, 15), _mm_set_epi32(10, 1, 13, 12), _mm_set_epi32(5, 4, 7, 2)},
- {_mm_set_epi32(1, 7, 8, 10), _mm_set_epi32(5, 6, 4, 2), _mm_set_epi32(13, 3, 9, 15), _mm_set_epi32(0, 12, 14, 11)},
- {_mm_set_epi32(6, 4, 2, 0), _mm_set_epi32(7, 5, 3, 1), _mm_set_epi32(14, 12, 10, 8), _mm_set_epi32(15, 13, 11, 9)},
- {_mm_set_epi32(13, 9, 4, 14), _mm_set_epi32(6, 15, 8, 10), _mm_set_epi32(5, 11, 0, 1), _mm_set_epi32(3, 7, 2, 12)},
- };
-
- __m256i* iv = (__m256i*)GetIV_();
- __m256i a = _mm256_loadu_si256((__m256i*)&State_.H[0]);
- __m256i b = _mm256_loadu_si256((__m256i*)&State_.H[4]);
- __m256i c = iv[0];
- __m256i d = _mm256_xor_si256(iv[1], _mm256_loadu_si256((__m256i*)&State_.T[0]));
-
- for (ui32 r = 0; r < 12; ++r) {
- G1AVX2(r, a, b, c, d, block, VIndex);
- Diagonalize(b, c, d);
- G2AVX2(r, a, b, c, d, block, VIndex);
- Undiagonalize(b, c, d);
- }
-
- _mm256_storeu_si256((__m256i*)State_.H, _mm256_xor_si256(
- _mm256_loadu_si256((__m256i*)State_.H),
- _mm256_xor_si256(a, c)));
- _mm256_storeu_si256(((__m256i*)State_.H) + 1, _mm256_xor_si256(
- _mm256_loadu_si256(((__m256i*)State_.H) + 1),
- _mm256_xor_si256(b, d)));
- }
-}
+
+namespace NArgonish {
+ template <>
+ void* TBlake2B<EInstructionSet::AVX2>::GetIV_() const {
+ static const __m256i Iv[2] = {
+ _mm256_set_epi64x(0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL, 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL),
+ _mm256_set_epi64x(0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL, 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL)};
+ return (void*)Iv;
+ }
+
+ template <>
+ void TBlake2B<EInstructionSet::AVX2>::InitialXor_(ui8* h, const ui8* p) {
+ __m256i* iv = (__m256i*)GetIV_();
+ __m256i* m_res = (__m256i*)h;
+ const __m256i* m_second = (__m256i*)p;
+ _mm256_storeu_si256(m_res, _mm256_xor_si256(iv[0], _mm256_loadu_si256(m_second)));
+ _mm256_storeu_si256(m_res + 1, _mm256_xor_si256(iv[1], _mm256_loadu_si256(m_second + 1)));
+ }
+
+ /*
+ * a = v0, v1, v2, v3
+ * b = v4, v5, v6, v7
+ * c = v8, v9, v10, v11
+ * d = v12, v13, v14, v15
+ */
+ static inline void G1AVX2(ui32 r, __m256i& a, __m256i& b, __m256i& c, __m256i& d, const ui64* blk, const __m128i vindex[12][4]) {
+ a = _mm256_add_epi64(a, _mm256_add_epi64(b, _mm256_i32gather_epi64((const long long int*)blk, vindex[r][0], 8)));
+ d = Rotr32(_mm256_xor_si256(a, d));
+ c = _mm256_add_epi64(c, d);
+ b = Rotr24(_mm256_xor_si256(b, c));
+
+ a = _mm256_add_epi64(a, _mm256_add_epi64(b, _mm256_i32gather_epi64((const long long int*)blk, vindex[r][1], 8)));
+ d = Rotr16(_mm256_xor_si256(a, d));
+ c = _mm256_add_epi64(c, d);
+ b = Rotr63(_mm256_xor_si256(b, c));
+ }
+
+ static inline void G2AVX2(ui32 r, __m256i& a, __m256i& b, __m256i& c, __m256i& d, const ui64* blk, const __m128i vindex[12][4]) {
+ a = _mm256_add_epi64(a, _mm256_add_epi64(b, _mm256_i32gather_epi64((const long long int*)blk, vindex[r][2], 8)));
+ d = Rotr32(_mm256_xor_si256(a, d));
+ c = _mm256_add_epi64(c, d);
+ b = Rotr24(_mm256_xor_si256(b, c));
+
+ a = _mm256_add_epi64(a, _mm256_add_epi64(b, _mm256_i32gather_epi64((const long long int*)blk, vindex[r][3], 8)));
+ d = Rotr16(_mm256_xor_si256(a, d));
+ c = _mm256_add_epi64(c, d);
+ b = Rotr63(_mm256_xor_si256(b, c));
+ }
+
+ static inline void Diagonalize(__m256i& b, __m256i& c, __m256i& d) {
+ b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1));
+ c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2));
+ d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3));
+ }
+
+ static inline void Undiagonalize(__m256i& b, __m256i& c, __m256i& d) {
+ b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3));
+ c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2));
+ d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1));
+ }
+
+ template <>
+ void TBlake2B<EInstructionSet::AVX2>::Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]) {
+ static const __m128i VIndex[12][4] = {
+ {_mm_set_epi32(6, 4, 2, 0), _mm_set_epi32(7, 5, 3, 1), _mm_set_epi32(14, 12, 10, 8), _mm_set_epi32(15, 13, 11, 9)},
+ {_mm_set_epi32(13, 9, 4, 14), _mm_set_epi32(6, 15, 8, 10), _mm_set_epi32(5, 11, 0, 1), _mm_set_epi32(3, 7, 2, 12)},
+ {_mm_set_epi32(15, 5, 12, 11), _mm_set_epi32(13, 2, 0, 8), _mm_set_epi32(9, 7, 3, 10), _mm_set_epi32(4, 1, 6, 14)},
+ {_mm_set_epi32(11, 13, 3, 7), _mm_set_epi32(14, 12, 1, 9), _mm_set_epi32(15, 4, 5, 2), _mm_set_epi32(8, 0, 10, 6)},
+ {_mm_set_epi32(10, 2, 5, 9), _mm_set_epi32(15, 4, 7, 0), _mm_set_epi32(3, 6, 11, 14), _mm_set_epi32(13, 8, 12, 1)},
+ {_mm_set_epi32(8, 0, 6, 2), _mm_set_epi32(3, 11, 10, 12), _mm_set_epi32(1, 15, 7, 4), _mm_set_epi32(9, 14, 5, 13)},
+ {_mm_set_epi32(4, 14, 1, 12), _mm_set_epi32(10, 13, 15, 5), _mm_set_epi32(8, 9, 6, 0), _mm_set_epi32(11, 2, 3, 7)},
+ {_mm_set_epi32(3, 12, 7, 13), _mm_set_epi32(9, 1, 14, 11), _mm_set_epi32(2, 8, 15, 5), _mm_set_epi32(10, 6, 4, 0)},
+ {_mm_set_epi32(0, 11, 14, 6), _mm_set_epi32(8, 3, 9, 15), _mm_set_epi32(10, 1, 13, 12), _mm_set_epi32(5, 4, 7, 2)},
+ {_mm_set_epi32(1, 7, 8, 10), _mm_set_epi32(5, 6, 4, 2), _mm_set_epi32(13, 3, 9, 15), _mm_set_epi32(0, 12, 14, 11)},
+ {_mm_set_epi32(6, 4, 2, 0), _mm_set_epi32(7, 5, 3, 1), _mm_set_epi32(14, 12, 10, 8), _mm_set_epi32(15, 13, 11, 9)},
+ {_mm_set_epi32(13, 9, 4, 14), _mm_set_epi32(6, 15, 8, 10), _mm_set_epi32(5, 11, 0, 1), _mm_set_epi32(3, 7, 2, 12)},
+ };
+
+ __m256i* iv = (__m256i*)GetIV_();
+ __m256i a = _mm256_loadu_si256((__m256i*)&State_.H[0]);
+ __m256i b = _mm256_loadu_si256((__m256i*)&State_.H[4]);
+ __m256i c = iv[0];
+ __m256i d = _mm256_xor_si256(iv[1], _mm256_loadu_si256((__m256i*)&State_.T[0]));
+
+ for (ui32 r = 0; r < 12; ++r) {
+ G1AVX2(r, a, b, c, d, block, VIndex);
+ Diagonalize(b, c, d);
+ G2AVX2(r, a, b, c, d, block, VIndex);
+ Undiagonalize(b, c, d);
+ }
+
+ _mm256_storeu_si256((__m256i*)State_.H, _mm256_xor_si256(
+ _mm256_loadu_si256((__m256i*)State_.H),
+ _mm256_xor_si256(a, c)));
+ _mm256_storeu_si256(((__m256i*)State_.H) + 1, _mm256_xor_si256(
+ _mm256_loadu_si256(((__m256i*)State_.H) + 1),
+ _mm256_xor_si256(b, d)));
+ }
+}
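
The AVX2 compression avoids per-round message shuffles: each G step fetches its four message words with _mm256_i32gather_epi64, driven by the precomputed VIndex tables (rows of the BLAKE2b sigma schedule split into groups of four). A scalar model of one gather call, assuming idx holds the four indices that the corresponding __m128i vindex packs:

    #include <cstdint>

    // Scalar model of _mm256_i32gather_epi64(blk, vindex, 8): pick four
    // 64-bit message words at sigma-derived indices (scale 8 = sizeof(ui64)).
    static inline void GatherQwords(const uint64_t* blk, const int idx[4],
                                    uint64_t out[4]) {
        for (int k = 0; k < 4; ++k)
            out[k] = blk[idx[k]];
    }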
diff --git a/library/cpp/digest/argonish/internal/blake2b/blake2b_ref.h b/library/cpp/digest/argonish/internal/blake2b/blake2b_ref.h
index ef98ed8fc8..1a2306f4a0 100644
--- a/library/cpp/digest/argonish/internal/blake2b/blake2b_ref.h
+++ b/library/cpp/digest/argonish/internal/blake2b/blake2b_ref.h
@@ -1,83 +1,83 @@
-#pragma once
-
-#include "blake2b.h"
+#pragma once
+
+#include "blake2b.h"
#include <library/cpp/digest/argonish/internal/rotations/rotations_ref.h>
-
-namespace NArgonish {
- static const ui8 Sigma[12][16] = {
- {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
- {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
- {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
- {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
- {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
- {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
- {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
- {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
- {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
- {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
- {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
- {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}};
-
- static const ui64 Iv[8] = {
- 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
- 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
- 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
- 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL};
-
- static inline void GRef(ui64 r, ui64 i, ui64& a, ui64& b, ui64& c, ui64& d, const ui64* m) {
- a = a + b + m[Sigma[r][2 * i + 0]];
- d = Rotr(d ^ a, 32);
- c = c + d;
- b = Rotr(b ^ c, 24);
- a = a + b + m[Sigma[r][2 * i + 1]];
- d = Rotr(d ^ a, 16);
- c = c + d;
- b = Rotr(b ^ c, 63);
- }
-
- static inline void Round(ui64 r, ui64* v, const ui64* m) {
- GRef(r, 0, v[0], v[4], v[8], v[12], m);
- GRef(r, 1, v[1], v[5], v[9], v[13], m);
- GRef(r, 2, v[2], v[6], v[10], v[14], m);
- GRef(r, 3, v[3], v[7], v[11], v[15], m);
- GRef(r, 4, v[0], v[5], v[10], v[15], m);
- GRef(r, 5, v[1], v[6], v[11], v[12], m);
- GRef(r, 6, v[2], v[7], v[8], v[13], m);
- GRef(r, 7, v[3], v[4], v[9], v[14], m);
- }
-
- template <>
- void* TBlake2B<EInstructionSet::REF>::GetIV_() const {
- return nullptr;
- }
-
- template <>
- void TBlake2B<EInstructionSet::REF>::InitialXor_(ui8* h, const ui8* p) {
- for (size_t i = 0; i < 8; ++i)
- ((ui64*)h)[i] = Iv[i] ^ ((ui64*)p)[i];
- }
-
- template <>
- void TBlake2B<EInstructionSet::REF>::Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]) {
- ui64 v[16];
- for (size_t i = 0; i < 8; ++i) {
- v[i] = State_.H[i];
- }
-
- v[8] = Iv[0];
- v[9] = Iv[1];
- v[10] = Iv[2];
- v[11] = Iv[3];
- v[12] = Iv[4] ^ State_.T[0];
- v[13] = Iv[5] ^ State_.T[1];
- v[14] = Iv[6] ^ State_.F[0];
- v[15] = Iv[7] ^ State_.F[1];
-
- for (ui64 r = 0; r < 12; ++r)
- Round(r, v, block);
-
- for (size_t i = 0; i < 8; ++i) {
- State_.H[i] = State_.H[i] ^ v[i] ^ v[i + 8];
- }
- }
-}
+
+namespace NArgonish {
+ static const ui8 Sigma[12][16] = {
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+ {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+ {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+ {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+ {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+ {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+ {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+ {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+ {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}};
+
+ static const ui64 Iv[8] = {
+ 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+ 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+ 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+ 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL};
+
+ static inline void GRef(ui64 r, ui64 i, ui64& a, ui64& b, ui64& c, ui64& d, const ui64* m) {
+ a = a + b + m[Sigma[r][2 * i + 0]];
+ d = Rotr(d ^ a, 32);
+ c = c + d;
+ b = Rotr(b ^ c, 24);
+ a = a + b + m[Sigma[r][2 * i + 1]];
+ d = Rotr(d ^ a, 16);
+ c = c + d;
+ b = Rotr(b ^ c, 63);
+ }
+
+ static inline void Round(ui64 r, ui64* v, const ui64* m) {
+ GRef(r, 0, v[0], v[4], v[8], v[12], m);
+ GRef(r, 1, v[1], v[5], v[9], v[13], m);
+ GRef(r, 2, v[2], v[6], v[10], v[14], m);
+ GRef(r, 3, v[3], v[7], v[11], v[15], m);
+ GRef(r, 4, v[0], v[5], v[10], v[15], m);
+ GRef(r, 5, v[1], v[6], v[11], v[12], m);
+ GRef(r, 6, v[2], v[7], v[8], v[13], m);
+ GRef(r, 7, v[3], v[4], v[9], v[14], m);
+ }
+
+ template <>
+ void* TBlake2B<EInstructionSet::REF>::GetIV_() const {
+ return nullptr;
+ }
+
+ template <>
+ void TBlake2B<EInstructionSet::REF>::InitialXor_(ui8* h, const ui8* p) {
+ for (size_t i = 0; i < 8; ++i)
+ ((ui64*)h)[i] = Iv[i] ^ ((ui64*)p)[i];
+ }
+
+ template <>
+ void TBlake2B<EInstructionSet::REF>::Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]) {
+ ui64 v[16];
+ for (size_t i = 0; i < 8; ++i) {
+ v[i] = State_.H[i];
+ }
+
+ v[8] = Iv[0];
+ v[9] = Iv[1];
+ v[10] = Iv[2];
+ v[11] = Iv[3];
+ v[12] = Iv[4] ^ State_.T[0];
+ v[13] = Iv[5] ^ State_.T[1];
+ v[14] = Iv[6] ^ State_.F[0];
+ v[15] = Iv[7] ^ State_.F[1];
+
+ for (ui64 r = 0; r < 12; ++r)
+ Round(r, v, block);
+
+ for (size_t i = 0; i < 8; ++i) {
+ State_.H[i] = State_.H[i] ^ v[i] ^ v[i + 8];
+ }
+ }
+}
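
The Rotr used by GRef is the ordinary 64-bit rotate-right; rotations_ref.h supplies the real implementation, but for rotation counts strictly between 0 and 64 it is equivalent to this sketch:

    #include <cstdint>

    // Plain 64-bit rotate-right, as called by GRef with c = 32, 24, 16, 63.
    static inline uint64_t RotrSketch(uint64_t w, unsigned c) {
        return (w >> c) | (w << (64 - c));
    }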
diff --git a/library/cpp/digest/argonish/internal/blake2b/blake2b_sse2.h b/library/cpp/digest/argonish/internal/blake2b/blake2b_sse2.h
index e85a78044c..0b4f8f85cc 100644
--- a/library/cpp/digest/argonish/internal/blake2b/blake2b_sse2.h
+++ b/library/cpp/digest/argonish/internal/blake2b/blake2b_sse2.h
@@ -1,163 +1,163 @@
-#pragma once
-
-#include <emmintrin.h>
-#include "blake2b.h"
+#pragma once
+
+#include <emmintrin.h>
+#include "blake2b.h"
#include <library/cpp/digest/argonish/internal/rotations/rotations_sse2.h>
-
-namespace NArgonish {
- template <>
- void* TBlake2B<EInstructionSet::SSE2>::GetIV_() const {
- static const __m128i Iv[4] = {
- _mm_set_epi64x(0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL),
- _mm_set_epi64x(0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL),
- _mm_set_epi64x(0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL),
- _mm_set_epi64x(0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL)};
-
- return (void*)Iv;
- }
-
- static const ui32 Sigma[12][16] = {
- {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
- {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
- {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
- {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
- {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
- {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
- {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
- {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
- {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
- {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
- {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
- {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}};
-
- static inline void G1(
- __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
- __m128i& b0, __m128i& b1) {
- row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
- row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
-
- row4l = _mm_xor_si128(row4l, row1l);
- row4h = _mm_xor_si128(row4h, row1h);
-
- row4l = Rotr32(row4l);
- row4h = Rotr32(row4h);
-
- row3l = _mm_add_epi64(row3l, row4l);
- row3h = _mm_add_epi64(row3h, row4h);
-
- row2l = _mm_xor_si128(row2l, row3l);
- row2h = _mm_xor_si128(row2h, row3h);
-
- row2l = Rotr24(row2l);
- row2h = Rotr24(row2h);
- }
-
- static inline void G2(
- __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
- __m128i& b0, __m128i& b1) {
- row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
- row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
-
- row4l = _mm_xor_si128(row4l, row1l);
- row4h = _mm_xor_si128(row4h, row1h);
-
- row4l = Rotr16(row4l);
- row4h = Rotr16(row4h);
-
- row3l = _mm_add_epi64(row3l, row4l);
- row3h = _mm_add_epi64(row3h, row4h);
-
- row2l = _mm_xor_si128(row2l, row3l);
- row2h = _mm_xor_si128(row2h, row3h);
-
- row2l = Rotr63(row2l);
- row2h = Rotr63(row2h);
- }
-
- static inline void Diagonalize(
- __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row2h, __m128i& row3h, __m128i& row4h) {
- __m128i t0 = row4l;
- __m128i t1 = row2l;
- row4l = row3l;
- row3l = row3h;
- row3h = row4l;
- row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
- row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
- row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
- row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
- }
-
- static inline void Undiagonalize(
- __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row2h, __m128i& row3h, __m128i& row4h) {
- __m128i t0 = row3l;
- row3l = row3h;
- row3h = t0;
- t0 = row2l;
- __m128i t1 = row4l;
- row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
- row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
- row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
- row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
- }
-
- static inline void Round(int r, const ui64* block_ptr,
- __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h) {
- __m128i b0, b1;
- b0 = _mm_set_epi64x(block_ptr[Sigma[r][2]], block_ptr[Sigma[r][0]]);
- b1 = _mm_set_epi64x(block_ptr[Sigma[r][6]], block_ptr[Sigma[r][4]]);
- G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
- b0 = _mm_set_epi64x(block_ptr[Sigma[r][3]], block_ptr[Sigma[r][1]]);
- b1 = _mm_set_epi64x(block_ptr[Sigma[r][7]], block_ptr[Sigma[r][5]]);
- G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
- Diagonalize(row2l, row3l, row4l, row2h, row3h, row4h);
- b0 = _mm_set_epi64x(block_ptr[Sigma[r][10]], block_ptr[Sigma[r][8]]);
- b1 = _mm_set_epi64x(block_ptr[Sigma[r][14]], block_ptr[Sigma[r][12]]);
- G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
- b0 = _mm_set_epi64x(block_ptr[Sigma[r][11]], block_ptr[Sigma[r][9]]);
- b1 = _mm_set_epi64x(block_ptr[Sigma[r][15]], block_ptr[Sigma[r][13]]);
- G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
- Undiagonalize(row2l, row3l, row4l, row2h, row3h, row4h);
- }
-
- template <>
- void TBlake2B<EInstructionSet::SSE2>::InitialXor_(ui8* h, const ui8* p) {
- __m128i* m_res = (__m128i*)h;
- const __m128i* m_p = (__m128i*)p;
- __m128i* iv = (__m128i*)GetIV_();
-
- _mm_storeu_si128(m_res + 0, _mm_xor_si128(iv[0], _mm_loadu_si128(m_p + 0)));
- _mm_storeu_si128(m_res + 1, _mm_xor_si128(iv[1], _mm_loadu_si128(m_p + 1)));
- _mm_storeu_si128(m_res + 2, _mm_xor_si128(iv[2], _mm_loadu_si128(m_p + 2)));
- _mm_storeu_si128(m_res + 3, _mm_xor_si128(iv[3], _mm_loadu_si128(m_p + 3)));
- }
-
- template <>
- void TBlake2B<EInstructionSet::SSE2>::Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]) {
- __m128i* iv = (__m128i*)GetIV_();
- __m128i row1l = _mm_loadu_si128((__m128i*)&State_.H[0]);
- __m128i row1h = _mm_loadu_si128((__m128i*)&State_.H[2]);
- __m128i row2l = _mm_loadu_si128((__m128i*)&State_.H[4]);
- __m128i row2h = _mm_loadu_si128((__m128i*)&State_.H[6]);
- __m128i row3l = iv[0];
- __m128i row3h = iv[1];
- __m128i row4l = _mm_xor_si128(iv[2], _mm_loadu_si128((__m128i*)&State_.T[0]));
- __m128i row4h = _mm_xor_si128(iv[3], _mm_loadu_si128((__m128i*)&State_.F[0]));
-
- for (int r = 0; r < 12; r++)
- Round(r, block, row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
-
- _mm_storeu_si128((__m128i*)&State_.H[0],
- _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[0]), _mm_xor_si128(row3l, row1l)));
- _mm_storeu_si128((__m128i*)&State_.H[2],
- _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[2]), _mm_xor_si128(row3h, row1h)));
- _mm_storeu_si128((__m128i*)&State_.H[4],
- _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[4]), _mm_xor_si128(row4l, row2l)));
- _mm_storeu_si128((__m128i*)&State_.H[6],
- _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[6]), _mm_xor_si128(row4h, row2h)));
- }
-}
+
+namespace NArgonish {
+ template <>
+ void* TBlake2B<EInstructionSet::SSE2>::GetIV_() const {
+ static const __m128i Iv[4] = {
+ _mm_set_epi64x(0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL),
+ _mm_set_epi64x(0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL),
+ _mm_set_epi64x(0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL),
+ _mm_set_epi64x(0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL)};
+
+ return (void*)Iv;
+ }
+
+ static const ui32 Sigma[12][16] = {
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+ {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+ {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+ {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+ {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+ {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+ {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+ {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+ {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}};
+
+ static inline void G1(
+ __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
+ __m128i& b0, __m128i& b1) {
+ row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
+ row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
+
+ row4l = _mm_xor_si128(row4l, row1l);
+ row4h = _mm_xor_si128(row4h, row1h);
+
+ row4l = Rotr32(row4l);
+ row4h = Rotr32(row4h);
+
+ row3l = _mm_add_epi64(row3l, row4l);
+ row3h = _mm_add_epi64(row3h, row4h);
+
+ row2l = _mm_xor_si128(row2l, row3l);
+ row2h = _mm_xor_si128(row2h, row3h);
+
+ row2l = Rotr24(row2l);
+ row2h = Rotr24(row2h);
+ }
+
+ static inline void G2(
+ __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
+ __m128i& b0, __m128i& b1) {
+ row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
+ row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
+
+ row4l = _mm_xor_si128(row4l, row1l);
+ row4h = _mm_xor_si128(row4h, row1h);
+
+ row4l = Rotr16(row4l);
+ row4h = Rotr16(row4h);
+
+ row3l = _mm_add_epi64(row3l, row4l);
+ row3h = _mm_add_epi64(row3h, row4h);
+
+ row2l = _mm_xor_si128(row2l, row3l);
+ row2h = _mm_xor_si128(row2h, row3h);
+
+ row2l = Rotr63(row2l);
+ row2h = Rotr63(row2h);
+ }
+
+ static inline void Diagonalize(
+ __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row2h, __m128i& row3h, __m128i& row4h) {
+ __m128i t0 = row4l;
+ __m128i t1 = row2l;
+ row4l = row3l;
+ row3l = row3h;
+ row3h = row4l;
+ row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
+ row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
+ row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
+ row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
+ }
+
+ static inline void Undiagonalize(
+ __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row2h, __m128i& row3h, __m128i& row4h) {
+ __m128i t0 = row3l;
+ row3l = row3h;
+ row3h = t0;
+ t0 = row2l;
+ __m128i t1 = row4l;
+ row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
+ row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
+ row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
+ row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
+ }
+
+ static inline void Round(int r, const ui64* block_ptr,
+ __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h) {
+ __m128i b0, b1;
+ b0 = _mm_set_epi64x(block_ptr[Sigma[r][2]], block_ptr[Sigma[r][0]]);
+ b1 = _mm_set_epi64x(block_ptr[Sigma[r][6]], block_ptr[Sigma[r][4]]);
+ G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
+ b0 = _mm_set_epi64x(block_ptr[Sigma[r][3]], block_ptr[Sigma[r][1]]);
+ b1 = _mm_set_epi64x(block_ptr[Sigma[r][7]], block_ptr[Sigma[r][5]]);
+ G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
+ Diagonalize(row2l, row3l, row4l, row2h, row3h, row4h);
+ b0 = _mm_set_epi64x(block_ptr[Sigma[r][10]], block_ptr[Sigma[r][8]]);
+ b1 = _mm_set_epi64x(block_ptr[Sigma[r][14]], block_ptr[Sigma[r][12]]);
+ G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
+ b0 = _mm_set_epi64x(block_ptr[Sigma[r][11]], block_ptr[Sigma[r][9]]);
+ b1 = _mm_set_epi64x(block_ptr[Sigma[r][15]], block_ptr[Sigma[r][13]]);
+ G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
+ Undiagonalize(row2l, row3l, row4l, row2h, row3h, row4h);
+ }
+
+ template <>
+ void TBlake2B<EInstructionSet::SSE2>::InitialXor_(ui8* h, const ui8* p) {
+ __m128i* m_res = (__m128i*)h;
+ const __m128i* m_p = (__m128i*)p;
+ __m128i* iv = (__m128i*)GetIV_();
+
+ _mm_storeu_si128(m_res + 0, _mm_xor_si128(iv[0], _mm_loadu_si128(m_p + 0)));
+ _mm_storeu_si128(m_res + 1, _mm_xor_si128(iv[1], _mm_loadu_si128(m_p + 1)));
+ _mm_storeu_si128(m_res + 2, _mm_xor_si128(iv[2], _mm_loadu_si128(m_p + 2)));
+ _mm_storeu_si128(m_res + 3, _mm_xor_si128(iv[3], _mm_loadu_si128(m_p + 3)));
+ }
+
+ template <>
+ void TBlake2B<EInstructionSet::SSE2>::Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]) {
+ __m128i* iv = (__m128i*)GetIV_();
+ __m128i row1l = _mm_loadu_si128((__m128i*)&State_.H[0]);
+ __m128i row1h = _mm_loadu_si128((__m128i*)&State_.H[2]);
+ __m128i row2l = _mm_loadu_si128((__m128i*)&State_.H[4]);
+ __m128i row2h = _mm_loadu_si128((__m128i*)&State_.H[6]);
+ __m128i row3l = iv[0];
+ __m128i row3h = iv[1];
+ __m128i row4l = _mm_xor_si128(iv[2], _mm_loadu_si128((__m128i*)&State_.T[0]));
+ __m128i row4h = _mm_xor_si128(iv[3], _mm_loadu_si128((__m128i*)&State_.F[0]));
+
+ for (int r = 0; r < 12; r++)
+ Round(r, block, row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
+
+ _mm_storeu_si128((__m128i*)&State_.H[0],
+ _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[0]), _mm_xor_si128(row3l, row1l)));
+ _mm_storeu_si128((__m128i*)&State_.H[2],
+ _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[2]), _mm_xor_si128(row3h, row1h)));
+ _mm_storeu_si128((__m128i*)&State_.H[4],
+ _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[4]), _mm_xor_si128(row4l, row2l)));
+ _mm_storeu_si128((__m128i*)&State_.H[6],
+ _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[6]), _mm_xor_si128(row4h, row2h)));
+ }
+}
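
Unlike the SSSE3/SSE4.1 paths, SSE2 has no byte shuffle (_mm_shuffle_epi8), so rotations such as Rotr24 must fall back to a shift-and-or per 64-bit lane. A sketch of that fallback (the actual rotations_sse2.h may be written differently):

    #include <emmintrin.h>

    // Rotate each 64-bit lane right by 24 bits using only SSE2 shifts.
    static inline __m128i Rotr24Sketch(__m128i x) {
        return _mm_or_si128(_mm_srli_epi64(x, 24), _mm_slli_epi64(x, 40));
    }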
diff --git a/library/cpp/digest/argonish/internal/blake2b/blake2b_sse41.h b/library/cpp/digest/argonish/internal/blake2b/blake2b_sse41.h
index 1a033bcceb..c1103db4c9 100644
--- a/library/cpp/digest/argonish/internal/blake2b/blake2b_sse41.h
+++ b/library/cpp/digest/argonish/internal/blake2b/blake2b_sse41.h
@@ -1,172 +1,172 @@
-#pragma once
-
-#include <smmintrin.h>
-#include "blake2b.h"
-#include "load_sse41.h"
+#pragma once
+
+#include <smmintrin.h>
+#include "blake2b.h"
+#include "load_sse41.h"
#include <library/cpp/digest/argonish/internal/rotations/rotations_ssse3.h>
-
-namespace NArgonish {
- template <>
- void* TBlake2B<EInstructionSet::SSE41>::GetIV_() const {
- static const __m128i Iv[4] = {
- _mm_set_epi64x(0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL),
- _mm_set_epi64x(0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL),
- _mm_set_epi64x(0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL),
- _mm_set_epi64x(0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL)};
- return (void*)Iv;
- }
-
- static inline void G1(
- __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
- __m128i& b0, __m128i& b1) {
- row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
- row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
-
- row4l = _mm_xor_si128(row4l, row1l);
- row4h = _mm_xor_si128(row4h, row1h);
-
- row4l = Rotr32(row4l);
- row4h = Rotr32(row4h);
-
- row3l = _mm_add_epi64(row3l, row4l);
- row3h = _mm_add_epi64(row3h, row4h);
-
- row2l = _mm_xor_si128(row2l, row3l);
- row2h = _mm_xor_si128(row2h, row3h);
-
- row2l = Rotr24(row2l);
- row2h = Rotr24(row2h);
- }
-
- static inline void G2(
- __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
- __m128i& b0, __m128i& b1) {
- row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
- row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
-
- row4l = _mm_xor_si128(row4l, row1l);
- row4h = _mm_xor_si128(row4h, row1h);
-
- row4l = Rotr16(row4l);
- row4h = Rotr16(row4h);
-
- row3l = _mm_add_epi64(row3l, row4l);
- row3h = _mm_add_epi64(row3h, row4h);
-
- row2l = _mm_xor_si128(row2l, row3l);
- row2h = _mm_xor_si128(row2h, row3h);
-
- row2l = Rotr63(row2l);
- row2h = Rotr63(row2h);
- }
-
- static inline void Diagonalize(
- __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row2h, __m128i& row3h, __m128i& row4h) {
- __m128i t0 = _mm_alignr_epi8(row2h, row2l, 8);
- __m128i t1 = _mm_alignr_epi8(row2l, row2h, 8);
- row2l = t0;
- row2h = t1;
-
- t0 = row3l;
- row3l = row3h;
- row3h = t0;
-
- t0 = _mm_alignr_epi8(row4h, row4l, 8);
- t1 = _mm_alignr_epi8(row4l, row4h, 8);
- row4l = t1;
- row4h = t0;
- }
-
- static inline void Undiagonalize(
- __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row2h, __m128i& row3h, __m128i& row4h) {
- __m128i t0 = _mm_alignr_epi8(row2l, row2h, 8);
- __m128i t1 = _mm_alignr_epi8(row2h, row2l, 8);
- row2l = t0;
- row2h = t1;
-
- t0 = row3l;
- row3l = row3h;
- row3h = t0;
-
- t0 = _mm_alignr_epi8(row4l, row4h, 8);
- t1 = _mm_alignr_epi8(row4h, row4l, 8);
- row4l = t1;
- row4h = t0;
- }
-
-#define ROUND(r) \
- LOAD_MSG_##r##_1(b0, b1); \
- G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
- LOAD_MSG_##r##_2(b0, b1); \
- G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
- Diagonalize(row2l, row3l, row4l, row2h, row3h, row4h); \
- LOAD_MSG_##r##_3(b0, b1); \
- G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
- LOAD_MSG_##r##_4(b0, b1); \
- G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
- Undiagonalize(row2l, row3l, row4l, row2h, row3h, row4h);
-
- template <>
- void TBlake2B<EInstructionSet::SSE41>::InitialXor_(ui8* h, const ui8* p) {
- __m128i* m_res = (__m128i*)h;
- const __m128i* m_p = (__m128i*)p;
- __m128i* iv = (__m128i*)GetIV_();
-
- _mm_storeu_si128(m_res + 0, _mm_xor_si128(iv[0], _mm_loadu_si128(m_p + 0)));
- _mm_storeu_si128(m_res + 1, _mm_xor_si128(iv[1], _mm_loadu_si128(m_p + 1)));
- _mm_storeu_si128(m_res + 2, _mm_xor_si128(iv[2], _mm_loadu_si128(m_p + 2)));
- _mm_storeu_si128(m_res + 3, _mm_xor_si128(iv[3], _mm_loadu_si128(m_p + 3)));
- }
-
- template <>
- void TBlake2B<EInstructionSet::SSE41>::Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]) {
- const __m128i* block_ptr = (__m128i*)block;
- __m128i* iv = (__m128i*)GetIV_();
- const __m128i m0 = _mm_loadu_si128(block_ptr + 0);
- const __m128i m1 = _mm_loadu_si128(block_ptr + 1);
- const __m128i m2 = _mm_loadu_si128(block_ptr + 2);
- const __m128i m3 = _mm_loadu_si128(block_ptr + 3);
- const __m128i m4 = _mm_loadu_si128(block_ptr + 4);
- const __m128i m5 = _mm_loadu_si128(block_ptr + 5);
- const __m128i m6 = _mm_loadu_si128(block_ptr + 6);
- const __m128i m7 = _mm_loadu_si128(block_ptr + 7);
-
- __m128i row1l = _mm_loadu_si128((__m128i*)&State_.H[0]);
- __m128i row1h = _mm_loadu_si128((__m128i*)&State_.H[2]);
- __m128i row2l = _mm_loadu_si128((__m128i*)&State_.H[4]);
- __m128i row2h = _mm_loadu_si128((__m128i*)&State_.H[6]);
- __m128i row3l = iv[0];
- __m128i row3h = iv[1];
- __m128i row4l = _mm_xor_si128(iv[2], _mm_loadu_si128((__m128i*)&State_.T[0]));
- __m128i row4h = _mm_xor_si128(iv[3], _mm_loadu_si128((__m128i*)&State_.F[0]));
- __m128i b0, b1;
-
- ROUND(0);
- ROUND(1);
- ROUND(2);
- ROUND(3);
- ROUND(4);
- ROUND(5);
- ROUND(6);
- ROUND(7);
- ROUND(8);
- ROUND(9);
- ROUND(10);
- ROUND(11);
-
- _mm_storeu_si128((__m128i*)&State_.H[0],
- _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[0]), _mm_xor_si128(row3l, row1l)));
- _mm_storeu_si128((__m128i*)&State_.H[2],
- _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[2]), _mm_xor_si128(row3h, row1h)));
- _mm_storeu_si128((__m128i*)&State_.H[4],
- _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[4]), _mm_xor_si128(row4l, row2l)));
- _mm_storeu_si128((__m128i*)&State_.H[6],
- _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[6]), _mm_xor_si128(row4h, row2h)));
- }
-
-#undef ROUND
-}
+
+namespace NArgonish {
+ template <>
+ void* TBlake2B<EInstructionSet::SSE41>::GetIV_() const {
+ static const __m128i Iv[4] = {
+ _mm_set_epi64x(0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL),
+ _mm_set_epi64x(0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL),
+ _mm_set_epi64x(0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL),
+ _mm_set_epi64x(0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL)};
+ return (void*)Iv;
+ }
+
+ static inline void G1(
+ __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
+ __m128i& b0, __m128i& b1) {
+ row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
+ row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
+
+ row4l = _mm_xor_si128(row4l, row1l);
+ row4h = _mm_xor_si128(row4h, row1h);
+
+ row4l = Rotr32(row4l);
+ row4h = Rotr32(row4h);
+
+ row3l = _mm_add_epi64(row3l, row4l);
+ row3h = _mm_add_epi64(row3h, row4h);
+
+ row2l = _mm_xor_si128(row2l, row3l);
+ row2h = _mm_xor_si128(row2h, row3h);
+
+ row2l = Rotr24(row2l);
+ row2h = Rotr24(row2h);
+ }
+
+ static inline void G2(
+ __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
+ __m128i& b0, __m128i& b1) {
+ row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
+ row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
+
+ row4l = _mm_xor_si128(row4l, row1l);
+ row4h = _mm_xor_si128(row4h, row1h);
+
+ row4l = Rotr16(row4l);
+ row4h = Rotr16(row4h);
+
+ row3l = _mm_add_epi64(row3l, row4l);
+ row3h = _mm_add_epi64(row3h, row4h);
+
+ row2l = _mm_xor_si128(row2l, row3l);
+ row2h = _mm_xor_si128(row2h, row3h);
+
+ row2l = Rotr63(row2l);
+ row2h = Rotr63(row2h);
+ }
+
+ static inline void Diagonalize(
+ __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row2h, __m128i& row3h, __m128i& row4h) {
+ __m128i t0 = _mm_alignr_epi8(row2h, row2l, 8);
+ __m128i t1 = _mm_alignr_epi8(row2l, row2h, 8);
+ row2l = t0;
+ row2h = t1;
+
+ t0 = row3l;
+ row3l = row3h;
+ row3h = t0;
+
+ t0 = _mm_alignr_epi8(row4h, row4l, 8);
+ t1 = _mm_alignr_epi8(row4l, row4h, 8);
+ row4l = t1;
+ row4h = t0;
+ }
+
+ static inline void Undiagonalize(
+ __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row2h, __m128i& row3h, __m128i& row4h) {
+ __m128i t0 = _mm_alignr_epi8(row2l, row2h, 8);
+ __m128i t1 = _mm_alignr_epi8(row2h, row2l, 8);
+ row2l = t0;
+ row2h = t1;
+
+ t0 = row3l;
+ row3l = row3h;
+ row3h = t0;
+
+ t0 = _mm_alignr_epi8(row4l, row4h, 8);
+ t1 = _mm_alignr_epi8(row4h, row4l, 8);
+ row4l = t1;
+ row4h = t0;
+ }
+
+#define ROUND(r) \
+ LOAD_MSG_##r##_1(b0, b1); \
+ G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
+ LOAD_MSG_##r##_2(b0, b1); \
+ G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
+ Diagonalize(row2l, row3l, row4l, row2h, row3h, row4h); \
+ LOAD_MSG_##r##_3(b0, b1); \
+ G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
+ LOAD_MSG_##r##_4(b0, b1); \
+ G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
+ Undiagonalize(row2l, row3l, row4l, row2h, row3h, row4h);
+
+ template <>
+ void TBlake2B<EInstructionSet::SSE41>::InitialXor_(ui8* h, const ui8* p) {
+ __m128i* m_res = (__m128i*)h;
+ const __m128i* m_p = (__m128i*)p;
+ __m128i* iv = (__m128i*)GetIV_();
+
+ _mm_storeu_si128(m_res + 0, _mm_xor_si128(iv[0], _mm_loadu_si128(m_p + 0)));
+ _mm_storeu_si128(m_res + 1, _mm_xor_si128(iv[1], _mm_loadu_si128(m_p + 1)));
+ _mm_storeu_si128(m_res + 2, _mm_xor_si128(iv[2], _mm_loadu_si128(m_p + 2)));
+ _mm_storeu_si128(m_res + 3, _mm_xor_si128(iv[3], _mm_loadu_si128(m_p + 3)));
+ }
+
+ template <>
+ void TBlake2B<EInstructionSet::SSE41>::Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]) {
+ const __m128i* block_ptr = (__m128i*)block;
+ __m128i* iv = (__m128i*)GetIV_();
+ const __m128i m0 = _mm_loadu_si128(block_ptr + 0);
+ const __m128i m1 = _mm_loadu_si128(block_ptr + 1);
+ const __m128i m2 = _mm_loadu_si128(block_ptr + 2);
+ const __m128i m3 = _mm_loadu_si128(block_ptr + 3);
+ const __m128i m4 = _mm_loadu_si128(block_ptr + 4);
+ const __m128i m5 = _mm_loadu_si128(block_ptr + 5);
+ const __m128i m6 = _mm_loadu_si128(block_ptr + 6);
+ const __m128i m7 = _mm_loadu_si128(block_ptr + 7);
+
+ __m128i row1l = _mm_loadu_si128((__m128i*)&State_.H[0]);
+ __m128i row1h = _mm_loadu_si128((__m128i*)&State_.H[2]);
+ __m128i row2l = _mm_loadu_si128((__m128i*)&State_.H[4]);
+ __m128i row2h = _mm_loadu_si128((__m128i*)&State_.H[6]);
+ __m128i row3l = iv[0];
+ __m128i row3h = iv[1];
+ __m128i row4l = _mm_xor_si128(iv[2], _mm_loadu_si128((__m128i*)&State_.T[0]));
+ __m128i row4h = _mm_xor_si128(iv[3], _mm_loadu_si128((__m128i*)&State_.F[0]));
+ __m128i b0, b1;
+
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+ ROUND(10);
+ ROUND(11);
+
+ _mm_storeu_si128((__m128i*)&State_.H[0],
+ _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[0]), _mm_xor_si128(row3l, row1l)));
+ _mm_storeu_si128((__m128i*)&State_.H[2],
+ _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[2]), _mm_xor_si128(row3h, row1h)));
+ _mm_storeu_si128((__m128i*)&State_.H[4],
+ _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[4]), _mm_xor_si128(row4l, row2l)));
+ _mm_storeu_si128((__m128i*)&State_.H[6],
+ _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[6]), _mm_xor_si128(row4h, row2h)));
+ }
+
+#undef ROUND
+}
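
For orientation: the G1/G2 helpers above are the two halves of the standard BLAKE2b G mixing function, vectorized so that each __m128i carries two of the four columns. A minimal scalar sketch of the full G step under the same rotation schedule (32, 24, 16, 63); the names here are illustrative and not part of the patch:

    #include <cstdint>

    // One BLAKE2b G step on a single column (a, b, c, d) with message words x, y.
    // G1 above performs the first two lines for two columns at once; G2 the rest.
    static inline uint64_t Rotr64(uint64_t v, unsigned n) {
        return (v >> n) | (v << (64 - n)); // n is always 16, 24, 32 or 63 here
    }
    static inline void GScalar(uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d,
                               uint64_t x, uint64_t y) {
        a = a + b + x; d = Rotr64(d ^ a, 32);
        c = c + d;     b = Rotr64(b ^ c, 24);
        a = a + b + y; d = Rotr64(d ^ a, 16);
        c = c + d;     b = Rotr64(b ^ c, 63);
    }
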
diff --git a/library/cpp/digest/argonish/internal/blake2b/blake2b_ssse3.h b/library/cpp/digest/argonish/internal/blake2b/blake2b_ssse3.h
index 4cca5a5e7f..24bf8ea31a 100644
--- a/library/cpp/digest/argonish/internal/blake2b/blake2b_ssse3.h
+++ b/library/cpp/digest/argonish/internal/blake2b/blake2b_ssse3.h
@@ -1,171 +1,171 @@
-#pragma once
-
-#include <emmintrin.h>
-#include <tmmintrin.h>
-#include "blake2b.h"
+#pragma once
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include "blake2b.h"
#include <library/cpp/digest/argonish/internal/rotations/rotations_ssse3.h>
-
-namespace NArgonish {
- template <>
- void* TBlake2B<EInstructionSet::SSSE3>::GetIV_() const {
- static const __m128i Iv[4] = {
- _mm_set_epi64x(0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL),
- _mm_set_epi64x(0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL),
- _mm_set_epi64x(0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL),
- _mm_set_epi64x(0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL)};
- return (void*)Iv;
- }
-
- static const ui32 Sigma[12][16] = {
- {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
- {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
- {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
- {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
- {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
- {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
- {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
- {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
- {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
- {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
- {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
- {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}};
-
- static inline void G1(
- __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
- __m128i& b0, __m128i& b1) {
- row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
- row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
-
- row4l = _mm_xor_si128(row4l, row1l);
- row4h = _mm_xor_si128(row4h, row1h);
-
- row4l = Rotr32(row4l);
- row4h = Rotr32(row4h);
-
- row3l = _mm_add_epi64(row3l, row4l);
- row3h = _mm_add_epi64(row3h, row4h);
-
- row2l = _mm_xor_si128(row2l, row3l);
- row2h = _mm_xor_si128(row2h, row3h);
-
- row2l = Rotr24(row2l);
- row2h = Rotr24(row2h);
- }
-
- static inline void G2(
- __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
- __m128i& b0, __m128i& b1) {
- row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
- row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
-
- row4l = _mm_xor_si128(row4l, row1l);
- row4h = _mm_xor_si128(row4h, row1h);
-
- row4l = Rotr16(row4l);
- row4h = Rotr16(row4h);
-
- row3l = _mm_add_epi64(row3l, row4l);
- row3h = _mm_add_epi64(row3h, row4h);
-
- row2l = _mm_xor_si128(row2l, row3l);
- row2h = _mm_xor_si128(row2h, row3h);
-
- row2l = Rotr63(row2l);
- row2h = Rotr63(row2h);
- }
-
- static inline void Diagonalize(
- __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row2h, __m128i& row3h, __m128i& row4h) {
- __m128i t0 = _mm_alignr_epi8(row2h, row2l, 8);
- __m128i t1 = _mm_alignr_epi8(row2l, row2h, 8);
- row2l = t0;
- row2h = t1;
-
- t0 = row3l;
- row3l = row3h;
- row3h = t0;
-
- t0 = _mm_alignr_epi8(row4h, row4l, 8);
- t1 = _mm_alignr_epi8(row4l, row4h, 8);
- row4l = t1;
- row4h = t0;
- }
-
- static inline void Undiagonalize(
- __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row2h, __m128i& row3h, __m128i& row4h) {
- __m128i t0 = _mm_alignr_epi8(row2l, row2h, 8);
- __m128i t1 = _mm_alignr_epi8(row2h, row2l, 8);
- row2l = t0;
- row2h = t1;
-
- t0 = row3l;
- row3l = row3h;
- row3h = t0;
-
- t0 = _mm_alignr_epi8(row4l, row4h, 8);
- t1 = _mm_alignr_epi8(row4h, row4l, 8);
- row4l = t1;
- row4h = t0;
- }
-
- static inline void Round(int r, const ui64* block_ptr,
- __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
- __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h) {
- __m128i b0, b1;
- b0 = _mm_set_epi64x(block_ptr[Sigma[r][2]], block_ptr[Sigma[r][0]]);
- b1 = _mm_set_epi64x(block_ptr[Sigma[r][6]], block_ptr[Sigma[r][4]]);
- G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
- b0 = _mm_set_epi64x(block_ptr[Sigma[r][3]], block_ptr[Sigma[r][1]]);
- b1 = _mm_set_epi64x(block_ptr[Sigma[r][7]], block_ptr[Sigma[r][5]]);
- G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
- Diagonalize(row2l, row3l, row4l, row2h, row3h, row4h);
- b0 = _mm_set_epi64x(block_ptr[Sigma[r][10]], block_ptr[Sigma[r][8]]);
- b1 = _mm_set_epi64x(block_ptr[Sigma[r][14]], block_ptr[Sigma[r][12]]);
- G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
- b0 = _mm_set_epi64x(block_ptr[Sigma[r][11]], block_ptr[Sigma[r][9]]);
- b1 = _mm_set_epi64x(block_ptr[Sigma[r][15]], block_ptr[Sigma[r][13]]);
- G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
- Undiagonalize(row2l, row3l, row4l, row2h, row3h, row4h);
- }
-
- template <>
- void TBlake2B<EInstructionSet::SSSE3>::InitialXor_(ui8* h, const ui8* p) {
- __m128i* m_res = (__m128i*)h;
- const __m128i* m_p = (__m128i*)p;
- __m128i* iv = (__m128i*)GetIV_();
-
- _mm_storeu_si128(m_res + 0, _mm_xor_si128(iv[0], _mm_loadu_si128(m_p + 0)));
- _mm_storeu_si128(m_res + 1, _mm_xor_si128(iv[1], _mm_loadu_si128(m_p + 1)));
- _mm_storeu_si128(m_res + 2, _mm_xor_si128(iv[2], _mm_loadu_si128(m_p + 2)));
- _mm_storeu_si128(m_res + 3, _mm_xor_si128(iv[3], _mm_loadu_si128(m_p + 3)));
- }
-
- template <>
- void TBlake2B<EInstructionSet::SSSE3>::Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]) {
- __m128i* iv = (__m128i*)GetIV_();
- __m128i row1l = _mm_loadu_si128((__m128i*)&State_.H[0]);
- __m128i row1h = _mm_loadu_si128((__m128i*)&State_.H[2]);
- __m128i row2l = _mm_loadu_si128((__m128i*)&State_.H[4]);
- __m128i row2h = _mm_loadu_si128((__m128i*)&State_.H[6]);
- __m128i row3l = iv[0];
- __m128i row3h = iv[1];
- __m128i row4l = _mm_xor_si128(iv[2], _mm_loadu_si128((__m128i*)&State_.T[0]));
- __m128i row4h = _mm_xor_si128(iv[3], _mm_loadu_si128((__m128i*)&State_.F[0]));
-
- for (int r = 0; r < 12; ++r)
- Round(r, block, row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
-
- _mm_storeu_si128((__m128i*)&State_.H[0],
- _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[0]), _mm_xor_si128(row3l, row1l)));
- _mm_storeu_si128((__m128i*)&State_.H[2],
- _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[2]), _mm_xor_si128(row3h, row1h)));
- _mm_storeu_si128((__m128i*)&State_.H[4],
- _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[4]), _mm_xor_si128(row4l, row2l)));
- _mm_storeu_si128((__m128i*)&State_.H[6],
- _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[6]), _mm_xor_si128(row4h, row2h)));
- }
-}
+
+namespace NArgonish {
+ template <>
+ void* TBlake2B<EInstructionSet::SSSE3>::GetIV_() const {
+ static const __m128i Iv[4] = {
+ _mm_set_epi64x(0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL),
+ _mm_set_epi64x(0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL),
+ _mm_set_epi64x(0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL),
+ _mm_set_epi64x(0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL)};
+ return (void*)Iv;
+ }
+
+ static const ui32 Sigma[12][16] = {
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+ {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+ {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+ {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+ {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+ {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+ {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+ {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+ {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}};
+
+ static inline void G1(
+ __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
+ __m128i& b0, __m128i& b1) {
+ row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
+ row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
+
+ row4l = _mm_xor_si128(row4l, row1l);
+ row4h = _mm_xor_si128(row4h, row1h);
+
+ row4l = Rotr32(row4l);
+ row4h = Rotr32(row4h);
+
+ row3l = _mm_add_epi64(row3l, row4l);
+ row3h = _mm_add_epi64(row3h, row4h);
+
+ row2l = _mm_xor_si128(row2l, row3l);
+ row2h = _mm_xor_si128(row2h, row3h);
+
+ row2l = Rotr24(row2l);
+ row2h = Rotr24(row2h);
+ }
+
+ static inline void G2(
+ __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
+ __m128i& b0, __m128i& b1) {
+ row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
+ row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
+
+ row4l = _mm_xor_si128(row4l, row1l);
+ row4h = _mm_xor_si128(row4h, row1h);
+
+ row4l = Rotr16(row4l);
+ row4h = Rotr16(row4h);
+
+ row3l = _mm_add_epi64(row3l, row4l);
+ row3h = _mm_add_epi64(row3h, row4h);
+
+ row2l = _mm_xor_si128(row2l, row3l);
+ row2h = _mm_xor_si128(row2h, row3h);
+
+ row2l = Rotr63(row2l);
+ row2h = Rotr63(row2h);
+ }
+
+ static inline void Diagonalize(
+ __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row2h, __m128i& row3h, __m128i& row4h) {
+ __m128i t0 = _mm_alignr_epi8(row2h, row2l, 8);
+ __m128i t1 = _mm_alignr_epi8(row2l, row2h, 8);
+ row2l = t0;
+ row2h = t1;
+
+ t0 = row3l;
+ row3l = row3h;
+ row3h = t0;
+
+ t0 = _mm_alignr_epi8(row4h, row4l, 8);
+ t1 = _mm_alignr_epi8(row4l, row4h, 8);
+ row4l = t1;
+ row4h = t0;
+ }
+
+ static inline void Undiagonalize(
+ __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row2h, __m128i& row3h, __m128i& row4h) {
+ __m128i t0 = _mm_alignr_epi8(row2l, row2h, 8);
+ __m128i t1 = _mm_alignr_epi8(row2h, row2l, 8);
+ row2l = t0;
+ row2h = t1;
+
+ t0 = row3l;
+ row3l = row3h;
+ row3h = t0;
+
+ t0 = _mm_alignr_epi8(row4l, row4h, 8);
+ t1 = _mm_alignr_epi8(row4h, row4l, 8);
+ row4l = t1;
+ row4h = t0;
+ }
+
+ static inline void Round(int r, const ui64* block_ptr,
+ __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
+ __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h) {
+ __m128i b0, b1;
+ b0 = _mm_set_epi64x(block_ptr[Sigma[r][2]], block_ptr[Sigma[r][0]]);
+ b1 = _mm_set_epi64x(block_ptr[Sigma[r][6]], block_ptr[Sigma[r][4]]);
+ G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
+ b0 = _mm_set_epi64x(block_ptr[Sigma[r][3]], block_ptr[Sigma[r][1]]);
+ b1 = _mm_set_epi64x(block_ptr[Sigma[r][7]], block_ptr[Sigma[r][5]]);
+ G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
+ Diagonalize(row2l, row3l, row4l, row2h, row3h, row4h);
+ b0 = _mm_set_epi64x(block_ptr[Sigma[r][10]], block_ptr[Sigma[r][8]]);
+ b1 = _mm_set_epi64x(block_ptr[Sigma[r][14]], block_ptr[Sigma[r][12]]);
+ G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
+ b0 = _mm_set_epi64x(block_ptr[Sigma[r][11]], block_ptr[Sigma[r][9]]);
+ b1 = _mm_set_epi64x(block_ptr[Sigma[r][15]], block_ptr[Sigma[r][13]]);
+ G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);
+ Undiagonalize(row2l, row3l, row4l, row2h, row3h, row4h);
+ }
+
+ template <>
+ void TBlake2B<EInstructionSet::SSSE3>::InitialXor_(ui8* h, const ui8* p) {
+ __m128i* m_res = (__m128i*)h;
+ const __m128i* m_p = (__m128i*)p;
+ __m128i* iv = (__m128i*)GetIV_();
+
+ _mm_storeu_si128(m_res + 0, _mm_xor_si128(iv[0], _mm_loadu_si128(m_p + 0)));
+ _mm_storeu_si128(m_res + 1, _mm_xor_si128(iv[1], _mm_loadu_si128(m_p + 1)));
+ _mm_storeu_si128(m_res + 2, _mm_xor_si128(iv[2], _mm_loadu_si128(m_p + 2)));
+ _mm_storeu_si128(m_res + 3, _mm_xor_si128(iv[3], _mm_loadu_si128(m_p + 3)));
+ }
+
+ template <>
+ void TBlake2B<EInstructionSet::SSSE3>::Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]) {
+ __m128i* iv = (__m128i*)GetIV_();
+ __m128i row1l = _mm_loadu_si128((__m128i*)&State_.H[0]);
+ __m128i row1h = _mm_loadu_si128((__m128i*)&State_.H[2]);
+ __m128i row2l = _mm_loadu_si128((__m128i*)&State_.H[4]);
+ __m128i row2h = _mm_loadu_si128((__m128i*)&State_.H[6]);
+ __m128i row3l = iv[0];
+ __m128i row3h = iv[1];
+ __m128i row4l = _mm_xor_si128(iv[2], _mm_loadu_si128((__m128i*)&State_.T[0]));
+ __m128i row4h = _mm_xor_si128(iv[3], _mm_loadu_si128((__m128i*)&State_.F[0]));
+
+ for (int r = 0; r < 12; ++r)
+ Round(r, block, row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
+
+ _mm_storeu_si128((__m128i*)&State_.H[0],
+ _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[0]), _mm_xor_si128(row3l, row1l)));
+ _mm_storeu_si128((__m128i*)&State_.H[2],
+ _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[2]), _mm_xor_si128(row3h, row1h)));
+ _mm_storeu_si128((__m128i*)&State_.H[4],
+ _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[4]), _mm_xor_si128(row4l, row2l)));
+ _mm_storeu_si128((__m128i*)&State_.H[6],
+ _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[6]), _mm_xor_si128(row4h, row2h)));
+ }
+}
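
Unlike the SSE4.1 path, the SSSE3 Compress_ gathers message words through the Sigma table at run time; note that rows 10 and 11 of Sigma repeat rows 0 and 1, which is how BLAKE2b defines its twelve rounds. The gather itself just builds a two-word pair per register; a standalone sketch with illustrative names:

    #include <cstdint>
    #include <emmintrin.h>

    // _mm_set_epi64x(m[hi], m[lo]) places word `lo` in the low 64-bit lane and
    // word `hi` in the high lane -- the two-columns-per-register layout that
    // G1/G2 expect (cf. Round(): lo = Sigma[r][0], hi = Sigma[r][2] for b0).
    static inline __m128i GatherPair(const uint64_t* m, unsigned lo, unsigned hi) {
        return _mm_set_epi64x((long long)m[hi], (long long)m[lo]);
    }
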
diff --git a/library/cpp/digest/argonish/internal/blake2b/load_sse41.h b/library/cpp/digest/argonish/internal/blake2b/load_sse41.h
index 060455aac2..9b1f7781f9 100644
--- a/library/cpp/digest/argonish/internal/blake2b/load_sse41.h
+++ b/library/cpp/digest/argonish/internal/blake2b/load_sse41.h
@@ -1,301 +1,301 @@
-#pragma once
-
-/*
- BLAKE2 reference source code package - optimized C implementations
- Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
- terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
- your option. The terms of these licenses can be found at:
- - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
- - OpenSSL license : https://www.openssl.org/source/license.html
- - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
- More information about the BLAKE2 hash function can be found at
- https://blake2.net.
-*/
-
-#define LOAD_MSG_0_1(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m0, m1); \
- b1 = _mm_unpacklo_epi64(m2, m3); \
- } while (0)
-
-#define LOAD_MSG_0_2(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m0, m1); \
- b1 = _mm_unpackhi_epi64(m2, m3); \
- } while (0)
-
-#define LOAD_MSG_0_3(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m4, m5); \
- b1 = _mm_unpacklo_epi64(m6, m7); \
- } while (0)
-
-#define LOAD_MSG_0_4(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m4, m5); \
- b1 = _mm_unpackhi_epi64(m6, m7); \
- } while (0)
-
-#define LOAD_MSG_1_1(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m7, m2); \
- b1 = _mm_unpackhi_epi64(m4, m6); \
- } while (0)
-
-#define LOAD_MSG_1_2(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m5, m4); \
- b1 = _mm_alignr_epi8(m3, m7, 8); \
- } while (0)
-
-#define LOAD_MSG_1_3(b0, b1) \
- do { \
- b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
- b1 = _mm_unpackhi_epi64(m5, m2); \
- } while (0)
-
-#define LOAD_MSG_1_4(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m6, m1); \
- b1 = _mm_unpackhi_epi64(m3, m1); \
- } while (0)
-
-#define LOAD_MSG_2_1(b0, b1) \
- do { \
- b0 = _mm_alignr_epi8(m6, m5, 8); \
- b1 = _mm_unpackhi_epi64(m2, m7); \
- } while (0)
-
-#define LOAD_MSG_2_2(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m4, m0); \
- b1 = _mm_blend_epi16(m1, m6, 0xF0); \
- } while (0)
-
-#define LOAD_MSG_2_3(b0, b1) \
- do { \
- b0 = _mm_blend_epi16(m5, m1, 0xF0); \
- b1 = _mm_unpackhi_epi64(m3, m4); \
- } while (0)
-
-#define LOAD_MSG_2_4(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m7, m3); \
- b1 = _mm_alignr_epi8(m2, m0, 8); \
- } while (0)
-
-#define LOAD_MSG_3_1(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m3, m1); \
- b1 = _mm_unpackhi_epi64(m6, m5); \
- } while (0)
-
-#define LOAD_MSG_3_2(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m4, m0); \
- b1 = _mm_unpacklo_epi64(m6, m7); \
- } while (0)
-
-#define LOAD_MSG_3_3(b0, b1) \
- do { \
- b0 = _mm_blend_epi16(m1, m2, 0xF0); \
- b1 = _mm_blend_epi16(m2, m7, 0xF0); \
- } while (0)
-
-#define LOAD_MSG_3_4(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m3, m5); \
- b1 = _mm_unpacklo_epi64(m0, m4); \
- } while (0)
-
-#define LOAD_MSG_4_1(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m4, m2); \
- b1 = _mm_unpacklo_epi64(m1, m5); \
- } while (0)
-
-#define LOAD_MSG_4_2(b0, b1) \
- do { \
- b0 = _mm_blend_epi16(m0, m3, 0xF0); \
- b1 = _mm_blend_epi16(m2, m7, 0xF0); \
- } while (0)
-
-#define LOAD_MSG_4_3(b0, b1) \
- do { \
- b0 = _mm_blend_epi16(m7, m5, 0xF0); \
- b1 = _mm_blend_epi16(m3, m1, 0xF0); \
- } while (0)
-
-#define LOAD_MSG_4_4(b0, b1) \
- do { \
- b0 = _mm_alignr_epi8(m6, m0, 8); \
- b1 = _mm_blend_epi16(m4, m6, 0xF0); \
- } while (0)
-
-#define LOAD_MSG_5_1(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m1, m3); \
- b1 = _mm_unpacklo_epi64(m0, m4); \
- } while (0)
-
-#define LOAD_MSG_5_2(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m6, m5); \
- b1 = _mm_unpackhi_epi64(m5, m1); \
- } while (0)
-
-#define LOAD_MSG_5_3(b0, b1) \
- do { \
- b0 = _mm_blend_epi16(m2, m3, 0xF0); \
- b1 = _mm_unpackhi_epi64(m7, m0); \
- } while (0)
-
-#define LOAD_MSG_5_4(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m6, m2); \
- b1 = _mm_blend_epi16(m7, m4, 0xF0); \
- } while (0)
-
-#define LOAD_MSG_6_1(b0, b1) \
- do { \
- b0 = _mm_blend_epi16(m6, m0, 0xF0); \
- b1 = _mm_unpacklo_epi64(m7, m2); \
- } while (0)
-
-#define LOAD_MSG_6_2(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m2, m7); \
- b1 = _mm_alignr_epi8(m5, m6, 8); \
- } while (0)
-
-#define LOAD_MSG_6_3(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m0, m3); \
- b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
- } while (0)
-
-#define LOAD_MSG_6_4(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m3, m1); \
- b1 = _mm_blend_epi16(m1, m5, 0xF0); \
- } while (0)
-
-#define LOAD_MSG_7_1(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m6, m3); \
- b1 = _mm_blend_epi16(m6, m1, 0xF0); \
- } while (0)
-
-#define LOAD_MSG_7_2(b0, b1) \
- do { \
- b0 = _mm_alignr_epi8(m7, m5, 8); \
- b1 = _mm_unpackhi_epi64(m0, m4); \
- } while (0)
-
-#define LOAD_MSG_7_3(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m2, m7); \
- b1 = _mm_unpacklo_epi64(m4, m1); \
- } while (0)
-
-#define LOAD_MSG_7_4(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m0, m2); \
- b1 = _mm_unpacklo_epi64(m3, m5); \
- } while (0)
-
-#define LOAD_MSG_8_1(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m3, m7); \
- b1 = _mm_alignr_epi8(m0, m5, 8); \
- } while (0)
-
-#define LOAD_MSG_8_2(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m7, m4); \
- b1 = _mm_alignr_epi8(m4, m1, 8); \
- } while (0)
-
-#define LOAD_MSG_8_3(b0, b1) \
- do { \
- b0 = m6; \
- b1 = _mm_alignr_epi8(m5, m0, 8); \
- } while (0)
-
-#define LOAD_MSG_8_4(b0, b1) \
- do { \
- b0 = _mm_blend_epi16(m1, m3, 0xF0); \
- b1 = m2; \
- } while (0)
-
-#define LOAD_MSG_9_1(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m5, m4); \
- b1 = _mm_unpackhi_epi64(m3, m0); \
- } while (0)
-
-#define LOAD_MSG_9_2(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m1, m2); \
- b1 = _mm_blend_epi16(m3, m2, 0xF0); \
- } while (0)
-
-#define LOAD_MSG_9_3(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m7, m4); \
- b1 = _mm_unpackhi_epi64(m1, m6); \
- } while (0)
-
-#define LOAD_MSG_9_4(b0, b1) \
- do { \
- b0 = _mm_alignr_epi8(m7, m5, 8); \
- b1 = _mm_unpacklo_epi64(m6, m0); \
- } while (0)
-
-#define LOAD_MSG_10_1(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m0, m1); \
- b1 = _mm_unpacklo_epi64(m2, m3); \
- } while (0)
-
-#define LOAD_MSG_10_2(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m0, m1); \
- b1 = _mm_unpackhi_epi64(m2, m3); \
- } while (0)
-
-#define LOAD_MSG_10_3(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m4, m5); \
- b1 = _mm_unpacklo_epi64(m6, m7); \
- } while (0)
-
-#define LOAD_MSG_10_4(b0, b1) \
- do { \
- b0 = _mm_unpackhi_epi64(m4, m5); \
- b1 = _mm_unpackhi_epi64(m6, m7); \
- } while (0)
-
-#define LOAD_MSG_11_1(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m7, m2); \
- b1 = _mm_unpackhi_epi64(m4, m6); \
- } while (0)
-
-#define LOAD_MSG_11_2(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m5, m4); \
- b1 = _mm_alignr_epi8(m3, m7, 8); \
- } while (0)
-
-#define LOAD_MSG_11_3(b0, b1) \
- do { \
- b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
- b1 = _mm_unpackhi_epi64(m5, m2); \
- } while (0)
-
-#define LOAD_MSG_11_4(b0, b1) \
- do { \
- b0 = _mm_unpacklo_epi64(m6, m1); \
- b1 = _mm_unpackhi_epi64(m3, m1); \
- } while (0)
+#pragma once
+
+/*
+ BLAKE2 reference source code package - optimized C implementations
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+
+#define LOAD_MSG_0_1(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m0, m1); \
+ b1 = _mm_unpacklo_epi64(m2, m3); \
+ } while (0)
+
+#define LOAD_MSG_0_2(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m0, m1); \
+ b1 = _mm_unpackhi_epi64(m2, m3); \
+ } while (0)
+
+#define LOAD_MSG_0_3(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m4, m5); \
+ b1 = _mm_unpacklo_epi64(m6, m7); \
+ } while (0)
+
+#define LOAD_MSG_0_4(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m4, m5); \
+ b1 = _mm_unpackhi_epi64(m6, m7); \
+ } while (0)
+
+#define LOAD_MSG_1_1(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m7, m2); \
+ b1 = _mm_unpackhi_epi64(m4, m6); \
+ } while (0)
+
+#define LOAD_MSG_1_2(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m5, m4); \
+ b1 = _mm_alignr_epi8(m3, m7, 8); \
+ } while (0)
+
+#define LOAD_MSG_1_3(b0, b1) \
+ do { \
+ b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
+ b1 = _mm_unpackhi_epi64(m5, m2); \
+ } while (0)
+
+#define LOAD_MSG_1_4(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m6, m1); \
+ b1 = _mm_unpackhi_epi64(m3, m1); \
+ } while (0)
+
+#define LOAD_MSG_2_1(b0, b1) \
+ do { \
+ b0 = _mm_alignr_epi8(m6, m5, 8); \
+ b1 = _mm_unpackhi_epi64(m2, m7); \
+ } while (0)
+
+#define LOAD_MSG_2_2(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m4, m0); \
+ b1 = _mm_blend_epi16(m1, m6, 0xF0); \
+ } while (0)
+
+#define LOAD_MSG_2_3(b0, b1) \
+ do { \
+ b0 = _mm_blend_epi16(m5, m1, 0xF0); \
+ b1 = _mm_unpackhi_epi64(m3, m4); \
+ } while (0)
+
+#define LOAD_MSG_2_4(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m7, m3); \
+ b1 = _mm_alignr_epi8(m2, m0, 8); \
+ } while (0)
+
+#define LOAD_MSG_3_1(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m3, m1); \
+ b1 = _mm_unpackhi_epi64(m6, m5); \
+ } while (0)
+
+#define LOAD_MSG_3_2(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m4, m0); \
+ b1 = _mm_unpacklo_epi64(m6, m7); \
+ } while (0)
+
+#define LOAD_MSG_3_3(b0, b1) \
+ do { \
+ b0 = _mm_blend_epi16(m1, m2, 0xF0); \
+ b1 = _mm_blend_epi16(m2, m7, 0xF0); \
+ } while (0)
+
+#define LOAD_MSG_3_4(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m3, m5); \
+ b1 = _mm_unpacklo_epi64(m0, m4); \
+ } while (0)
+
+#define LOAD_MSG_4_1(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m4, m2); \
+ b1 = _mm_unpacklo_epi64(m1, m5); \
+ } while (0)
+
+#define LOAD_MSG_4_2(b0, b1) \
+ do { \
+ b0 = _mm_blend_epi16(m0, m3, 0xF0); \
+ b1 = _mm_blend_epi16(m2, m7, 0xF0); \
+ } while (0)
+
+#define LOAD_MSG_4_3(b0, b1) \
+ do { \
+ b0 = _mm_blend_epi16(m7, m5, 0xF0); \
+ b1 = _mm_blend_epi16(m3, m1, 0xF0); \
+ } while (0)
+
+#define LOAD_MSG_4_4(b0, b1) \
+ do { \
+ b0 = _mm_alignr_epi8(m6, m0, 8); \
+ b1 = _mm_blend_epi16(m4, m6, 0xF0); \
+ } while (0)
+
+#define LOAD_MSG_5_1(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m1, m3); \
+ b1 = _mm_unpacklo_epi64(m0, m4); \
+ } while (0)
+
+#define LOAD_MSG_5_2(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m6, m5); \
+ b1 = _mm_unpackhi_epi64(m5, m1); \
+ } while (0)
+
+#define LOAD_MSG_5_3(b0, b1) \
+ do { \
+ b0 = _mm_blend_epi16(m2, m3, 0xF0); \
+ b1 = _mm_unpackhi_epi64(m7, m0); \
+ } while (0)
+
+#define LOAD_MSG_5_4(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m6, m2); \
+ b1 = _mm_blend_epi16(m7, m4, 0xF0); \
+ } while (0)
+
+#define LOAD_MSG_6_1(b0, b1) \
+ do { \
+ b0 = _mm_blend_epi16(m6, m0, 0xF0); \
+ b1 = _mm_unpacklo_epi64(m7, m2); \
+ } while (0)
+
+#define LOAD_MSG_6_2(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m2, m7); \
+ b1 = _mm_alignr_epi8(m5, m6, 8); \
+ } while (0)
+
+#define LOAD_MSG_6_3(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m0, m3); \
+ b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
+ } while (0)
+
+#define LOAD_MSG_6_4(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m3, m1); \
+ b1 = _mm_blend_epi16(m1, m5, 0xF0); \
+ } while (0)
+
+#define LOAD_MSG_7_1(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m6, m3); \
+ b1 = _mm_blend_epi16(m6, m1, 0xF0); \
+ } while (0)
+
+#define LOAD_MSG_7_2(b0, b1) \
+ do { \
+ b0 = _mm_alignr_epi8(m7, m5, 8); \
+ b1 = _mm_unpackhi_epi64(m0, m4); \
+ } while (0)
+
+#define LOAD_MSG_7_3(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m2, m7); \
+ b1 = _mm_unpacklo_epi64(m4, m1); \
+ } while (0)
+
+#define LOAD_MSG_7_4(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m0, m2); \
+ b1 = _mm_unpacklo_epi64(m3, m5); \
+ } while (0)
+
+#define LOAD_MSG_8_1(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m3, m7); \
+ b1 = _mm_alignr_epi8(m0, m5, 8); \
+ } while (0)
+
+#define LOAD_MSG_8_2(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m7, m4); \
+ b1 = _mm_alignr_epi8(m4, m1, 8); \
+ } while (0)
+
+#define LOAD_MSG_8_3(b0, b1) \
+ do { \
+ b0 = m6; \
+ b1 = _mm_alignr_epi8(m5, m0, 8); \
+ } while (0)
+
+#define LOAD_MSG_8_4(b0, b1) \
+ do { \
+ b0 = _mm_blend_epi16(m1, m3, 0xF0); \
+ b1 = m2; \
+ } while (0)
+
+#define LOAD_MSG_9_1(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m5, m4); \
+ b1 = _mm_unpackhi_epi64(m3, m0); \
+ } while (0)
+
+#define LOAD_MSG_9_2(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m1, m2); \
+ b1 = _mm_blend_epi16(m3, m2, 0xF0); \
+ } while (0)
+
+#define LOAD_MSG_9_3(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m7, m4); \
+ b1 = _mm_unpackhi_epi64(m1, m6); \
+ } while (0)
+
+#define LOAD_MSG_9_4(b0, b1) \
+ do { \
+ b0 = _mm_alignr_epi8(m7, m5, 8); \
+ b1 = _mm_unpacklo_epi64(m6, m0); \
+ } while (0)
+
+#define LOAD_MSG_10_1(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m0, m1); \
+ b1 = _mm_unpacklo_epi64(m2, m3); \
+ } while (0)
+
+#define LOAD_MSG_10_2(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m0, m1); \
+ b1 = _mm_unpackhi_epi64(m2, m3); \
+ } while (0)
+
+#define LOAD_MSG_10_3(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m4, m5); \
+ b1 = _mm_unpacklo_epi64(m6, m7); \
+ } while (0)
+
+#define LOAD_MSG_10_4(b0, b1) \
+ do { \
+ b0 = _mm_unpackhi_epi64(m4, m5); \
+ b1 = _mm_unpackhi_epi64(m6, m7); \
+ } while (0)
+
+#define LOAD_MSG_11_1(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m7, m2); \
+ b1 = _mm_unpackhi_epi64(m4, m6); \
+ } while (0)
+
+#define LOAD_MSG_11_2(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m5, m4); \
+ b1 = _mm_alignr_epi8(m3, m7, 8); \
+ } while (0)
+
+#define LOAD_MSG_11_3(b0, b1) \
+ do { \
+ b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
+ b1 = _mm_unpackhi_epi64(m5, m2); \
+ } while (0)
+
+#define LOAD_MSG_11_4(b0, b1) \
+ do { \
+ b0 = _mm_unpacklo_epi64(m6, m1); \
+ b1 = _mm_unpackhi_epi64(m3, m1); \
+ } while (0)
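
Each LOAD_MSG_r_i macro above hard-codes the Sigma gather for one quarter of round r, using unpack/blend/alignr shuffles on the preloaded registers m0..m7 instead of eight scalar loads. Round 0 is the identity permutation, so the first macro reduces to two unpacks; a self-contained sketch (illustrative names, w points at the 16 message words):

    #include <cstdint>
    #include <emmintrin.h>

    // With m0 = (w0, w1) ... m3 = (w6, w7), unpacklo_epi64 keeps the low lane
    // of each operand, so b0 = (w0, w2) and b1 = (w4, w6) -- Sigma[0][0,2,4,6].
    static inline void LoadMsg01Sketch(const uint64_t* w, __m128i& b0, __m128i& b1) {
        const __m128i m0 = _mm_loadu_si128((const __m128i*)(w + 0));
        const __m128i m1 = _mm_loadu_si128((const __m128i*)(w + 2));
        const __m128i m2 = _mm_loadu_si128((const __m128i*)(w + 4));
        const __m128i m3 = _mm_loadu_si128((const __m128i*)(w + 6));
        b0 = _mm_unpacklo_epi64(m0, m1);
        b1 = _mm_unpacklo_epi64(m2, m3);
    }
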
diff --git a/library/cpp/digest/argonish/internal/blake2b/ya.make b/library/cpp/digest/argonish/internal/blake2b/ya.make
index 0aa6806b31..1f6d903166 100644
--- a/library/cpp/digest/argonish/internal/blake2b/ya.make
+++ b/library/cpp/digest/argonish/internal/blake2b/ya.make
@@ -1,9 +1,9 @@
-LIBRARY()
-
-OWNER(e-sidorov)
-
+LIBRARY()
+
+OWNER(e-sidorov)
+
PEERDIR(
library/cpp/digest/argonish/internal/rotations
)
-
-END()
+
+END()
diff --git a/library/cpp/digest/argonish/internal/blamka/blamka_avx2.h b/library/cpp/digest/argonish/internal/blamka/blamka_avx2.h
index 02c506d6ff..bb701799c4 100644
--- a/library/cpp/digest/argonish/internal/blamka/blamka_avx2.h
+++ b/library/cpp/digest/argonish/internal/blamka/blamka_avx2.h
@@ -1,136 +1,136 @@
-#pragma once
-
-#include <immintrin.h>
+#pragma once
+
+#include <immintrin.h>
#include <library/cpp/digest/argonish/internal/rotations/rotations_avx2.h>
-
-namespace NArgonish {
- static inline void BlamkaG1AVX2(
- __m256i& a0, __m256i& a1, __m256i& b0, __m256i& b1,
- __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
- __m256i ml = _mm256_mul_epu32(a0, b0);
- ml = _mm256_add_epi64(ml, ml);
- a0 = _mm256_add_epi64(a0, _mm256_add_epi64(b0, ml));
- d0 = _mm256_xor_si256(d0, a0);
- d0 = Rotr32(d0);
-
- ml = _mm256_mul_epu32(c0, d0);
- ml = _mm256_add_epi64(ml, ml);
- c0 = _mm256_add_epi64(c0, _mm256_add_epi64(d0, ml));
-
- b0 = _mm256_xor_si256(b0, c0);
- b0 = Rotr24(b0);
-
- ml = _mm256_mul_epu32(a1, b1);
- ml = _mm256_add_epi64(ml, ml);
- a1 = _mm256_add_epi64(a1, _mm256_add_epi64(b1, ml));
- d1 = _mm256_xor_si256(d1, a1);
- d1 = Rotr32(d1);
-
- ml = _mm256_mul_epu32(c1, d1);
- ml = _mm256_add_epi64(ml, ml);
- c1 = _mm256_add_epi64(c1, _mm256_add_epi64(d1, ml));
-
- b1 = _mm256_xor_si256(b1, c1);
- b1 = Rotr24(b1);
- }
-
- static inline void BlamkaG2AVX2(
- __m256i& a0, __m256i& a1, __m256i& b0, __m256i& b1,
- __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
- __m256i ml = _mm256_mul_epu32(a0, b0);
- ml = _mm256_add_epi64(ml, ml);
- a0 = _mm256_add_epi64(a0, _mm256_add_epi64(b0, ml));
- d0 = _mm256_xor_si256(d0, a0);
- d0 = Rotr16(d0);
-
- ml = _mm256_mul_epu32(c0, d0);
- ml = _mm256_add_epi64(ml, ml);
- c0 = _mm256_add_epi64(c0, _mm256_add_epi64(d0, ml));
- b0 = _mm256_xor_si256(b0, c0);
- b0 = Rotr63(b0);
-
- ml = _mm256_mul_epu32(a1, b1);
- ml = _mm256_add_epi64(ml, ml);
- a1 = _mm256_add_epi64(a1, _mm256_add_epi64(b1, ml));
- d1 = _mm256_xor_si256(d1, a1);
- d1 = Rotr16(d1);
-
- ml = _mm256_mul_epu32(c1, d1);
- ml = _mm256_add_epi64(ml, ml);
- c1 = _mm256_add_epi64(c1, _mm256_add_epi64(d1, ml));
- b1 = _mm256_xor_si256(b1, c1);
- b1 = Rotr63(b1);
- }
-
- /* a = ( v0, v1, v2, v3) */
- /* b = ( v4, v5, v6, v7) */
- /* c = ( v8, v9, v10, v11) */
- /* d = (v12, v13, v14, v15) */
- static inline void DiagonalizeAVX21(
- __m256i& b0, __m256i& c0, __m256i& d0, __m256i& b1, __m256i& c1, __m256i& d1) {
- /* (v4, v5, v6, v7) -> (v5, v6, v7, v4) */
- b0 = _mm256_permute4x64_epi64(b0, _MM_SHUFFLE(0, 3, 2, 1));
- /* (v8, v9, v10, v11) -> (v10, v11, v8, v9) */
- c0 = _mm256_permute4x64_epi64(c0, _MM_SHUFFLE(1, 0, 3, 2));
- /* (v12, v13, v14, v15) -> (v15, v12, v13, v14) */
- d0 = _mm256_permute4x64_epi64(d0, _MM_SHUFFLE(2, 1, 0, 3));
-
- b1 = _mm256_permute4x64_epi64(b1, _MM_SHUFFLE(0, 3, 2, 1));
- c1 = _mm256_permute4x64_epi64(c1, _MM_SHUFFLE(1, 0, 3, 2));
- d1 = _mm256_permute4x64_epi64(d1, _MM_SHUFFLE(2, 1, 0, 3));
- }
-
- static inline void DiagonalizeAVX22(
- __m256i& b0, __m256i& b1, __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
- /* (v4, v5, v6, v7) -> (v5, v6, v7, v4) */
- __m256i tmp1 = _mm256_blend_epi32(b0, b1, 0b11001100); /* v4v7 */
- __m256i tmp2 = _mm256_blend_epi32(b0, b1, 0b00110011); /* v6v5 */
- b1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); /* v7v4 */
- b0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); /* v5v6 */
-
- /* (v8, v9, v10, v11) -> (v10, v11, v8, v9) */
- tmp1 = c0;
- c0 = c1;
- c1 = tmp1;
-
- /* (v12, v13, v14, v15) -> (v15, v12, v13, v14) */
- tmp1 = _mm256_blend_epi32(d0, d1, 0b11001100); /* v12v15 */
- tmp2 = _mm256_blend_epi32(d0, d1, 0b00110011); /* v14v13 */
- d0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); /* v15v12 */
- d1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); /* v13v14 */
- }
-
- static inline void UndiagonalizeAVX21(
- __m256i& b0, __m256i& c0, __m256i& d0, __m256i& b1, __m256i& c1, __m256i& d1) {
- /* (v5, v6, v7, v4) -> (v4, v5, v6, v7) */
- b0 = _mm256_permute4x64_epi64(b0, _MM_SHUFFLE(2, 1, 0, 3));
- /* (v10, v11, v8, v9) -> (v8, v9, v10, v11) */
- c0 = _mm256_permute4x64_epi64(c0, _MM_SHUFFLE(1, 0, 3, 2));
- /* (v15, v12, v13, v14) -> (v12, v13, v14, v15) */
- d0 = _mm256_permute4x64_epi64(d0, _MM_SHUFFLE(0, 3, 2, 1));
-
- b1 = _mm256_permute4x64_epi64(b1, _MM_SHUFFLE(2, 1, 0, 3));
- c1 = _mm256_permute4x64_epi64(c1, _MM_SHUFFLE(1, 0, 3, 2));
- d1 = _mm256_permute4x64_epi64(d1, _MM_SHUFFLE(0, 3, 2, 1));
- }
-
- static inline void UndiagonalizeAVX22(
- __m256i& b0, __m256i& b1, __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
- /* (v5, v6, v7, v4) -> (v4, v5, v6, v7) */
- __m256i tmp1 = _mm256_blend_epi32(b0, b1, 0b11001100); /* v5v4 */
- __m256i tmp2 = _mm256_blend_epi32(b0, b1, 0b00110011); /* v7v6 */
- b0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); /* v4v5 */
- b1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); /* v6v7 */
-
- /* (v10,v11,v8,v9) -> (v8,v9,v10,v11) */
- tmp1 = c0;
- c0 = c1;
- c1 = tmp1;
-
- /* (v15,v12,v13,v14) -> (v12,v13,v14,v15) */
- tmp1 = _mm256_blend_epi32(d0, d1, 0b00110011); /* v13v12 */
- tmp2 = _mm256_blend_epi32(d0, d1, 0b11001100); /* v15v14 */
- d0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1));
- d1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1));
- }
-}
+
+namespace NArgonish {
+ static inline void BlamkaG1AVX2(
+ __m256i& a0, __m256i& a1, __m256i& b0, __m256i& b1,
+ __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
+ __m256i ml = _mm256_mul_epu32(a0, b0);
+ ml = _mm256_add_epi64(ml, ml);
+ a0 = _mm256_add_epi64(a0, _mm256_add_epi64(b0, ml));
+ d0 = _mm256_xor_si256(d0, a0);
+ d0 = Rotr32(d0);
+
+ ml = _mm256_mul_epu32(c0, d0);
+ ml = _mm256_add_epi64(ml, ml);
+ c0 = _mm256_add_epi64(c0, _mm256_add_epi64(d0, ml));
+
+ b0 = _mm256_xor_si256(b0, c0);
+ b0 = Rotr24(b0);
+
+ ml = _mm256_mul_epu32(a1, b1);
+ ml = _mm256_add_epi64(ml, ml);
+ a1 = _mm256_add_epi64(a1, _mm256_add_epi64(b1, ml));
+ d1 = _mm256_xor_si256(d1, a1);
+ d1 = Rotr32(d1);
+
+ ml = _mm256_mul_epu32(c1, d1);
+ ml = _mm256_add_epi64(ml, ml);
+ c1 = _mm256_add_epi64(c1, _mm256_add_epi64(d1, ml));
+
+ b1 = _mm256_xor_si256(b1, c1);
+ b1 = Rotr24(b1);
+ }
+
+ static inline void BlamkaG2AVX2(
+ __m256i& a0, __m256i& a1, __m256i& b0, __m256i& b1,
+ __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
+ __m256i ml = _mm256_mul_epu32(a0, b0);
+ ml = _mm256_add_epi64(ml, ml);
+ a0 = _mm256_add_epi64(a0, _mm256_add_epi64(b0, ml));
+ d0 = _mm256_xor_si256(d0, a0);
+ d0 = Rotr16(d0);
+
+ ml = _mm256_mul_epu32(c0, d0);
+ ml = _mm256_add_epi64(ml, ml);
+ c0 = _mm256_add_epi64(c0, _mm256_add_epi64(d0, ml));
+ b0 = _mm256_xor_si256(b0, c0);
+ b0 = Rotr63(b0);
+
+ ml = _mm256_mul_epu32(a1, b1);
+ ml = _mm256_add_epi64(ml, ml);
+ a1 = _mm256_add_epi64(a1, _mm256_add_epi64(b1, ml));
+ d1 = _mm256_xor_si256(d1, a1);
+ d1 = Rotr16(d1);
+
+ ml = _mm256_mul_epu32(c1, d1);
+ ml = _mm256_add_epi64(ml, ml);
+ c1 = _mm256_add_epi64(c1, _mm256_add_epi64(d1, ml));
+ b1 = _mm256_xor_si256(b1, c1);
+ b1 = Rotr63(b1);
+ }
+
+ /* a = ( v0, v1, v2, v3) */
+ /* b = ( v4, v5, v6, v7) */
+ /* c = ( v8, v9, v10, v11) */
+ /* d = (v12, v13, v14, v15) */
+ static inline void DiagonalizeAVX21(
+ __m256i& b0, __m256i& c0, __m256i& d0, __m256i& b1, __m256i& c1, __m256i& d1) {
+ /* (v4, v5, v6, v7) -> (v5, v6, v7, v4) */
+ b0 = _mm256_permute4x64_epi64(b0, _MM_SHUFFLE(0, 3, 2, 1));
+ /* (v8, v9, v10, v11) -> (v10, v11, v8, v9) */
+ c0 = _mm256_permute4x64_epi64(c0, _MM_SHUFFLE(1, 0, 3, 2));
+ /* (v12, v13, v14, v15) -> (v15, v12, v13, v14) */
+ d0 = _mm256_permute4x64_epi64(d0, _MM_SHUFFLE(2, 1, 0, 3));
+
+ b1 = _mm256_permute4x64_epi64(b1, _MM_SHUFFLE(0, 3, 2, 1));
+ c1 = _mm256_permute4x64_epi64(c1, _MM_SHUFFLE(1, 0, 3, 2));
+ d1 = _mm256_permute4x64_epi64(d1, _MM_SHUFFLE(2, 1, 0, 3));
+ }
+
+ static inline void DiagonalizeAVX22(
+ __m256i& b0, __m256i& b1, __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
+ /* (v4, v5, v6, v7) -> (v5, v6, v7, v4) */
+ __m256i tmp1 = _mm256_blend_epi32(b0, b1, 0b11001100); /* v4v7 */
+ __m256i tmp2 = _mm256_blend_epi32(b0, b1, 0b00110011); /* v6v5 */
+ b1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); /* v7v4 */
+ b0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); /* v5v6 */
+
+ /* (v8, v9, v10, v11) -> (v10, v11, v8, v9) */
+ tmp1 = c0;
+ c0 = c1;
+ c1 = tmp1;
+
+ /* (v12, v13, v14, v15) -> (v15, v12, v13, v14) */
+ tmp1 = _mm256_blend_epi32(d0, d1, 0b11001100); /* v12v15 */
+ tmp2 = _mm256_blend_epi32(d0, d1, 0b00110011); /* v14v13 */
+ d0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); /* v15v12 */
+ d1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); /* v13v14 */
+ }
+
+ static inline void UndiagonalizeAVX21(
+ __m256i& b0, __m256i& c0, __m256i& d0, __m256i& b1, __m256i& c1, __m256i& d1) {
+ /* (v5, v6, v7, v4) -> (v4, v5, v6, v7) */
+ b0 = _mm256_permute4x64_epi64(b0, _MM_SHUFFLE(2, 1, 0, 3));
+ /* (v10, v11, v8, v9) -> (v8, v9, v10, v11) */
+ c0 = _mm256_permute4x64_epi64(c0, _MM_SHUFFLE(1, 0, 3, 2));
+ /* (v15, v12, v13, v14) -> (v12, v13, v14, v15) */
+ d0 = _mm256_permute4x64_epi64(d0, _MM_SHUFFLE(0, 3, 2, 1));
+
+ b1 = _mm256_permute4x64_epi64(b1, _MM_SHUFFLE(2, 1, 0, 3));
+ c1 = _mm256_permute4x64_epi64(c1, _MM_SHUFFLE(1, 0, 3, 2));
+ d1 = _mm256_permute4x64_epi64(d1, _MM_SHUFFLE(0, 3, 2, 1));
+ }
+
+ static inline void UndiagonalizeAVX22(
+ __m256i& b0, __m256i& b1, __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
+ /* (v5, v6, v7, v4) -> (v4, v5, v6, v7) */
+ __m256i tmp1 = _mm256_blend_epi32(b0, b1, 0b11001100); /* v5v4 */
+ __m256i tmp2 = _mm256_blend_epi32(b0, b1, 0b00110011); /* v7v6 */
+ b0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); /* v4v5 */
+ b1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); /* v6v7 */
+
+ /* (v10,v11,v8,v9) -> (v8,v9,v10,v11) */
+ tmp1 = c0;
+ c0 = c1;
+ c1 = tmp1;
+
+ /* (v15,v12,v13,v14) -> (v12,v13,v14,v15) */
+ tmp1 = _mm256_blend_epi32(d0, d1, 0b00110011); /* v13v12 */
+ tmp2 = _mm256_blend_epi32(d0, d1, 0b11001100); /* v15v14 */
+ d0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1));
+ d1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1));
+ }
+}
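
BlamkaG1AVX2/BlamkaG2AVX2 implement Argon2's BlaMka variant of the BLAKE2b G function: the plain addition a = a + b becomes a = a + b + 2 * lo32(a) * lo32(b), and _mm256_mul_epu32 computes exactly that low-32-bit product in every 64-bit lane. A scalar sketch of the primitive (illustrative name):

    #include <cstdint>

    // Argon2's multiplication-hardened addition: x + y + 2 * lsw(x) * lsw(y),
    // where lsw() takes the least significant 32 bits. The `ml + ml` in the
    // vector code above is the doubling.
    static inline uint64_t FBlaMka(uint64_t x, uint64_t y) {
        const uint64_t xy = (x & 0xFFFFFFFFULL) * (y & 0xFFFFFFFFULL);
        return x + y + 2 * xy;
    }
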
diff --git a/library/cpp/digest/argonish/internal/blamka/blamka_sse2.h b/library/cpp/digest/argonish/internal/blamka/blamka_sse2.h
index 1b55651b34..b46fc7624a 100644
--- a/library/cpp/digest/argonish/internal/blamka/blamka_sse2.h
+++ b/library/cpp/digest/argonish/internal/blamka/blamka_sse2.h
@@ -1,95 +1,95 @@
-#pragma once
-
+#pragma once
+
#include <library/cpp/digest/argonish/internal/rotations/rotations_sse2.h>
-
-namespace NArgonish {
- static inline void BlamkaG1SSE2(
- __m128i& a0, __m128i& a1, __m128i& b0, __m128i& b1,
- __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
- __m128i ml = _mm_mul_epu32(a0, b0);
- ml = _mm_add_epi64(ml, ml);
- a0 = _mm_add_epi64(a0, _mm_add_epi64(b0, ml));
-
- ml = _mm_mul_epu32(a1, b1);
- ml = _mm_add_epi64(ml, ml);
- a1 = _mm_add_epi64(a1, _mm_add_epi64(b1, ml));
-
- d0 = _mm_xor_si128(d0, a0);
- d1 = _mm_xor_si128(d1, a1);
-
- d0 = Rotr32(d0);
- d1 = Rotr32(d1);
-
- ml = _mm_mul_epu32(c0, d0);
- ml = _mm_add_epi64(ml, ml);
- c0 = _mm_add_epi64(c0, _mm_add_epi64(d0, ml));
-
- ml = _mm_mul_epu32(c1, d1);
- ml = _mm_add_epi64(ml, ml);
- c1 = _mm_add_epi64(c1, _mm_add_epi64(ml, d1));
-
- b0 = _mm_xor_si128(b0, c0);
- b1 = _mm_xor_si128(b1, c1);
-
- b0 = Rotr24(b0);
- b1 = Rotr24(b1);
- }
-
- static inline void BlamkaG2SSE2(
- __m128i& a0, __m128i& a1, __m128i& b0, __m128i& b1,
- __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
- __m128i ml = _mm_mul_epu32(a0, b0);
- ml = _mm_add_epi64(ml, ml);
- a0 = _mm_add_epi64(a0, _mm_add_epi64(b0, ml));
-
- ml = _mm_mul_epu32(a1, b1);
- ml = _mm_add_epi64(ml, ml);
- a1 = _mm_add_epi64(a1, _mm_add_epi64(b1, ml));
-
- d0 = _mm_xor_si128(d0, a0);
- d1 = _mm_xor_si128(d1, a1);
-
- d0 = Rotr16(d0);
- d1 = Rotr16(d1);
-
- ml = _mm_mul_epu32(c0, d0);
- ml = _mm_add_epi64(ml, ml);
- c0 = _mm_add_epi64(c0, _mm_add_epi64(d0, ml));
-
- ml = _mm_mul_epu32(c1, d1);
- ml = _mm_add_epi64(ml, ml);
- c1 = _mm_add_epi64(c1, _mm_add_epi64(ml, d1));
-
- b0 = _mm_xor_si128(b0, c0);
- b1 = _mm_xor_si128(b1, c1);
-
- b0 = Rotr63(b0);
- b1 = Rotr63(b1);
- }
-
- static inline void DiagonalizeSSE2(
- __m128i& b0, __m128i& b1, __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
- __m128i tmp0 = d0;
- __m128i tmp1 = b0;
- d0 = c0;
- c0 = c1;
- c1 = d0;
- d0 = _mm_unpackhi_epi64(d1, _mm_unpacklo_epi64(tmp0, tmp0));
- d1 = _mm_unpackhi_epi64(tmp0, _mm_unpacklo_epi64(d1, d1));
- b0 = _mm_unpackhi_epi64(b0, _mm_unpacklo_epi64(b1, b1));
- b1 = _mm_unpackhi_epi64(b1, _mm_unpacklo_epi64(tmp1, tmp1));
- }
-
- static inline void UndiagonalizeSSE2(
- __m128i& b0, __m128i& b1, __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
- __m128i tmp0 = c0;
- c0 = c1;
- c1 = tmp0;
- tmp0 = b0;
- __m128i tmp1 = d0;
- b0 = _mm_unpackhi_epi64(b1, _mm_unpacklo_epi64(b0, b0));
- b1 = _mm_unpackhi_epi64(tmp0, _mm_unpacklo_epi64(b1, b1));
- d0 = _mm_unpackhi_epi64(d0, _mm_unpacklo_epi64(d1, d1));
- d1 = _mm_unpackhi_epi64(d1, _mm_unpacklo_epi64(tmp1, tmp1));
- }
-}
+
+namespace NArgonish {
+ static inline void BlamkaG1SSE2(
+ __m128i& a0, __m128i& a1, __m128i& b0, __m128i& b1,
+ __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
+ __m128i ml = _mm_mul_epu32(a0, b0);
+ ml = _mm_add_epi64(ml, ml);
+ a0 = _mm_add_epi64(a0, _mm_add_epi64(b0, ml));
+
+ ml = _mm_mul_epu32(a1, b1);
+ ml = _mm_add_epi64(ml, ml);
+ a1 = _mm_add_epi64(a1, _mm_add_epi64(b1, ml));
+
+ d0 = _mm_xor_si128(d0, a0);
+ d1 = _mm_xor_si128(d1, a1);
+
+ d0 = Rotr32(d0);
+ d1 = Rotr32(d1);
+
+ ml = _mm_mul_epu32(c0, d0);
+ ml = _mm_add_epi64(ml, ml);
+ c0 = _mm_add_epi64(c0, _mm_add_epi64(d0, ml));
+
+ ml = _mm_mul_epu32(c1, d1);
+ ml = _mm_add_epi64(ml, ml);
+ c1 = _mm_add_epi64(c1, _mm_add_epi64(ml, d1));
+
+ b0 = _mm_xor_si128(b0, c0);
+ b1 = _mm_xor_si128(b1, c1);
+
+ b0 = Rotr24(b0);
+ b1 = Rotr24(b1);
+ }
+
+ static inline void BlamkaG2SSE2(
+ __m128i& a0, __m128i& a1, __m128i& b0, __m128i& b1,
+ __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
+ __m128i ml = _mm_mul_epu32(a0, b0);
+ ml = _mm_add_epi64(ml, ml);
+ a0 = _mm_add_epi64(a0, _mm_add_epi64(b0, ml));
+
+ ml = _mm_mul_epu32(a1, b1);
+ ml = _mm_add_epi64(ml, ml);
+ a1 = _mm_add_epi64(a1, _mm_add_epi64(b1, ml));
+
+ d0 = _mm_xor_si128(d0, a0);
+ d1 = _mm_xor_si128(d1, a1);
+
+ d0 = Rotr16(d0);
+ d1 = Rotr16(d1);
+
+ ml = _mm_mul_epu32(c0, d0);
+ ml = _mm_add_epi64(ml, ml);
+ c0 = _mm_add_epi64(c0, _mm_add_epi64(d0, ml));
+
+ ml = _mm_mul_epu32(c1, d1);
+ ml = _mm_add_epi64(ml, ml);
+ c1 = _mm_add_epi64(c1, _mm_add_epi64(ml, d1));
+
+ b0 = _mm_xor_si128(b0, c0);
+ b1 = _mm_xor_si128(b1, c1);
+
+ b0 = Rotr63(b0);
+ b1 = Rotr63(b1);
+ }
+
+ static inline void DiagonalizeSSE2(
+ __m128i& b0, __m128i& b1, __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
+ __m128i tmp0 = d0;
+ __m128i tmp1 = b0;
+ d0 = c0;
+ c0 = c1;
+ c1 = d0;
+ d0 = _mm_unpackhi_epi64(d1, _mm_unpacklo_epi64(tmp0, tmp0));
+ d1 = _mm_unpackhi_epi64(tmp0, _mm_unpacklo_epi64(d1, d1));
+ b0 = _mm_unpackhi_epi64(b0, _mm_unpacklo_epi64(b1, b1));
+ b1 = _mm_unpackhi_epi64(b1, _mm_unpacklo_epi64(tmp1, tmp1));
+ }
+
+ static inline void UndiagonalizeSSE2(
+ __m128i& b0, __m128i& b1, __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
+ __m128i tmp0 = c0;
+ c0 = c1;
+ c1 = tmp0;
+ tmp0 = b0;
+ __m128i tmp1 = d0;
+ b0 = _mm_unpackhi_epi64(b1, _mm_unpacklo_epi64(b0, b0));
+ b1 = _mm_unpackhi_epi64(tmp0, _mm_unpacklo_epi64(b1, b1));
+ d0 = _mm_unpackhi_epi64(d0, _mm_unpacklo_epi64(d1, d1));
+ d1 = _mm_unpackhi_epi64(d1, _mm_unpacklo_epi64(tmp1, tmp1));
+ }
+}
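
The SSE2 variant of the same permutation cannot use _mm_alignr_epi8 (an SSSE3 instruction), so Diagonalize/Undiagonalize fall back to unpackhi/unpacklo pairs, and the rotations come from rotations_sse2.h, which this patch does not show. A plausible sketch of such rotations, assuming the usual shift-and-or construction (not necessarily the library's exact helpers):

    #include <emmintrin.h>

    // Generic rotate-right of each 64-bit lane by a compile-time amount.
    template <int N>
    static inline __m128i RotrSse2(__m128i x) {
        return _mm_or_si128(_mm_srli_epi64(x, N), _mm_slli_epi64(x, 64 - N));
    }

    // Rotation by 32 is cheaper: swap the two 32-bit halves of each lane.
    static inline __m128i Rotr32Sse2(__m128i x) {
        return _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
    }
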
diff --git a/library/cpp/digest/argonish/internal/blamka/blamka_ssse3.h b/library/cpp/digest/argonish/internal/blamka/blamka_ssse3.h
index 46e8500cd6..a7bd0c9539 100644
--- a/library/cpp/digest/argonish/internal/blamka/blamka_ssse3.h
+++ b/library/cpp/digest/argonish/internal/blamka/blamka_ssse3.h
@@ -1,103 +1,103 @@
-#pragma once
-
+#pragma once
+
#include <library/cpp/digest/argonish/internal/rotations/rotations_ssse3.h>
-
-namespace NArgonish {
- static inline void BlamkaG1SSSE3(
- __m128i& a0, __m128i& a1, __m128i& b0, __m128i& b1,
- __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
- __m128i ml = _mm_mul_epu32(a0, b0);
- ml = _mm_add_epi64(ml, ml);
- a0 = _mm_add_epi64(a0, _mm_add_epi64(b0, ml));
-
- ml = _mm_mul_epu32(a1, b1);
- ml = _mm_add_epi64(ml, ml);
- a1 = _mm_add_epi64(a1, _mm_add_epi64(b1, ml));
-
- d0 = _mm_xor_si128(d0, a0);
- d1 = _mm_xor_si128(d1, a1);
-
- d0 = Rotr32(d0);
- d1 = Rotr32(d1);
-
- ml = _mm_mul_epu32(c0, d0);
- ml = _mm_add_epi64(ml, ml);
- c0 = _mm_add_epi64(c0, _mm_add_epi64(d0, ml));
-
- ml = _mm_mul_epu32(c1, d1);
- ml = _mm_add_epi64(ml, ml);
- c1 = _mm_add_epi64(c1, _mm_add_epi64(ml, d1));
-
- b0 = _mm_xor_si128(b0, c0);
- b1 = _mm_xor_si128(b1, c1);
-
- b0 = Rotr24(b0);
- b1 = Rotr24(b1);
- }
-
- static inline void BlamkaG2SSSE3(
- __m128i& a0, __m128i& a1, __m128i& b0, __m128i& b1,
- __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
- __m128i ml = _mm_mul_epu32(a0, b0);
- ml = _mm_add_epi64(ml, ml);
- a0 = _mm_add_epi64(a0, _mm_add_epi64(b0, ml));
-
- ml = _mm_mul_epu32(a1, b1);
- ml = _mm_add_epi64(ml, ml);
- a1 = _mm_add_epi64(a1, _mm_add_epi64(b1, ml));
-
- d0 = _mm_xor_si128(d0, a0);
- d1 = _mm_xor_si128(d1, a1);
-
- d0 = Rotr16(d0);
- d1 = Rotr16(d1);
-
- ml = _mm_mul_epu32(c0, d0);
- ml = _mm_add_epi64(ml, ml);
- c0 = _mm_add_epi64(c0, _mm_add_epi64(d0, ml));
-
- ml = _mm_mul_epu32(c1, d1);
- ml = _mm_add_epi64(ml, ml);
- c1 = _mm_add_epi64(c1, _mm_add_epi64(ml, d1));
-
- b0 = _mm_xor_si128(b0, c0);
- b1 = _mm_xor_si128(b1, c1);
-
- b0 = Rotr63(b0);
- b1 = Rotr63(b1);
- }
-
- static inline void DiagonalizeSSSE3(
- __m128i& b0, __m128i& b1, __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
- __m128i t0 = _mm_alignr_epi8(b1, b0, 8);
- __m128i t1 = _mm_alignr_epi8(b0, b1, 8);
- b0 = t0;
- b1 = t1;
-
- t0 = c0;
- c0 = c1;
- c1 = t0;
-
- t0 = _mm_alignr_epi8(d1, d0, 8);
- t1 = _mm_alignr_epi8(d0, d1, 8);
- d0 = t1;
- d1 = t0;
- }
-
- static inline void UndiagonalizeSSSE3(
- __m128i& b0, __m128i& b1, __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
- __m128i t0 = _mm_alignr_epi8(b0, b1, 8);
- __m128i t1 = _mm_alignr_epi8(b1, b0, 8);
- b0 = t0;
- b1 = t1;
-
- t0 = c0;
- c0 = c1;
- c1 = t0;
-
- t0 = _mm_alignr_epi8(d0, d1, 8);
- t1 = _mm_alignr_epi8(d1, d0, 8);
- d0 = t1;
- d1 = t0;
- }
-}
+
+namespace NArgonish {
+ static inline void BlamkaG1SSSE3(
+ __m128i& a0, __m128i& a1, __m128i& b0, __m128i& b1,
+ __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
+ __m128i ml = _mm_mul_epu32(a0, b0);
+ ml = _mm_add_epi64(ml, ml);
+ a0 = _mm_add_epi64(a0, _mm_add_epi64(b0, ml));
+
+ ml = _mm_mul_epu32(a1, b1);
+ ml = _mm_add_epi64(ml, ml);
+ a1 = _mm_add_epi64(a1, _mm_add_epi64(b1, ml));
+
+ d0 = _mm_xor_si128(d0, a0);
+ d1 = _mm_xor_si128(d1, a1);
+
+ d0 = Rotr32(d0);
+ d1 = Rotr32(d1);
+
+ ml = _mm_mul_epu32(c0, d0);
+ ml = _mm_add_epi64(ml, ml);
+ c0 = _mm_add_epi64(c0, _mm_add_epi64(d0, ml));
+
+ ml = _mm_mul_epu32(c1, d1);
+ ml = _mm_add_epi64(ml, ml);
+ c1 = _mm_add_epi64(c1, _mm_add_epi64(ml, d1));
+
+ b0 = _mm_xor_si128(b0, c0);
+ b1 = _mm_xor_si128(b1, c1);
+
+ b0 = Rotr24(b0);
+ b1 = Rotr24(b1);
+ }
+
+ static inline void BlamkaG2SSSE3(
+ __m128i& a0, __m128i& a1, __m128i& b0, __m128i& b1,
+ __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
+ __m128i ml = _mm_mul_epu32(a0, b0);
+ ml = _mm_add_epi64(ml, ml);
+ a0 = _mm_add_epi64(a0, _mm_add_epi64(b0, ml));
+
+ ml = _mm_mul_epu32(a1, b1);
+ ml = _mm_add_epi64(ml, ml);
+ a1 = _mm_add_epi64(a1, _mm_add_epi64(b1, ml));
+
+ d0 = _mm_xor_si128(d0, a0);
+ d1 = _mm_xor_si128(d1, a1);
+
+ d0 = Rotr16(d0);
+ d1 = Rotr16(d1);
+
+ ml = _mm_mul_epu32(c0, d0);
+ ml = _mm_add_epi64(ml, ml);
+ c0 = _mm_add_epi64(c0, _mm_add_epi64(d0, ml));
+
+ ml = _mm_mul_epu32(c1, d1);
+ ml = _mm_add_epi64(ml, ml);
+ c1 = _mm_add_epi64(c1, _mm_add_epi64(ml, d1));
+
+ b0 = _mm_xor_si128(b0, c0);
+ b1 = _mm_xor_si128(b1, c1);
+
+ b0 = Rotr63(b0);
+ b1 = Rotr63(b1);
+ }
+
+ static inline void DiagonalizeSSSE3(
+ __m128i& b0, __m128i& b1, __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
+ __m128i t0 = _mm_alignr_epi8(b1, b0, 8);
+ __m128i t1 = _mm_alignr_epi8(b0, b1, 8);
+ b0 = t0;
+ b1 = t1;
+
+ t0 = c0;
+ c0 = c1;
+ c1 = t0;
+
+ t0 = _mm_alignr_epi8(d1, d0, 8);
+ t1 = _mm_alignr_epi8(d0, d1, 8);
+ d0 = t1;
+ d1 = t0;
+ }
+
+ static inline void UndiagonalizeSSSE3(
+ __m128i& b0, __m128i& b1, __m128i& c0, __m128i& c1, __m128i& d0, __m128i& d1) {
+ __m128i t0 = _mm_alignr_epi8(b0, b1, 8);
+ __m128i t1 = _mm_alignr_epi8(b1, b0, 8);
+ b0 = t0;
+ b1 = t1;
+
+ t0 = c0;
+ c0 = c1;
+ c1 = t0;
+
+ t0 = _mm_alignr_epi8(d0, d1, 8);
+ t1 = _mm_alignr_epi8(d1, d0, 8);
+ d0 = t1;
+ d1 = t0;
+ }
+}
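
On SSSE3, _mm_alignr_epi8(hi, lo, 8) concatenates the two registers and extracts the middle 16 bytes, i.e. (lo.high64, hi.low64), so a register pair can rotate a logical four-lane row in two instructions. With b0 = (v4, v5) and b1 = (v6, v7), DiagonalizeSSSE3 above turns (v4, v5, v6, v7) into (v5, v6, v7, v4); a standalone sketch of that b-row step (illustrative name):

    #include <tmmintrin.h>

    static inline void RotateRowLeft(__m128i& b0, __m128i& b1) {
        const __m128i t0 = _mm_alignr_epi8(b1, b0, 8); // (v5, v6)
        const __m128i t1 = _mm_alignr_epi8(b0, b1, 8); // (v7, v4)
        b0 = t0;
        b1 = t1;
    }
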
diff --git a/library/cpp/digest/argonish/internal/blamka/ya.make b/library/cpp/digest/argonish/internal/blamka/ya.make
index 0aa6806b31..1f6d903166 100644
--- a/library/cpp/digest/argonish/internal/blamka/ya.make
+++ b/library/cpp/digest/argonish/internal/blamka/ya.make
@@ -1,9 +1,9 @@
-LIBRARY()
-
-OWNER(e-sidorov)
-
+LIBRARY()
+
+OWNER(e-sidorov)
+
PEERDIR(
library/cpp/digest/argonish/internal/rotations
)
-
-END()
+
+END()
diff --git a/library/cpp/digest/argonish/internal/proxies/avx2/proxy_avx2.cpp b/library/cpp/digest/argonish/internal/proxies/avx2/proxy_avx2.cpp
index c1cf004f58..8d320063f4 100644
--- a/library/cpp/digest/argonish/internal/proxies/avx2/proxy_avx2.cpp
+++ b/library/cpp/digest/argonish/internal/proxies/avx2/proxy_avx2.cpp
@@ -1,18 +1,18 @@
-//
-// Created by Evgeny Sidorov on 12/04/17.
-//
-
-#include "proxy_avx2.h"
+//
+// Created by Evgeny Sidorov on 12/04/17.
+//
+
+#include "proxy_avx2.h"
#include <library/cpp/digest/argonish/internal/argon2/argon2_base.h>
#include <library/cpp/digest/argonish/internal/argon2/argon2_avx2.h>
#include <library/cpp/digest/argonish/internal/blake2b/blake2b.h>
#include <library/cpp/digest/argonish/internal/blake2b/blake2b_avx2.h>
-
-#define ZEROUPPER _mm256_zeroupper();
-
-namespace NArgonish {
- ARGON2_PROXY_CLASS_IMPL(AVX2)
- BLAKE2B_PROXY_CLASS_IMPL(AVX2)
-}
-
-#undef ZEROUPPER
+
+#define ZEROUPPER _mm256_zeroupper();
+
+namespace NArgonish {
+ ARGON2_PROXY_CLASS_IMPL(AVX2)
+ BLAKE2B_PROXY_CLASS_IMPL(AVX2)
+}
+
+#undef ZEROUPPER
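
ZEROUPPER expands to _mm256_zeroupper() and is spliced into the proxy method bodies by the macros in proxy_macros.h (below), so every AVX2 entry point clears the upper YMM halves before returning; leaving them dirty triggers costly AVX-to-SSE state transitions on Intel CPUs. A sketch of the resulting pattern (illustrative, not the generated code verbatim):

    #include <immintrin.h>

    void HashAvx2Sketch(/* arguments elided */) {
        // ... 256-bit Argon2/BLAKE2B work on __m256i state ...
        _mm256_zeroupper(); // leave no dirty upper YMM state for SSE callers
    }
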
diff --git a/library/cpp/digest/argonish/internal/proxies/avx2/proxy_avx2.h b/library/cpp/digest/argonish/internal/proxies/avx2/proxy_avx2.h
index eec0094563..fca23250a2 100644
--- a/library/cpp/digest/argonish/internal/proxies/avx2/proxy_avx2.h
+++ b/library/cpp/digest/argonish/internal/proxies/avx2/proxy_avx2.h
@@ -1,11 +1,11 @@
-#pragma once
-
-#include <util/generic/yexception.h>
+#pragma once
+
+#include <util/generic/yexception.h>
#include <library/cpp/digest/argonish/argon2.h>
#include <library/cpp/digest/argonish/blake2b.h>
#include <library/cpp/digest/argonish/internal/proxies/macro/proxy_macros.h>
-
-namespace NArgonish {
- ARGON2_PROXY_CLASS_DECL(AVX2)
- BLAKE2B_PROXY_CLASS_DECL(AVX2)
-}
+
+namespace NArgonish {
+ ARGON2_PROXY_CLASS_DECL(AVX2)
+ BLAKE2B_PROXY_CLASS_DECL(AVX2)
+}
diff --git a/library/cpp/digest/argonish/internal/proxies/avx2/ya.make b/library/cpp/digest/argonish/internal/proxies/avx2/ya.make
index 53f814c48d..94ce211e06 100644
--- a/library/cpp/digest/argonish/internal/proxies/avx2/ya.make
+++ b/library/cpp/digest/argonish/internal/proxies/avx2/ya.make
@@ -1,18 +1,18 @@
-OWNER(e-sidorov)
-
-LIBRARY()
-
-NO_UTIL()
-
-IF (ARCH_X86_64 OR ARCH_I386)
- PEERDIR(
- library/cpp/digest/argonish/internal/proxies/macro
- library/cpp/digest/argonish/internal/argon2
- library/cpp/digest/argonish/internal/blake2b
- )
- SRC_CPP_AVX2(
- proxy_avx2.cpp
- )
-ENDIF()
-
-END()
+OWNER(e-sidorov)
+
+LIBRARY()
+
+NO_UTIL()
+
+IF (ARCH_X86_64 OR ARCH_I386)
+ PEERDIR(
+ library/cpp/digest/argonish/internal/proxies/macro
+ library/cpp/digest/argonish/internal/argon2
+ library/cpp/digest/argonish/internal/blake2b
+ )
+ SRC_CPP_AVX2(
+ proxy_avx2.cpp
+ )
+ENDIF()
+
+END()
diff --git a/library/cpp/digest/argonish/internal/proxies/macro/proxy_macros.h b/library/cpp/digest/argonish/internal/proxies/macro/proxy_macros.h
index 5ed5f53b4f..d9bddf55bd 100644
--- a/library/cpp/digest/argonish/internal/proxies/macro/proxy_macros.h
+++ b/library/cpp/digest/argonish/internal/proxies/macro/proxy_macros.h
@@ -1,194 +1,196 @@
-#pragma once
-
-//
-// Created by Evgeny Sidorov on 12/04/17.
-//
-/**
- * ZEROUPPER macro is only used for AVX2 instruction set to clear up the upper half of YMM registers
- * It's done to avoid performance penalty when CPU switches to non-AVX2 code (according to Agner)
- * and the post at https://software.intel.com/en-us/articles/intel-avx-state-transitions-migrating-sse-code-to-avx
- */
-
-#define ARGON2_PROXY_CLASS_DECL(IS) \
- class TArgon2Proxy##IS final: public IArgon2Base { \
- public: \
- TArgon2Proxy##IS(EArgon2Type atype, ui32 tcost, ui32 mcost, ui32 threads, \
- const ui8* key = nullptr, ui32 keylen = 0); \
- virtual ~TArgon2Proxy##IS(); \
- \
- virtual void Hash(const ui8* pwd, ui32 pwdlen, const ui8* salt, ui32 saltlen, \
- ui8* out, ui32 outlen, const ui8* aad = nullptr, ui32 aadlen = 0) const override; \
- virtual bool Verify(const ui8* pwd, ui32 pwdlen, const ui8* salt, ui32 saltlen, \
- const ui8* hash, ui32 hashlen, const ui8* aad = nullptr, ui32 aadlen = 0) const override; \
- virtual void HashWithCustomMemory(ui8* memory, size_t mlen, const ui8* pwd, ui32 pwdlen, \
- const ui8* salt, ui32 saltlen, ui8* out, ui32 outlen, \
- const ui8* aad = nullptr, ui32 aadlen = 0) const override; \
- virtual bool VerifyWithCustomMemory(ui8* memory, size_t mlen, const ui8* pwd, ui32 pwdlen, \
- const ui8* salt, ui32 saltlen, const ui8* hash, ui32 hashlen, \
- const ui8* aad = nullptr, ui32 aadlen = 0) const override; \
- virtual size_t GetMemorySize() const override; \
- \
- protected: \
- THolder<IArgon2Base> argon2; \
- };
-
-#define ARGON2_INSTANCE_DECL(IS_val, mcost_val, threads_val) \
- if (mcost == mcost_val && threads == threads_val) { \
- argon2 = MakeHolder<TArgon2##IS_val<mcost_val, threads_val>>(atype, tcost, key, keylen); \
- return; \
- }
-
-#define ARGON2_PROXY_CLASS_IMPL(IS) \
- TArgon2Proxy##IS::TArgon2Proxy##IS(EArgon2Type atype, ui32 tcost, ui32 mcost, ui32 threads, \
- const ui8* key, ui32 keylen) { \
- if ((key == nullptr && keylen > 0) || keylen > ARGON2_SECRET_MAX_LENGTH) \
- ythrow yexception() << "key is null or keylen equals 0 or key is too long"; \
- \
- ARGON2_INSTANCE_DECL(IS, 1, 1) \
+#pragma once
+
+//
+// Created by Evgeny Sidorov on 12/04/17.
+//
+/**
+ * The ZEROUPPER macro is used only for the AVX2 instruction set, to clear the upper halves of the YMM registers.
+ * This is done to avoid the performance penalty incurred when the CPU switches to non-AVX2 code, per Agner Fog's
+ * optimization guides and the post at https://software.intel.com/en-us/articles/intel-avx-state-transitions-migrating-sse-code-to-avx
+ */
+
+#define ARGON2_PROXY_CLASS_DECL(IS) \
+ class TArgon2Proxy##IS final: public IArgon2Base { \
+ public: \
+ TArgon2Proxy##IS(EArgon2Type atype, ui32 tcost, ui32 mcost, ui32 threads, \
+ const ui8* key = nullptr, ui32 keylen = 0); \
+ virtual ~TArgon2Proxy##IS(); \
+ \
+ virtual void Hash(const ui8* pwd, ui32 pwdlen, const ui8* salt, ui32 saltlen, \
+ ui8* out, ui32 outlen, const ui8* aad = nullptr, ui32 aadlen = 0) const override; \
+ virtual bool Verify(const ui8* pwd, ui32 pwdlen, const ui8* salt, ui32 saltlen, \
+ const ui8* hash, ui32 hashlen, const ui8* aad = nullptr, ui32 aadlen = 0) const override; \
+ virtual void HashWithCustomMemory(ui8* memory, size_t mlen, const ui8* pwd, ui32 pwdlen, \
+ const ui8* salt, ui32 saltlen, ui8* out, ui32 outlen, \
+ const ui8* aad = nullptr, ui32 aadlen = 0) const override; \
+ virtual bool VerifyWithCustomMemory(ui8* memory, size_t mlen, const ui8* pwd, ui32 pwdlen, \
+ const ui8* salt, ui32 saltlen, const ui8* hash, ui32 hashlen, \
+ const ui8* aad = nullptr, ui32 aadlen = 0) const override; \
+ virtual size_t GetMemorySize() const override; \
+ \
+ protected: \
+ THolder<IArgon2Base> argon2; \
+ };
+
+#define ARGON2_INSTANCE_DECL(IS_val, mcost_val, threads_val) \
+ if (mcost == mcost_val && threads == threads_val) { \
+ argon2 = MakeHolder<TArgon2##IS_val<mcost_val, threads_val>>(atype, tcost, key, keylen); \
+ return; \
+ }
+
+#define ARGON2_PROXY_CLASS_IMPL(IS) \
+ TArgon2Proxy##IS::TArgon2Proxy##IS(EArgon2Type atype, ui32 tcost, ui32 mcost, ui32 threads, \
+ const ui8* key, ui32 keylen) { \
+ if ((key == nullptr && keylen > 0) || keylen > ARGON2_SECRET_MAX_LENGTH) \
+ ythrow yexception() << "key is null or keylen equals 0 or key is too long"; \
+ \
+ ARGON2_INSTANCE_DECL(IS, 1, 1) \
ARGON2_INSTANCE_DECL(IS, 8, 1) \
- ARGON2_INSTANCE_DECL(IS, 16, 1) \
- ARGON2_INSTANCE_DECL(IS, 32, 1) \
- ARGON2_INSTANCE_DECL(IS, 64, 1) \
+ ARGON2_INSTANCE_DECL(IS, 16, 1) \
+ ARGON2_INSTANCE_DECL(IS, 32, 1) \
+ ARGON2_INSTANCE_DECL(IS, 64, 1) \
ARGON2_INSTANCE_DECL(IS, 128, 1) \
ARGON2_INSTANCE_DECL(IS, 256, 1) \
- ARGON2_INSTANCE_DECL(IS, 512, 1) \
- ARGON2_INSTANCE_DECL(IS, 1024, 1) \
- ARGON2_INSTANCE_DECL(IS, 2048, 1) \
- ARGON2_INSTANCE_DECL(IS, 4096, 1) \
- ARGON2_INSTANCE_DECL(IS, 8192, 1) \
- ARGON2_INSTANCE_DECL(IS, 16384, 1) \
- ARGON2_INSTANCE_DECL(IS, 32768, 1) \
- ARGON2_INSTANCE_DECL(IS, 65536, 1) \
- ARGON2_INSTANCE_DECL(IS, 131072, 1) \
- ARGON2_INSTANCE_DECL(IS, 262144, 1) \
- ARGON2_INSTANCE_DECL(IS, 524288, 1) \
- ARGON2_INSTANCE_DECL(IS, 1048576, 1) \
- ARGON2_INSTANCE_DECL(IS, 1, 2) \
- ARGON2_INSTANCE_DECL(IS, 32, 2) \
- ARGON2_INSTANCE_DECL(IS, 64, 2) \
- ARGON2_INSTANCE_DECL(IS, 512, 2) \
- ARGON2_INSTANCE_DECL(IS, 1024, 2) \
- ARGON2_INSTANCE_DECL(IS, 2048, 2) \
- ARGON2_INSTANCE_DECL(IS, 4096, 2) \
- ARGON2_INSTANCE_DECL(IS, 8192, 2) \
- ARGON2_INSTANCE_DECL(IS, 16384, 2) \
- ARGON2_INSTANCE_DECL(IS, 32768, 2) \
- ARGON2_INSTANCE_DECL(IS, 65536, 2) \
- ARGON2_INSTANCE_DECL(IS, 131072, 2) \
- ARGON2_INSTANCE_DECL(IS, 262144, 2) \
- ARGON2_INSTANCE_DECL(IS, 524288, 2) \
- ARGON2_INSTANCE_DECL(IS, 1048576, 2) \
- ARGON2_INSTANCE_DECL(IS, 1, 4) \
- ARGON2_INSTANCE_DECL(IS, 32, 4) \
- ARGON2_INSTANCE_DECL(IS, 64, 4) \
- ARGON2_INSTANCE_DECL(IS, 512, 4) \
- ARGON2_INSTANCE_DECL(IS, 1024, 4) \
- ARGON2_INSTANCE_DECL(IS, 2048, 4) \
- ARGON2_INSTANCE_DECL(IS, 4096, 4) \
- ARGON2_INSTANCE_DECL(IS, 8192, 4) \
- ARGON2_INSTANCE_DECL(IS, 16384, 4) \
- ARGON2_INSTANCE_DECL(IS, 32768, 4) \
- ARGON2_INSTANCE_DECL(IS, 65536, 4) \
- ARGON2_INSTANCE_DECL(IS, 131072, 4) \
- ARGON2_INSTANCE_DECL(IS, 262144, 4) \
- ARGON2_INSTANCE_DECL(IS, 524288, 4) \
- ARGON2_INSTANCE_DECL(IS, 1048576, 4) \
- \
- ythrow yexception() << "These parameters are not supported. Please add the corresponding ARGON2_INSTANCE_DECL macro"; \
- } \
- \
- TArgon2Proxy##IS::~TArgon2Proxy##IS() { \
- } \
- \
- void TArgon2Proxy##IS::Hash(const ui8* pwd, ui32 pwdlen, const ui8* salt, ui32 saltlen, \
- ui8* out, ui32 outlen, const ui8* aad, ui32 aadlen) const { \
- if (saltlen < ARGON2_SALT_MIN_LEN) \
- ythrow yexception() << "salt is too short"; \
- if (outlen < ARGON2_MIN_OUTLEN) \
- ythrow yexception() << "output length is too short"; \
- \
- argon2->Hash(pwd, pwdlen, salt, saltlen, out, outlen, aad, aadlen); \
- ZEROUPPER \
- } \
- \
- bool TArgon2Proxy##IS::Verify(const ui8* pwd, ui32 pwdlen, const ui8* salt, ui32 saltlen, \
- const ui8* hash, ui32 hashlen, const ui8* aad, ui32 aadlen) const { \
- if (saltlen < ARGON2_SALT_MIN_LEN) \
- ythrow yexception() << "salt is too short"; \
- if (hashlen < ARGON2_MIN_OUTLEN) \
- ythrow yexception() << "hash length is too short"; \
- \
- return argon2->Verify(pwd, pwdlen, salt, saltlen, hash, hashlen, aad, aadlen); \
- ZEROUPPER \
- } \
- \
- void TArgon2Proxy##IS::HashWithCustomMemory(ui8* memory, size_t mlen, const ui8* pwd, ui32 pwdlen, \
- const ui8* salt, ui32 saltlen, ui8* out, ui32 outlen, \
- const ui8* aad, ui32 aadlen) const { \
- if (saltlen < ARGON2_SALT_MIN_LEN) \
- ythrow yexception() << "salt is too short"; \
- if (outlen < ARGON2_MIN_OUTLEN) \
- ythrow yexception() << "output length is too short"; \
- \
- argon2->HashWithCustomMemory(memory, mlen, pwd, pwdlen, salt, saltlen, out, outlen, aad, aadlen); \
- ZEROUPPER \
- } \
- \
- bool TArgon2Proxy##IS::VerifyWithCustomMemory(ui8* memory, size_t mlen, const ui8* pwd, ui32 pwdlen, \
- const ui8* salt, ui32 saltlen, const ui8* hash, ui32 hashlen, \
- const ui8* aad, ui32 aadlen) const { \
- if (saltlen < ARGON2_SALT_MIN_LEN) \
- ythrow yexception() << "salt is too short"; \
- if (hashlen < ARGON2_MIN_OUTLEN) \
- ythrow yexception() << "hash length is too short"; \
- \
- return argon2->VerifyWithCustomMemory(memory, mlen, pwd, pwdlen, salt, saltlen, hash, hashlen, aad, aadlen); \
- ZEROUPPER \
- } \
- \
- size_t TArgon2Proxy##IS::GetMemorySize() const { \
- return argon2->GetMemorySize(); \
- }
-
-#define BLAKE2B_PROXY_CLASS_DECL(IS) \
- class TBlake2BProxy##IS final: public IBlake2Base { \
- public: \
- TBlake2BProxy##IS(size_t outlen, const void* key = nullptr, size_t keylen = 0); \
- virtual void Update(ui32 in) override; \
- virtual void Update(const void* pin, size_t inlen) override; \
- virtual void Final(void* out, size_t outlen) override; \
- \
- protected: \
- THolder<IBlake2Base> blake2; \
- };
-
-#define BLAKE2B_PROXY_CLASS_IMPL(IS) \
- TBlake2BProxy##IS::TBlake2BProxy##IS(size_t outlen, const void* key, size_t keylen) { \
- if (!outlen || outlen > BLAKE2B_OUTBYTES) \
- ythrow yexception() << "outlen equals 0 or too long"; \
- \
- if (key == nullptr) { \
- blake2 = MakeHolder<TBlake2B<EInstructionSet::IS>>(outlen); \
- return; \
- } \
- \
- if (!key || !keylen || keylen > BLAKE2B_KEYBYTES) \
- ythrow yexception() << "key is null or too long"; \
- \
- blake2 = MakeHolder<TBlake2B<EInstructionSet::IS>>(outlen, key, keylen); \
- } \
- \
- void TBlake2BProxy##IS::Update(ui32 in) { \
- blake2->Update(in); \
- ZEROUPPER \
- } \
- \
- void TBlake2BProxy##IS::Update(const void* pin, size_t inlen) { \
- blake2->Update(pin, inlen); \
- ZEROUPPER \
- } \
- \
- void TBlake2BProxy##IS::Final(void* out, size_t outlen) { \
- blake2->Final(out, outlen); \
- ZEROUPPER \
- }
+ ARGON2_INSTANCE_DECL(IS, 512, 1) \
+ ARGON2_INSTANCE_DECL(IS, 1024, 1) \
+ ARGON2_INSTANCE_DECL(IS, 2048, 1) \
+ ARGON2_INSTANCE_DECL(IS, 4096, 1) \
+ ARGON2_INSTANCE_DECL(IS, 8192, 1) \
+ ARGON2_INSTANCE_DECL(IS, 16384, 1) \
+ ARGON2_INSTANCE_DECL(IS, 32768, 1) \
+ ARGON2_INSTANCE_DECL(IS, 65536, 1) \
+ ARGON2_INSTANCE_DECL(IS, 131072, 1) \
+ ARGON2_INSTANCE_DECL(IS, 262144, 1) \
+ ARGON2_INSTANCE_DECL(IS, 524288, 1) \
+ ARGON2_INSTANCE_DECL(IS, 1048576, 1) \
+ ARGON2_INSTANCE_DECL(IS, 1, 2) \
+ ARGON2_INSTANCE_DECL(IS, 32, 2) \
+ ARGON2_INSTANCE_DECL(IS, 64, 2) \
+ ARGON2_INSTANCE_DECL(IS, 512, 2) \
+ ARGON2_INSTANCE_DECL(IS, 1024, 2) \
+ ARGON2_INSTANCE_DECL(IS, 2048, 2) \
+ ARGON2_INSTANCE_DECL(IS, 4096, 2) \
+ ARGON2_INSTANCE_DECL(IS, 8192, 2) \
+ ARGON2_INSTANCE_DECL(IS, 16384, 2) \
+ ARGON2_INSTANCE_DECL(IS, 32768, 2) \
+ ARGON2_INSTANCE_DECL(IS, 65536, 2) \
+ ARGON2_INSTANCE_DECL(IS, 131072, 2) \
+ ARGON2_INSTANCE_DECL(IS, 262144, 2) \
+ ARGON2_INSTANCE_DECL(IS, 524288, 2) \
+ ARGON2_INSTANCE_DECL(IS, 1048576, 2) \
+ ARGON2_INSTANCE_DECL(IS, 1, 4) \
+ ARGON2_INSTANCE_DECL(IS, 32, 4) \
+ ARGON2_INSTANCE_DECL(IS, 64, 4) \
+ ARGON2_INSTANCE_DECL(IS, 512, 4) \
+ ARGON2_INSTANCE_DECL(IS, 1024, 4) \
+ ARGON2_INSTANCE_DECL(IS, 2048, 4) \
+ ARGON2_INSTANCE_DECL(IS, 4096, 4) \
+ ARGON2_INSTANCE_DECL(IS, 8192, 4) \
+ ARGON2_INSTANCE_DECL(IS, 16384, 4) \
+ ARGON2_INSTANCE_DECL(IS, 32768, 4) \
+ ARGON2_INSTANCE_DECL(IS, 65536, 4) \
+ ARGON2_INSTANCE_DECL(IS, 131072, 4) \
+ ARGON2_INSTANCE_DECL(IS, 262144, 4) \
+ ARGON2_INSTANCE_DECL(IS, 524288, 4) \
+ ARGON2_INSTANCE_DECL(IS, 1048576, 4) \
+ \
+ ythrow yexception() << "These parameters are not supported. Please add the corresponding ARGON2_INSTANCE_DECL macro"; \
+ } \
+ \
+ TArgon2Proxy##IS::~TArgon2Proxy##IS() { \
+ } \
+ \
+ void TArgon2Proxy##IS::Hash(const ui8* pwd, ui32 pwdlen, const ui8* salt, ui32 saltlen, \
+ ui8* out, ui32 outlen, const ui8* aad, ui32 aadlen) const { \
+ if (saltlen < ARGON2_SALT_MIN_LEN) \
+ ythrow yexception() << "salt is too short"; \
+ if (outlen < ARGON2_MIN_OUTLEN) \
+ ythrow yexception() << "output length is too short"; \
+ \
+ argon2->Hash(pwd, pwdlen, salt, saltlen, out, outlen, aad, aadlen); \
+ ZEROUPPER \
+ } \
+ \
+ bool TArgon2Proxy##IS::Verify(const ui8* pwd, ui32 pwdlen, const ui8* salt, ui32 saltlen, \
+ const ui8* hash, ui32 hashlen, const ui8* aad, ui32 aadlen) const { \
+ if (saltlen < ARGON2_SALT_MIN_LEN) \
+ ythrow yexception() << "salt is too short"; \
+ if (hashlen < ARGON2_MIN_OUTLEN) \
+ ythrow yexception() << "hash length is too short"; \
+ \
+        const bool result = argon2->Verify(pwd, pwdlen, salt, saltlen, hash, hashlen, aad, aadlen);                 \
+        ZEROUPPER                                                                                                   \
+        return result;                                                                                              \
+ } \
+ \
+ void TArgon2Proxy##IS::HashWithCustomMemory(ui8* memory, size_t mlen, const ui8* pwd, ui32 pwdlen, \
+ const ui8* salt, ui32 saltlen, ui8* out, ui32 outlen, \
+ const ui8* aad, ui32 aadlen) const { \
+ if (saltlen < ARGON2_SALT_MIN_LEN) \
+ ythrow yexception() << "salt is too short"; \
+ if (outlen < ARGON2_MIN_OUTLEN) \
+ ythrow yexception() << "output length is too short"; \
+ \
+ argon2->HashWithCustomMemory(memory, mlen, pwd, pwdlen, salt, saltlen, out, outlen, aad, aadlen); \
+ ZEROUPPER \
+ } \
+ \
+ bool TArgon2Proxy##IS::VerifyWithCustomMemory(ui8* memory, size_t mlen, const ui8* pwd, ui32 pwdlen, \
+ const ui8* salt, ui32 saltlen, const ui8* hash, ui32 hashlen, \
+ const ui8* aad, ui32 aadlen) const { \
+ if (saltlen < ARGON2_SALT_MIN_LEN) \
+ ythrow yexception() << "salt is too short"; \
+ if (hashlen < ARGON2_MIN_OUTLEN) \
+ ythrow yexception() << "hash length is too short"; \
+ \
+        const bool result = argon2->VerifyWithCustomMemory(memory, mlen, pwd, pwdlen, salt, saltlen, hash, hashlen, aad, aadlen); \
+        ZEROUPPER                                                                                                   \
+        return result;                                                                                              \
+ } \
+ \
+ size_t TArgon2Proxy##IS::GetMemorySize() const { \
+ return argon2->GetMemorySize(); \
+ }
+
+#define BLAKE2B_PROXY_CLASS_DECL(IS) \
+ class TBlake2BProxy##IS final: public IBlake2Base { \
+ public: \
+ TBlake2BProxy##IS(size_t outlen, const void* key = nullptr, size_t keylen = 0); \
+ virtual void Update(ui32 in) override; \
+ virtual void Update(const void* pin, size_t inlen) override; \
+ virtual void Final(void* out, size_t outlen) override; \
+ \
+ protected: \
+ THolder<IBlake2Base> blake2; \
+ };
+
+#define BLAKE2B_PROXY_CLASS_IMPL(IS) \
+ TBlake2BProxy##IS::TBlake2BProxy##IS(size_t outlen, const void* key, size_t keylen) { \
+ if (!outlen || outlen > BLAKE2B_OUTBYTES) \
+ ythrow yexception() << "outlen equals 0 or too long"; \
+ \
+ if (key == nullptr) { \
+ blake2 = MakeHolder<TBlake2B<EInstructionSet::IS>>(outlen); \
+ return; \
+ } \
+ \
+        if (!keylen || keylen > BLAKE2B_KEYBYTES)                                               \
+            ythrow yexception() << "keylen equals 0 or key is too long";                        \
+ \
+ blake2 = MakeHolder<TBlake2B<EInstructionSet::IS>>(outlen, key, keylen); \
+ } \
+ \
+ void TBlake2BProxy##IS::Update(ui32 in) { \
+ blake2->Update(in); \
+ ZEROUPPER \
+ } \
+ \
+ void TBlake2BProxy##IS::Update(const void* pin, size_t inlen) { \
+ blake2->Update(pin, inlen); \
+ ZEROUPPER \
+ } \
+ \
+ void TBlake2BProxy##IS::Final(void* out, size_t outlen) { \
+ blake2->Final(out, outlen); \
+ ZEROUPPER \
+ }
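
The macros above generate, per instruction set, a proxy whose constructor is a linear scan over ARGON2_INSTANCE_DECL entries: memory cost and thread count are template parameters of the underlying TArgon2##IS engine, so every supported (mcost, threads) pair is a distinct instantiation and anything else throws. For example, ARGON2_INSTANCE_DECL(AVX2, 4096, 2) expands inside TArgon2ProxyAVX2's constructor to:

    if (mcost == 4096 && threads == 2) {
        argon2 = MakeHolder<TArgon2AVX2<4096, 2>>(atype, tcost, key, keylen);
        return;
    }

Baking both costs into template arguments presumably lets the compiler treat block and lane geometry as compile-time constants, at the price of the long instantiation list above.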
diff --git a/library/cpp/digest/argonish/internal/proxies/macro/ya.make b/library/cpp/digest/argonish/internal/proxies/macro/ya.make
index 5f639d4571..b2b79b2b2a 100644
--- a/library/cpp/digest/argonish/internal/proxies/macro/ya.make
+++ b/library/cpp/digest/argonish/internal/proxies/macro/ya.make
@@ -1,5 +1,5 @@
-LIBRARY()
-
-OWNER(e-sidorov)
-
-END()
+LIBRARY()
+
+OWNER(e-sidorov)
+
+END()
diff --git a/library/cpp/digest/argonish/internal/proxies/ref/proxy_ref.cpp b/library/cpp/digest/argonish/internal/proxies/ref/proxy_ref.cpp
index 0bc51866fd..55832396be 100644
--- a/library/cpp/digest/argonish/internal/proxies/ref/proxy_ref.cpp
+++ b/library/cpp/digest/argonish/internal/proxies/ref/proxy_ref.cpp
@@ -1,20 +1,20 @@
-//
-// Created by Evgeny Sidorov on 12/04/17.
-//
-
-#include "proxy_ref.h"
+//
+// Created by Evgeny Sidorov on 12/04/17.
+//
+
+#include "proxy_ref.h"
#include <library/cpp/digest/argonish/internal/argon2/argon2_base.h>
#include <library/cpp/digest/argonish/internal/argon2/argon2_ref.h>
#include <library/cpp/digest/argonish/internal/blake2b/blake2b.h>
#include <library/cpp/digest/argonish/internal/blake2b/blake2b_ref.h>
-
-#include <stdexcept>
-
-#define ZEROUPPER ;
-
-namespace NArgonish {
- ARGON2_PROXY_CLASS_IMPL(REF)
- BLAKE2B_PROXY_CLASS_IMPL(REF)
-}
-
-#undef ZEROUPPER
+
+#include <stdexcept>
+
+#define ZEROUPPER ;
+
+namespace NArgonish {
+ ARGON2_PROXY_CLASS_IMPL(REF)
+ BLAKE2B_PROXY_CLASS_IMPL(REF)
+}
+
+#undef ZEROUPPER
diff --git a/library/cpp/digest/argonish/internal/proxies/ref/proxy_ref.h b/library/cpp/digest/argonish/internal/proxies/ref/proxy_ref.h
index 821abc50cd..c9217a986c 100644
--- a/library/cpp/digest/argonish/internal/proxies/ref/proxy_ref.h
+++ b/library/cpp/digest/argonish/internal/proxies/ref/proxy_ref.h
@@ -1,11 +1,11 @@
-#pragma once
-
-#include <util/generic/yexception.h>
+#pragma once
+
+#include <util/generic/yexception.h>
#include <library/cpp/digest/argonish/argon2.h>
#include <library/cpp/digest/argonish/blake2b.h>
#include <library/cpp/digest/argonish/internal/proxies/macro/proxy_macros.h>
-
-namespace NArgonish {
- ARGON2_PROXY_CLASS_DECL(REF)
- BLAKE2B_PROXY_CLASS_DECL(REF)
-}
+
+namespace NArgonish {
+ ARGON2_PROXY_CLASS_DECL(REF)
+ BLAKE2B_PROXY_CLASS_DECL(REF)
+}
diff --git a/library/cpp/digest/argonish/internal/proxies/ref/ya.make b/library/cpp/digest/argonish/internal/proxies/ref/ya.make
index 7a15f44611..08ac4bb77d 100644
--- a/library/cpp/digest/argonish/internal/proxies/ref/ya.make
+++ b/library/cpp/digest/argonish/internal/proxies/ref/ya.make
@@ -1,17 +1,17 @@
-OWNER(e-sidorov)
-
-LIBRARY()
-
-NO_UTIL()
-
-PEERDIR(
+OWNER(e-sidorov)
+
+LIBRARY()
+
+NO_UTIL()
+
+PEERDIR(
library/cpp/digest/argonish/internal/proxies/macro
library/cpp/digest/argonish/internal/argon2
library/cpp/digest/argonish/internal/blake2b
-)
-
-SRCS(
- proxy_ref.cpp
-)
-
-END()
+)
+
+SRCS(
+ proxy_ref.cpp
+)
+
+END()
diff --git a/library/cpp/digest/argonish/internal/proxies/sse2/proxy_sse2.cpp b/library/cpp/digest/argonish/internal/proxies/sse2/proxy_sse2.cpp
index 3e63c9ad62..d56396cee8 100644
--- a/library/cpp/digest/argonish/internal/proxies/sse2/proxy_sse2.cpp
+++ b/library/cpp/digest/argonish/internal/proxies/sse2/proxy_sse2.cpp
@@ -1,18 +1,18 @@
-//
-// Created by Evgeny Sidorov on 12/04/17.
-//
-
-#include "proxy_sse2.h"
+//
+// Created by Evgeny Sidorov on 12/04/17.
+//
+
+#include "proxy_sse2.h"
#include <library/cpp/digest/argonish/internal/argon2/argon2_base.h>
#include <library/cpp/digest/argonish/internal/argon2/argon2_sse2.h>
#include <library/cpp/digest/argonish/internal/blake2b/blake2b.h>
#include <library/cpp/digest/argonish/internal/blake2b/blake2b_sse2.h>
-
-#define ZEROUPPER ;
-
-namespace NArgonish {
- ARGON2_PROXY_CLASS_IMPL(SSE2)
- BLAKE2B_PROXY_CLASS_IMPL(SSE2)
-}
-
-#undef ZEROUPPER
+
+#define ZEROUPPER ;
+
+namespace NArgonish {
+ ARGON2_PROXY_CLASS_IMPL(SSE2)
+ BLAKE2B_PROXY_CLASS_IMPL(SSE2)
+}
+
+#undef ZEROUPPER
diff --git a/library/cpp/digest/argonish/internal/proxies/sse2/proxy_sse2.h b/library/cpp/digest/argonish/internal/proxies/sse2/proxy_sse2.h
index a2b74cd9a7..553b5797a8 100644
--- a/library/cpp/digest/argonish/internal/proxies/sse2/proxy_sse2.h
+++ b/library/cpp/digest/argonish/internal/proxies/sse2/proxy_sse2.h
@@ -1,11 +1,11 @@
-#pragma once
-
-#include <util/generic/yexception.h>
+#pragma once
+
+#include <util/generic/yexception.h>
#include <library/cpp/digest/argonish/argon2.h>
#include <library/cpp/digest/argonish/blake2b.h>
#include <library/cpp/digest/argonish/internal/proxies/macro/proxy_macros.h>
-
-namespace NArgonish {
- ARGON2_PROXY_CLASS_DECL(SSE2)
- BLAKE2B_PROXY_CLASS_DECL(SSE2)
-}
+
+namespace NArgonish {
+ ARGON2_PROXY_CLASS_DECL(SSE2)
+ BLAKE2B_PROXY_CLASS_DECL(SSE2)
+}
diff --git a/library/cpp/digest/argonish/internal/proxies/sse2/ya.make b/library/cpp/digest/argonish/internal/proxies/sse2/ya.make
index 1c752f0dd5..1529a982fa 100644
--- a/library/cpp/digest/argonish/internal/proxies/sse2/ya.make
+++ b/library/cpp/digest/argonish/internal/proxies/sse2/ya.make
@@ -1,18 +1,18 @@
-OWNER(e-sidorov)
-
-LIBRARY()
-
-NO_UTIL()
-
-IF (ARCH_X86_64 OR ARCH_I386)
- PEERDIR(
- library/cpp/digest/argonish/internal/proxies/macro
- library/cpp/digest/argonish/internal/argon2
- library/cpp/digest/argonish/internal/blake2b
- )
- SRC_CPP_SSE2(
- proxy_sse2.cpp
- )
-ENDIF()
-
-END()
+OWNER(e-sidorov)
+
+LIBRARY()
+
+NO_UTIL()
+
+IF (ARCH_X86_64 OR ARCH_I386)
+ PEERDIR(
+ library/cpp/digest/argonish/internal/proxies/macro
+ library/cpp/digest/argonish/internal/argon2
+ library/cpp/digest/argonish/internal/blake2b
+ )
+ SRC_CPP_SSE2(
+ proxy_sse2.cpp
+ )
+ENDIF()
+
+END()
diff --git a/library/cpp/digest/argonish/internal/proxies/sse41/proxy_sse41.cpp b/library/cpp/digest/argonish/internal/proxies/sse41/proxy_sse41.cpp
index b633ad8cbf..fe1b28bf24 100644
--- a/library/cpp/digest/argonish/internal/proxies/sse41/proxy_sse41.cpp
+++ b/library/cpp/digest/argonish/internal/proxies/sse41/proxy_sse41.cpp
@@ -1,18 +1,18 @@
-//
-// Created by Evgeny Sidorov on 12/04/17.
-//
-
-#include "proxy_sse41.h"
+//
+// Created by Evgeny Sidorov on 12/04/17.
+//
+
+#include "proxy_sse41.h"
#include <library/cpp/digest/argonish/internal/argon2/argon2_base.h>
#include <library/cpp/digest/argonish/internal/argon2/argon2_sse41.h>
#include <library/cpp/digest/argonish/internal/blake2b/blake2b.h>
#include <library/cpp/digest/argonish/internal/blake2b/blake2b_sse41.h>
-
-#define ZEROUPPER ;
-
-namespace NArgonish {
- ARGON2_PROXY_CLASS_IMPL(SSE41)
- BLAKE2B_PROXY_CLASS_IMPL(SSE41)
-}
-
-#undef ZEROUPPER
+
+#define ZEROUPPER ;
+
+namespace NArgonish {
+ ARGON2_PROXY_CLASS_IMPL(SSE41)
+ BLAKE2B_PROXY_CLASS_IMPL(SSE41)
+}
+
+#undef ZEROUPPER
diff --git a/library/cpp/digest/argonish/internal/proxies/sse41/proxy_sse41.h b/library/cpp/digest/argonish/internal/proxies/sse41/proxy_sse41.h
index 2a4b6614aa..c56f41750c 100644
--- a/library/cpp/digest/argonish/internal/proxies/sse41/proxy_sse41.h
+++ b/library/cpp/digest/argonish/internal/proxies/sse41/proxy_sse41.h
@@ -1,11 +1,11 @@
-#pragma once
-
-#include <util/generic/yexception.h>
+#pragma once
+
+#include <util/generic/yexception.h>
#include <library/cpp/digest/argonish/argon2.h>
#include <library/cpp/digest/argonish/blake2b.h>
#include <library/cpp/digest/argonish/internal/proxies/macro/proxy_macros.h>
-
-namespace NArgonish {
- ARGON2_PROXY_CLASS_DECL(SSE41)
- BLAKE2B_PROXY_CLASS_DECL(SSE41)
-}
+
+namespace NArgonish {
+ ARGON2_PROXY_CLASS_DECL(SSE41)
+ BLAKE2B_PROXY_CLASS_DECL(SSE41)
+}
diff --git a/library/cpp/digest/argonish/internal/proxies/sse41/ya.make b/library/cpp/digest/argonish/internal/proxies/sse41/ya.make
index 16a9922016..5da63f0bbf 100644
--- a/library/cpp/digest/argonish/internal/proxies/sse41/ya.make
+++ b/library/cpp/digest/argonish/internal/proxies/sse41/ya.make
@@ -1,18 +1,18 @@
-OWNER(e-sidorov)
-
-LIBRARY()
-
-NO_UTIL()
-
-IF (ARCH_X86_64 OR ARCH_I386)
- PEERDIR(
- library/cpp/digest/argonish/internal/proxies/macro
- library/cpp/digest/argonish/internal/argon2
- library/cpp/digest/argonish/internal/blake2b
- )
- SRC_CPP_SSE41(
- proxy_sse41.cpp
- )
-ENDIF()
-
-END()
+OWNER(e-sidorov)
+
+LIBRARY()
+
+NO_UTIL()
+
+IF (ARCH_X86_64 OR ARCH_I386)
+ PEERDIR(
+ library/cpp/digest/argonish/internal/proxies/macro
+ library/cpp/digest/argonish/internal/argon2
+ library/cpp/digest/argonish/internal/blake2b
+ )
+ SRC_CPP_SSE41(
+ proxy_sse41.cpp
+ )
+ENDIF()
+
+END()
diff --git a/library/cpp/digest/argonish/internal/proxies/ssse3/proxy_ssse3.cpp b/library/cpp/digest/argonish/internal/proxies/ssse3/proxy_ssse3.cpp
index d77b55737c..24b70e22d3 100644
--- a/library/cpp/digest/argonish/internal/proxies/ssse3/proxy_ssse3.cpp
+++ b/library/cpp/digest/argonish/internal/proxies/ssse3/proxy_ssse3.cpp
@@ -1,18 +1,18 @@
-//
-// Created by Evgeny Sidorov on 12/04/17.
-//
-
-#include "proxy_ssse3.h"
+//
+// Created by Evgeny Sidorov on 12/04/17.
+//
+
+#include "proxy_ssse3.h"
#include <library/cpp/digest/argonish/internal/argon2/argon2_base.h>
#include <library/cpp/digest/argonish/internal/argon2/argon2_ssse3.h>
#include <library/cpp/digest/argonish/internal/blake2b/blake2b.h>
#include <library/cpp/digest/argonish/internal/blake2b/blake2b_ssse3.h>
-
-#define ZEROUPPER ;
-
-namespace NArgonish {
- ARGON2_PROXY_CLASS_IMPL(SSSE3)
- BLAKE2B_PROXY_CLASS_IMPL(SSSE3)
-}
-
-#undef ZEROUPPER
+
+#define ZEROUPPER ;
+
+namespace NArgonish {
+ ARGON2_PROXY_CLASS_IMPL(SSSE3)
+ BLAKE2B_PROXY_CLASS_IMPL(SSSE3)
+}
+
+#undef ZEROUPPER
diff --git a/library/cpp/digest/argonish/internal/proxies/ssse3/proxy_ssse3.h b/library/cpp/digest/argonish/internal/proxies/ssse3/proxy_ssse3.h
index 994133e88e..93be69e3c6 100644
--- a/library/cpp/digest/argonish/internal/proxies/ssse3/proxy_ssse3.h
+++ b/library/cpp/digest/argonish/internal/proxies/ssse3/proxy_ssse3.h
@@ -1,11 +1,11 @@
-#pragma once
-
-#include <util/generic/yexception.h>
+#pragma once
+
+#include <util/generic/yexception.h>
#include <library/cpp/digest/argonish/argon2.h>
#include <library/cpp/digest/argonish/blake2b.h>
#include <library/cpp/digest/argonish/internal/proxies/macro/proxy_macros.h>
-
-namespace NArgonish {
- ARGON2_PROXY_CLASS_DECL(SSSE3)
- BLAKE2B_PROXY_CLASS_DECL(SSSE3)
-}
+
+namespace NArgonish {
+ ARGON2_PROXY_CLASS_DECL(SSSE3)
+ BLAKE2B_PROXY_CLASS_DECL(SSSE3)
+}
diff --git a/library/cpp/digest/argonish/internal/proxies/ssse3/ya.make b/library/cpp/digest/argonish/internal/proxies/ssse3/ya.make
index 82d5116559..e585a09fca 100644
--- a/library/cpp/digest/argonish/internal/proxies/ssse3/ya.make
+++ b/library/cpp/digest/argonish/internal/proxies/ssse3/ya.make
@@ -1,19 +1,19 @@
-LIBRARY()
-
-OWNER(e-sidorov)
-
-NO_UTIL()
-
-IF (ARCH_X86_64 OR ARCH_I386)
- PEERDIR(
- library/cpp/digest/argonish/internal/proxies/macro
- library/cpp/digest/argonish/internal/argon2
- library/cpp/digest/argonish/internal/blake2b
- )
-
- SRC_CPP_SSSE3(
- proxy_ssse3.cpp
- )
-ENDIF()
-
-END()
+LIBRARY()
+
+OWNER(e-sidorov)
+
+NO_UTIL()
+
+IF (ARCH_X86_64 OR ARCH_I386)
+ PEERDIR(
+ library/cpp/digest/argonish/internal/proxies/macro
+ library/cpp/digest/argonish/internal/argon2
+ library/cpp/digest/argonish/internal/blake2b
+ )
+
+ SRC_CPP_SSSE3(
+ proxy_ssse3.cpp
+ )
+ENDIF()
+
+END()
diff --git a/library/cpp/digest/argonish/internal/proxies/ya.make b/library/cpp/digest/argonish/internal/proxies/ya.make
index 62bb1bcc50..f7cceda5f0 100644
--- a/library/cpp/digest/argonish/internal/proxies/ya.make
+++ b/library/cpp/digest/argonish/internal/proxies/ya.make
@@ -1,8 +1,8 @@
-RECURSE(
- avx2
- ref
- sse2
- sse41
- ssse3
- macro
-)
+RECURSE(
+ avx2
+ ref
+ sse2
+ sse41
+ ssse3
+ macro
+)
diff --git a/library/cpp/digest/argonish/internal/rotations/rotations_avx2.h b/library/cpp/digest/argonish/internal/rotations/rotations_avx2.h
index 81cd171f59..6d1910d34c 100644
--- a/library/cpp/digest/argonish/internal/rotations/rotations_avx2.h
+++ b/library/cpp/digest/argonish/internal/rotations/rotations_avx2.h
@@ -1,30 +1,30 @@
-#pragma once
-
-#include <immintrin.h>
-
-namespace NArgonish {
- static inline void XorValues(__m256i* result, const __m256i* val1, const __m256i* val2) {
- _mm256_storeu_si256(result, _mm256_xor_si256(
- _mm256_loadu_si256(val1), _mm256_loadu_si256(val2)));
- }
-
- static inline __m256i Rotr32(__m256i x) {
- return _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
- }
-
- static inline __m256i Rotr24(__m256i x) {
- return _mm256_shuffle_epi8(x, _mm256_setr_epi8(
- 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10,
- 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10));
- }
-
- static inline __m256i Rotr16(__m256i x) {
- return _mm256_shuffle_epi8(x, _mm256_setr_epi8(
- 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9,
- 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9));
- }
-
- static inline __m256i Rotr63(__m256i x) {
- return _mm256_xor_si256(_mm256_srli_epi64(x, 63), _mm256_add_epi64(x, x));
- }
-}
+#pragma once
+
+#include <immintrin.h>
+
+namespace NArgonish {
+ static inline void XorValues(__m256i* result, const __m256i* val1, const __m256i* val2) {
+ _mm256_storeu_si256(result, _mm256_xor_si256(
+ _mm256_loadu_si256(val1), _mm256_loadu_si256(val2)));
+ }
+
+ static inline __m256i Rotr32(__m256i x) {
+ return _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
+ }
+
+ static inline __m256i Rotr24(__m256i x) {
+ return _mm256_shuffle_epi8(x, _mm256_setr_epi8(
+ 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10,
+ 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10));
+ }
+
+ static inline __m256i Rotr16(__m256i x) {
+ return _mm256_shuffle_epi8(x, _mm256_setr_epi8(
+ 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9,
+ 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9));
+ }
+
+ static inline __m256i Rotr63(__m256i x) {
+ return _mm256_xor_si256(_mm256_srli_epi64(x, 63), _mm256_add_epi64(x, x));
+ }
+}
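
All four helpers in rotations_avx2.h are right-rotations applied independently to each 64-bit lane: Rotr32 is a 32-bit element shuffle, Rotr24 and Rotr16 are byte permutations, and Rotr63 uses the identity rotr(x, 63) = rotl(x, 1) = (x >> 63) ^ (x + x), where x + x stands in for x << 1 and XOR is safe because the two operands have no overlapping bits. A scalar check of the identities (illustrative sketch):

    #include <cassert>
    #include <cstdint>

    // Scalar check of the vector rotation identities used above.
    static inline uint64_t Rotr64(uint64_t w, unsigned c) {
        return (w >> c) | (w << (64 - c));
    }

    int main() {
        const uint64_t x = 0x0123456789ABCDEFULL;
        assert(Rotr64(x, 63) == ((x >> 63) ^ (x + x))); // Rotr63: rotl by 1
        assert(Rotr64(x, 32) == ((x >> 32) | (x << 32)));
        return 0;
    }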
diff --git a/library/cpp/digest/argonish/internal/rotations/rotations_ref.h b/library/cpp/digest/argonish/internal/rotations/rotations_ref.h
index 6f59e233a5..82ffcae640 100644
--- a/library/cpp/digest/argonish/internal/rotations/rotations_ref.h
+++ b/library/cpp/digest/argonish/internal/rotations/rotations_ref.h
@@ -1,7 +1,7 @@
-#pragma once
-
-namespace NArgonish {
- static inline ui64 Rotr(const ui64 w, const unsigned c) {
- return (w >> c) | (w << (64 - c));
- }
-}
+#pragma once
+
+namespace NArgonish {
+ static inline ui64 Rotr(const ui64 w, const unsigned c) {
+ return (w >> c) | (w << (64 - c));
+ }
+}
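
The reference Rotr assumes 0 < c < 64, which holds at every call site in this library (the counts are always 16, 24, 32 or 63); for c == 0 the expression w << (64 - c) would be undefined behavior. If a zero count ever had to be tolerated, a shift-masked variant would do (illustrative only; ui64 is the platform typedef used above):

    // Safe for any c, including 0: mask both shift counts into [0, 63].
    static inline ui64 RotrSafe(ui64 w, unsigned c) {
        c &= 63u;
        return (w >> c) | (w << ((64u - c) & 63u));
    }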
diff --git a/library/cpp/digest/argonish/internal/rotations/rotations_sse2.h b/library/cpp/digest/argonish/internal/rotations/rotations_sse2.h
index 55a10a31b0..9af07b67f5 100644
--- a/library/cpp/digest/argonish/internal/rotations/rotations_sse2.h
+++ b/library/cpp/digest/argonish/internal/rotations/rotations_sse2.h
@@ -1,27 +1,27 @@
-#pragma once
-
-#include <emmintrin.h>
-
-namespace NArgonish {
- static inline void XorValues(__m128i* result, const __m128i* val1, const __m128i* val2) {
- _mm_storeu_si128(result, _mm_xor_si128(
- _mm_loadu_si128(val1),
- _mm_loadu_si128(val2)));
- }
-
- static inline __m128i Rotr32(__m128i x) {
- return _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
- }
-
- static inline __m128i Rotr24(__m128i x) {
- return _mm_xor_si128(_mm_srli_epi64(x, 24), _mm_slli_epi64(x, 40));
- }
-
- static inline __m128i Rotr16(__m128i x) {
- return _mm_xor_si128(_mm_srli_epi64(x, 16), _mm_slli_epi64(x, 48));
- }
-
- static inline __m128i Rotr63(__m128i x) {
- return _mm_xor_si128(_mm_srli_epi64(x, 63), _mm_add_epi64(x, x));
- }
-}
+#pragma once
+
+#include <emmintrin.h>
+
+namespace NArgonish {
+ static inline void XorValues(__m128i* result, const __m128i* val1, const __m128i* val2) {
+ _mm_storeu_si128(result, _mm_xor_si128(
+ _mm_loadu_si128(val1),
+ _mm_loadu_si128(val2)));
+ }
+
+ static inline __m128i Rotr32(__m128i x) {
+ return _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
+ }
+
+ static inline __m128i Rotr24(__m128i x) {
+ return _mm_xor_si128(_mm_srli_epi64(x, 24), _mm_slli_epi64(x, 40));
+ }
+
+ static inline __m128i Rotr16(__m128i x) {
+ return _mm_xor_si128(_mm_srli_epi64(x, 16), _mm_slli_epi64(x, 48));
+ }
+
+ static inline __m128i Rotr63(__m128i x) {
+ return _mm_xor_si128(_mm_srli_epi64(x, 63), _mm_add_epi64(x, x));
+ }
+}
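
Unlike the AVX2 and SSSE3 variants, SSE2 has no byte-shuffle instruction, so Rotr24 and Rotr16 here fall back to a shift pair: two 64-bit shifts combined with XOR. Only Rotr32 keeps its single-shuffle form. XOR works because the two shifted halves occupy disjoint bit ranges, as in this scalar equivalent (sketch, valid for 0 < n < 64):

    #include <cstdint>

    // Scalar form of the SSE2 shift-pair rotation above.
    static inline uint64_t RotrXor(uint64_t x, unsigned n) {
        return (x >> n) ^ (x << (64 - n)); // disjoint halves, so ^ == |
    }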
diff --git a/library/cpp/digest/argonish/internal/rotations/rotations_ssse3.h b/library/cpp/digest/argonish/internal/rotations/rotations_ssse3.h
index 39c9c5491b..88669dc76a 100644
--- a/library/cpp/digest/argonish/internal/rotations/rotations_ssse3.h
+++ b/library/cpp/digest/argonish/internal/rotations/rotations_ssse3.h
@@ -1,28 +1,28 @@
-#pragma once
-
-#include <emmintrin.h>
-#include <tmmintrin.h>
-
-namespace NArgonish {
- static inline void XorValues(__m128i* result, __m128i* val1, __m128i* val2) {
- _mm_storeu_si128(result, _mm_xor_si128(
- _mm_loadu_si128(val1),
- _mm_loadu_si128(val2)));
- }
-
- static inline __m128i Rotr32(__m128i x) {
- return _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
- }
-
- static inline __m128i Rotr24(__m128i x) {
- return _mm_shuffle_epi8(x, _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10));
- }
-
- static inline __m128i Rotr16(__m128i x) {
- return _mm_shuffle_epi8(x, _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9));
- }
-
- static inline __m128i Rotr63(__m128i x) {
- return _mm_xor_si128(_mm_srli_epi64(x, 63), _mm_add_epi64(x, x));
- }
-}
+#pragma once
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+namespace NArgonish {
+ static inline void XorValues(__m128i* result, __m128i* val1, __m128i* val2) {
+ _mm_storeu_si128(result, _mm_xor_si128(
+ _mm_loadu_si128(val1),
+ _mm_loadu_si128(val2)));
+ }
+
+ static inline __m128i Rotr32(__m128i x) {
+ return _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
+ }
+
+ static inline __m128i Rotr24(__m128i x) {
+ return _mm_shuffle_epi8(x, _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10));
+ }
+
+ static inline __m128i Rotr16(__m128i x) {
+ return _mm_shuffle_epi8(x, _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9));
+ }
+
+ static inline __m128i Rotr63(__m128i x) {
+ return _mm_xor_si128(_mm_srli_epi64(x, 63), _mm_add_epi64(x, x));
+ }
+}
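
With SSSE3's _mm_shuffle_epi8, a rotation by a multiple of 8 bits becomes a pure byte permutation: in the Rotr24 mask above, destination byte i of each 8-byte lane takes source byte (i + 3) mod 8, and in the Rotr16 mask byte (i + 2) mod 8. A byte-level scalar model of the same per-lane permutation (illustrative sketch):

    #include <cstdint>

    // Model of the pshufb masks above: rotate one 64-bit lane right by
    // nbytes * 8 bits by re-indexing its bytes.
    static inline uint64_t RotrBytes(uint64_t x, unsigned nbytes) {
        uint64_t r = 0;
        for (unsigned i = 0; i < 8; ++i) {
            const uint64_t byte = (x >> (8 * ((i + nbytes) % 8))) & 0xFF;
            r |= byte << (8 * i);
        }
        return r; // RotrBytes(x, 3) == rotr(x, 24); RotrBytes(x, 2) == rotr(x, 16)
    }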
diff --git a/library/cpp/digest/argonish/internal/rotations/ya.make b/library/cpp/digest/argonish/internal/rotations/ya.make
index 5f639d4571..b2b79b2b2a 100644
--- a/library/cpp/digest/argonish/internal/rotations/ya.make
+++ b/library/cpp/digest/argonish/internal/rotations/ya.make
@@ -1,5 +1,5 @@
-LIBRARY()
-
-OWNER(e-sidorov)
-
-END()
+LIBRARY()
+
+OWNER(e-sidorov)
+
+END()
diff --git a/library/cpp/digest/argonish/internal/ya.make b/library/cpp/digest/argonish/internal/ya.make
index 4a69395970..35003e964e 100644
--- a/library/cpp/digest/argonish/internal/ya.make
+++ b/library/cpp/digest/argonish/internal/ya.make
@@ -1,7 +1,7 @@
-RECURSE(
- proxies
- argon2
- blake2b
- blamka
- rotations
-)
+RECURSE(
+ proxies
+ argon2
+ blake2b
+ blamka
+ rotations
+)