| field | value | date |
| --- | --- | --- |
| author | e-sidorov <e-sidorov@yandex-team.ru> | 2022-02-10 16:46:06 +0300 |
| committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:06 +0300 |
| commit | ce2ad6f6a6f6025e37fb7f8debe7cefd3aa2307c (patch) | |
| tree | 1a2c5ffcf89eb53ecd79dbc9bc0a195c27404d0c /library/cpp/digest/argonish/internal/blamka/blamka_avx2.h | |
| parent | 1ec091f8998d76a211c6015ba6865a73b29d676a (diff) | |
| download | ydb-ce2ad6f6a6f6025e37fb7f8debe7cefd3aa2307c.tar.gz | |
Restoring authorship annotation for <e-sidorov@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/digest/argonish/internal/blamka/blamka_avx2.h')
-rw-r--r-- | library/cpp/digest/argonish/internal/blamka/blamka_avx2.h | 270

1 file changed, 135 insertions(+), 135 deletions(-)
```diff
diff --git a/library/cpp/digest/argonish/internal/blamka/blamka_avx2.h b/library/cpp/digest/argonish/internal/blamka/blamka_avx2.h
index bb701799c4..02c506d6ff 100644
--- a/library/cpp/digest/argonish/internal/blamka/blamka_avx2.h
+++ b/library/cpp/digest/argonish/internal/blamka/blamka_avx2.h
@@ -1,136 +1,136 @@
```

The single hunk removes and re-adds all 136 lines with identical code on both sides (only the authorship annotation changes), so the resulting `blamka_avx2.h` is shown once:

```cpp
#pragma once

#include <immintrin.h>
#include <library/cpp/digest/argonish/internal/rotations/rotations_avx2.h>

namespace NArgonish {
    static inline void BlamkaG1AVX2(
        __m256i& a0, __m256i& a1, __m256i& b0, __m256i& b1,
        __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
        __m256i ml = _mm256_mul_epu32(a0, b0);
        ml = _mm256_add_epi64(ml, ml);
        a0 = _mm256_add_epi64(a0, _mm256_add_epi64(b0, ml));
        d0 = _mm256_xor_si256(d0, a0);
        d0 = Rotr32(d0);

        ml = _mm256_mul_epu32(c0, d0);
        ml = _mm256_add_epi64(ml, ml);
        c0 = _mm256_add_epi64(c0, _mm256_add_epi64(d0, ml));

        b0 = _mm256_xor_si256(b0, c0);
        b0 = Rotr24(b0);

        ml = _mm256_mul_epu32(a1, b1);
        ml = _mm256_add_epi64(ml, ml);
        a1 = _mm256_add_epi64(a1, _mm256_add_epi64(b1, ml));
        d1 = _mm256_xor_si256(d1, a1);
        d1 = Rotr32(d1);

        ml = _mm256_mul_epu32(c1, d1);
        ml = _mm256_add_epi64(ml, ml);
        c1 = _mm256_add_epi64(c1, _mm256_add_epi64(d1, ml));

        b1 = _mm256_xor_si256(b1, c1);
        b1 = Rotr24(b1);
    }

    static inline void BlamkaG2AVX2(
        __m256i& a0, __m256i& a1, __m256i& b0, __m256i& b1,
        __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
        __m256i ml = _mm256_mul_epu32(a0, b0);
        ml = _mm256_add_epi64(ml, ml);
        a0 = _mm256_add_epi64(a0, _mm256_add_epi64(b0, ml));
        d0 = _mm256_xor_si256(d0, a0);
        d0 = Rotr16(d0);

        ml = _mm256_mul_epu32(c0, d0);
        ml = _mm256_add_epi64(ml, ml);
        c0 = _mm256_add_epi64(c0, _mm256_add_epi64(d0, ml));
        b0 = _mm256_xor_si256(b0, c0);
        b0 = Rotr63(b0);

        ml = _mm256_mul_epu32(a1, b1);
        ml = _mm256_add_epi64(ml, ml);
        a1 = _mm256_add_epi64(a1, _mm256_add_epi64(b1, ml));
        d1 = _mm256_xor_si256(d1, a1);
        d1 = Rotr16(d1);

        ml = _mm256_mul_epu32(c1, d1);
        ml = _mm256_add_epi64(ml, ml);
        c1 = _mm256_add_epi64(c1, _mm256_add_epi64(d1, ml));
        b1 = _mm256_xor_si256(b1, c1);
        b1 = Rotr63(b1);
    }

    /* a = ( v0,  v1,  v2,  v3) */
    /* b = ( v4,  v5,  v6,  v7) */
    /* c = ( v8,  v9, v10, v11) */
    /* d = (v12, v13, v14, v15) */
    static inline void DiagonalizeAVX21(
        __m256i& b0, __m256i& c0, __m256i& d0, __m256i& b1, __m256i& c1, __m256i& d1) {
        /* (v4, v5, v6, v7) -> (v5, v6, v7, v4) */
        b0 = _mm256_permute4x64_epi64(b0, _MM_SHUFFLE(0, 3, 2, 1));
        /* (v8, v9, v10, v11) -> (v10, v11, v8, v9) */
        c0 = _mm256_permute4x64_epi64(c0, _MM_SHUFFLE(1, 0, 3, 2));
        /* (v12, v13, v14, v15) -> (v15, v12, v13, v14) */
        d0 = _mm256_permute4x64_epi64(d0, _MM_SHUFFLE(2, 1, 0, 3));

        b1 = _mm256_permute4x64_epi64(b1, _MM_SHUFFLE(0, 3, 2, 1));
        c1 = _mm256_permute4x64_epi64(c1, _MM_SHUFFLE(1, 0, 3, 2));
        d1 = _mm256_permute4x64_epi64(d1, _MM_SHUFFLE(2, 1, 0, 3));
    }

    static inline void DiagonalizeAVX22(
        __m256i& b0, __m256i& b1, __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
        /* (v4, v5, v6, v7) -> (v5, v6, v7, v4) */
        __m256i tmp1 = _mm256_blend_epi32(b0, b1, 0b11001100);        /* v4v7 */
        __m256i tmp2 = _mm256_blend_epi32(b0, b1, 0b00110011);        /* v6v5 */
        b1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); /* v7v4 */
        b0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); /* v5v6 */

        /* (v8, v9, v10, v11) -> (v10, v11, v8, v9) */
        tmp1 = c0;
        c0 = c1;
        c1 = tmp1;

        /* (v12, v13, v14, v15) -> (v15, v12, v13, v14) */
        tmp1 = _mm256_blend_epi32(d0, d1, 0b11001100);                /* v12v15 */
        tmp2 = _mm256_blend_epi32(d0, d1, 0b00110011);                /* v14v13 */
        d0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); /* v15v12 */
        d1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); /* v13v14 */
    }

    static inline void UndiagonalizeAVX21(
        __m256i& b0, __m256i& c0, __m256i& d0, __m256i& b1, __m256i& c1, __m256i& d1) {
        /* (v5, v6, v7, v4) -> (v4, v5, v6, v7) */
        b0 = _mm256_permute4x64_epi64(b0, _MM_SHUFFLE(2, 1, 0, 3));
        /* (v10, v11, v8, v9) -> (v8, v9, v10, v11) */
        c0 = _mm256_permute4x64_epi64(c0, _MM_SHUFFLE(1, 0, 3, 2));
        /* (v15, v12, v13, v14) -> (v12, v13, v14, v15) */
        d0 = _mm256_permute4x64_epi64(d0, _MM_SHUFFLE(0, 3, 2, 1));

        b1 = _mm256_permute4x64_epi64(b1, _MM_SHUFFLE(2, 1, 0, 3));
        c1 = _mm256_permute4x64_epi64(c1, _MM_SHUFFLE(1, 0, 3, 2));
        d1 = _mm256_permute4x64_epi64(d1, _MM_SHUFFLE(0, 3, 2, 1));
    }

    static inline void UndiagonalizeAVX22(
        __m256i& b0, __m256i& b1, __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
        /* (v5, v6, v7, v4) -> (v4, v5, v6, v7) */
        __m256i tmp1 = _mm256_blend_epi32(b0, b1, 0b11001100);        /* v5v4 */
        __m256i tmp2 = _mm256_blend_epi32(b0, b1, 0b00110011);        /* v7v6 */
        b0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); /* v4v5 */
        b1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); /* v6v7 */

        /* (v10, v11, v8, v9) -> (v8, v9, v10, v11) */
        tmp1 = c0;
        c0 = c1;
        c1 = tmp1;

        /* (v15, v12, v13, v14) -> (v12, v13, v14, v15) */
        tmp1 = _mm256_blend_epi32(d0, d1, 0b00110011);                /* v13v12 */
        tmp2 = _mm256_blend_epi32(d0, d1, 0b11001100);                /* v15v14 */
        d0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1));
        d1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1));
    }
}
```
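For context on what these intrinsics compute: `_mm256_mul_epu32` multiplies the low 32 bits of each 64-bit lane, and doubling that product before the additions yields the BlaMka mixing primitive used by Argon2, `a + b + 2 * lo32(a) * lo32(b)` modulo 2^64. The scalar sketch below is illustrative only and is not part of the diff or of `blamka_avx2.h`; the names `FBlaMka`, `Rotr64` and `BlamkaG` are hypothetical.

```cpp
#include <cstdint>

// Rotate a 64-bit word right by n bits (0 < n < 64); the AVX2 code does the
// same per lane via the Rotr32/Rotr24/Rotr16/Rotr63 helpers from rotations_avx2.h.
static inline uint64_t Rotr64(uint64_t x, unsigned n) {
    return (x >> n) | (x << (64 - n));
}

// BlaMka primitive: a + b + 2 * (a mod 2^32) * (b mod 2^32), all modulo 2^64.
// In the vectorized code, _mm256_mul_epu32 produces lo32(a) * lo32(b) per lane
// and _mm256_add_epi64(ml, ml) doubles it.
static inline uint64_t FBlaMka(uint64_t a, uint64_t b) {
    const uint64_t mask = 0xFFFFFFFFull;
    return a + b + 2 * ((a & mask) * (b & mask));
}

// One full quarter-round of the permutation. Per 64-bit lane, BlamkaG1AVX2
// performs the first half (rotations by 32 and 24) and BlamkaG2AVX2 the
// second half (rotations by 16 and 63).
static inline void BlamkaG(uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d) {
    a = FBlaMka(a, b); d = Rotr64(d ^ a, 32);
    c = FBlaMka(c, d); b = Rotr64(b ^ c, 24);
    a = FBlaMka(a, b); d = Rotr64(d ^ a, 16);
    c = FBlaMka(c, d); b = Rotr64(b ^ c, 63);
}
```

The `Diagonalize*`/`Undiagonalize*` helpers rotate the `b`, `c` and `d` rows of the 4x4 state of 64-bit words (the `v0..v15` layout in the comments) so that the same G calls mix columns on one pass and diagonals on the next, mirroring the BLAKE2b round structure. The `...AVX21` and `...AVX22` variants realize the same logical permutation with different shuffle strategies: pure `_mm256_permute4x64_epi64` in the first, `_mm256_blend_epi32` plus permute in the second.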