path: root/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx2.c
/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
 * AWS Cryptographic Algorithms Group.
 *
 * The rotate functions are based on the barrel shifter described in [1] and
 * some code snippets from [2]:
 *
 * [1] Chou, T.: QcBits: Constant-Time Small-Key Code-Based Cryptography.
 *     In: Gierlichs, B., Poschmann, A.Y. (eds.) Cryptographic Hardware
 *     and Embedded Systems – CHES 2016. pp. 280–300. Springer Berlin Heidelberg,
 *     Berlin, Heidelberg (2016)
 *
 * [2] Guimarães, Antonio, Diego F. Aranha, and Edson Borin. 2019.
 *     “Optimized Implementation of QC-MDPC Code-Based Cryptography.”
 *     Concurrency and Computation: Practice and Experience 31 (18):
 *     e5089. https://doi.org/10.1002/cpe.5089.
 */

#if defined(S2N_BIKE_R3_AVX2)

#include "decode.h"
#include "decode_internal.h"
#include "utilities.h"

#define AVX2_INTERNAL
#include "x86_64_intrinsic.h"

#define R_YMM_HALF_LOG2 UPTOPOW2(R_YMM / 2)

_INLINE_ void
rotate256_big(OUT syndrome_t *out, IN const syndrome_t *in, IN size_t ymm_num)
{
  // To prevent overflows (the comparison is in bytes)
  bike_static_assert(sizeof(*out) >
                       (BYTES_IN_YMM * (R_YMM + (2 * R_YMM_HALF_LOG2))),
                     rotr_big_err);

  *out = *in;

  for(uint32_t idx = R_YMM_HALF_LOG2; idx >= 1; idx >>= 1) {
    const uint8_t mask       = secure_l32_mask(ymm_num, idx);
    const __m256i blend_mask = SET1_I8(mask);
    ymm_num                  = ymm_num - (idx & mask);

    for(size_t i = 0; i < (R_YMM + idx); i++) {
      __m256i a = LOAD(&out->qw[4 * (i + idx)]);
      __m256i b = LOAD(&out->qw[4 * i]);
      b         = BLENDV_I8(b, a, blend_mask);
      STORE(&out->qw[4 * i], b);
    }
  }
}
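
/*
 * Illustrative sketch (not part of s2n): the barrel-shifter idea behind
 * rotate256_big, modeled on plain uint64_t blocks instead of YMM registers.
 * Each power-of-two stride is applied, or not, under a branchless mask
 * derived from the secret shift amount, so the work done and the memory
 * access pattern are independent of `shift`. The helper and its parameters
 * are hypothetical; the real code blends YMM registers with BLENDV_I8 and
 * relies on the duplicated syndrome (see dup_avx2) instead of the
 * zero-fill used here.
 */
#if 0
static void shift_blocks_down_sketch(uint64_t a[], size_t n_blocks,
                                     size_t start_pow2, size_t shift)
{
  // start_pow2 is a power of two with (2 * start_pow2) > shift.
  for(size_t idx = start_pow2; idx >= 1; idx >>= 1) {
    // All-ones when shift >= idx, all-zeros otherwise. The comparison is
    // written plainly here; the real code derives it with secure_l32_mask
    // to keep it constant-time.
    const uint64_t mask = 0 - (uint64_t)(shift >= idx);
    shift -= (idx & mask);
    for(size_t i = 0; i < n_blocks; i++) {
      const uint64_t moved = ((i + idx) < n_blocks) ? a[i + idx] : 0;
      a[i] = (moved & mask) | (a[i] & ~mask);
    }
  }
}
#endif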

_INLINE_ void
rotate256_small(OUT syndrome_t *out, IN const syndrome_t *in, size_t count)
{
  __m256i        carry_in   = SET_ZERO;
  const int      count64    = (int)count & 0x3f;
  const uint64_t count_mask = (count >> 5) & 0xe;

  __m256i       idx       = SET_I32(7, 6, 5, 4, 3, 2, 1, 0);
  const __m256i zero_mask = SET_I64(-1, -1, -1, 0);
  const __m256i count_vec = SET1_I8(count_mask);

  ALIGN(ALIGN_BYTES)
  const uint8_t zero_mask2_buf[] = {
    0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x84, 0x84, 0x84,
    0x84, 0x84, 0x84, 0x84, 0x84, 0x82, 0x82, 0x82, 0x82, 0x82, 0x82,
    0x82, 0x82, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
  __m256i zero_mask2 = LOAD(zero_mask2_buf);

  zero_mask2 = SUB_I8(zero_mask2, count_vec);
  idx        = ADD_I8(idx, count_vec);

  for(int i = R_YMM; i >= 0; i--) {
    // Load the next 256 bits
    __m256i in256 = LOAD(&in->qw[4 * i]);

    // Rotate the current and previous 256-bit registers so that their
    // quadwords land in the right positions.
    __m256i carry_out = PERMVAR_I32(in256, idx);
    in256             = BLENDV_I8(carry_in, carry_out, zero_mask2);

    // Shift by less than 64 bits (within each quadword)
    __m256i inner_carry = BLENDV_I8(carry_in, in256, zero_mask);
    inner_carry         = PERM_I64(inner_carry, 0x39);
    const __m256i out256 =
      SRLI_I64(in256, count64) | SLLI_I64(inner_carry, (int)64 - count64);

    // Store the rotated value
    STORE(&out->qw[4 * i], out256);
    carry_in = carry_out;
  }
}
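
/*
 * Illustrative sketch (not part of s2n): the sub-block stage of
 * rotate256_small, shown for a flat uint64_t array. Each output word takes
 * its high bits from the word above it; the AVX2 code does the same per
 * quadword and resolves the cross-register carry with PERMVAR_I32 and
 * BLENDV_I8. The helper name is hypothetical, and `count` is restricted to
 * 1..63 because a C shift by 64 is undefined (the SRLI/SLLI intrinsics
 * above simply produce 0 for counts of 64 or more).
 */
#if 0
static void shift_right_words_sketch(uint64_t a[], size_t n, int count)
{
  for(size_t i = 0; i + 1 < n; i++) {
    a[i] = (a[i] >> count) | (a[i + 1] << (64 - count));
  }
  a[n - 1] >>= count;
}
#endif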

void rotate_right_avx2(OUT syndrome_t *out,
                       IN const syndrome_t *in,
                       IN const uint32_t    bitscount)
{
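  // Example: with BITS_IN_YMM = 256, bitscount = 1000 is handled as a big
  // rotation by 1000 / 256 = 3 YMM blocks (768 bits), followed by a small
  // rotation by the remaining 1000 % 256 = 232 bits.
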
  // 1) Rotate in granularity of 256-bit blocks, using YMMs
  rotate256_big(out, in, (bitscount / BITS_IN_YMM));
  // 2) Rotate at a smaller granularity (less than 256 bits), using YMMs
  rotate256_small(out, out, (bitscount % BITS_IN_YMM));
}

// Duplicates the first R_BITS of the syndrome three times
// |------------------------------------------|
// |  Third copy | Second copy | First R_BITS |
// |------------------------------------------|
// This is required by the rotate functions.
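// With the duplication in place, a right-rotation of the R_BITS ring turns
// into a plain shifted read: bit i of the rotated syndrome is bit
// (i + rotation) of this buffer, so the rotate functions above never have
// to handle wrap-around explicitly.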
void dup_avx2(IN OUT syndrome_t *s)
{
  s->qw[R_QWORDS - 1] =
    (s->qw[0] << LAST_R_QWORD_LEAD) | (s->qw[R_QWORDS - 1] & LAST_R_QWORD_MASK);

  for(size_t i = 0; i < (2 * R_QWORDS) - 1; i++) {
    s->qw[R_QWORDS + i] =
      (s->qw[i] >> LAST_R_QWORD_TRAIL) | (s->qw[i + 1] << LAST_R_QWORD_LEAD);
  }
}

// Use a half-adder as described in [1].
void bit_sliced_adder_avx2(OUT upc_t *upc,
                           IN OUT syndrome_t *rotated_syndrome,
                           IN const size_t    num_of_slices)
{
  // From cache-memory perspective this loop should be the outside loop
  for(size_t j = 0; j < num_of_slices; j++) {
    for(size_t i = 0; i < R_QWORDS; i++) {
      const uint64_t carry = (upc->slice[j].u.qw[i] & rotated_syndrome->qw[i]);
      upc->slice[j].u.qw[i] ^= rotated_syndrome->qw[i];
      rotated_syndrome->qw[i] = carry;
    }
  }
}
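
/*
 * Illustrative sketch (not part of s2n): the slices act as a bit-sliced
 * binary counter. Slice j holds bit j of an unsatisfied-parity-check (upc)
 * counter for all R_QWORDS * 64 bit positions at once; XOR is the
 * half-adder sum and AND is the carry that ripples into the next slice.
 * A scalar model for a single 64-position lane (names hypothetical):
 */
#if 0
static void bit_sliced_increment_sketch(uint64_t slice[], size_t num_slices,
                                        uint64_t increment_mask)
{
  // Add 1 to the counter of every position whose bit is set in
  // increment_mask; the surviving carries ripple into the next slice.
  for(size_t j = 0; j < num_slices; j++) {
    const uint64_t carry = slice[j] & increment_mask;
    slice[j] ^= increment_mask;
    increment_mask = carry;
  }
}
#endif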

void bit_slice_full_subtract_avx2(OUT upc_t *upc, IN uint8_t val)
{
  // Borrow
  uint64_t br[R_QWORDS] = {0};

  for(size_t j = 0; j < SLICES; j++) {

    const uint64_t lsb_mask = 0 - (val & 0x1);
    val >>= 1;

    // Perform a - b with br as the input/output borrow
    // br = 0 0 0 0 1 1 1 1
    // a  = 0 0 1 1 0 0 1 1
    // b  = 0 1 0 1 0 1 0 1
    // -------------------
    // o  = 0 1 1 0 1 0 0 1
    // br = 0 1 0 0 1 1 0 1
    //
    // o  = a ^ b ^ br
    // br = (~a & b & ~br) | ((~a | b) & br)

    for(size_t i = 0; i < R_QWORDS; i++) {
      const uint64_t a      = upc->slice[j].u.qw[i];
      const uint64_t b      = lsb_mask;
      const uint64_t tmp    = ((~a) & b & (~br[i])) | (((~a) | b) & br[i]);
      upc->slice[j].u.qw[i] = a ^ b ^ br[i];
      br[i]                 = tmp;
    }
  }
}
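
/*
 * Illustrative sketch (not part of s2n): an exhaustive scalar check of the
 * full-subtractor identities used above, compared against ordinary signed
 * arithmetic over all 8 input combinations. The function name is
 * hypothetical, and the includes belong at the top of a real test file.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static void full_subtract_check_sketch(void)
{
  for(unsigned v = 0; v < 8; v++) {
    const uint64_t a = v & 1, b = (v >> 1) & 1, br = (v >> 2) & 1;
    const uint64_t o      = a ^ b ^ br;
    const uint64_t br_out = ((~a) & b & (~br)) | (((~a) | b) & br);
    // Reference: a - b - br computed in plain signed arithmetic.
    const int64_t diff = (int64_t)a - (int64_t)b - (int64_t)br;
    assert(o == ((uint64_t)diff & 1));      // sum bit is diff mod 2
    assert(br_out == (uint64_t)(diff < 0)); // borrow out iff diff < 0
  }
}
#endif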

#endif

typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;