author    thegeorg <thegeorg@yandex-team.ru>  2022-05-10 22:16:03 +0300
committer thegeorg <thegeorg@yandex-team.ru>  2022-05-10 22:16:03 +0300
commit    09c71d918d4d0b0ebf67e1ab41aa90ddf587a3f2 (patch)
tree      dd44d2cb68e2845c2d4c367b66893f3e043a6e8e /contrib/restricted/aws/s2n/pq-crypto
parent    5eb4a8a2d487411924e1d1b27c454223dcf35005 (diff)
download  ydb-09c71d918d4d0b0ebf67e1ab41aa90ddf587a3f2.tar.gz

Update contrib/restricted/aws/s2n to 1.3.12
ref:f8279d764b4c00974a63543a1364c91e2b81b7a6
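
As the diffstat below shows, this update has two parts: new round-3 implementations (bike_r3, kyber_r3, sike_r3) are added, and the existing bike_r1/bike_r2 files are touched almost entirely by a mechanical rename of s2n's error-handling macros: GUARD -> POSIX_GUARD, ENSURE_POSIX -> POSIX_ENSURE, notnull_check -> POSIX_ENSURE_REF. A rough sketch of the pattern (the real definitions live in s2n's utils/s2n_safety.h and carry richer error reporting; the simplified forms below are an assumption for illustration only):

    /* Simplified, illustrative forms of the s2n safety macros (assumption;
     * not the actual s2n definitions). Each one early-returns on failure,
     * which is why nearly every fallible call site below is wrapped in one. */
    #define POSIX_GUARD(x)          do { if ((x) < 0) return -1; } while (0)
    #define POSIX_ENSURE(cond, err) do { if (!(cond)) { /* record err */ return -1; } } while (0)
    #define POSIX_ENSURE_REF(ptr)   POSIX_ENSURE((ptr) != NULL, S2N_ERR_NULL)
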
Diffstat (limited to 'contrib/restricted/aws/s2n/pq-crypto')
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r1/aes_ctr_prf.c | 8
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r1/bike_r1_kem.c | 40
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r1/decode.c | 18
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r1/openssl_utils.c | 14
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.c | 6
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.h | 4
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r2/aes_ctr_prf.c | 8
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r2/bike_r2_kem.c | 74
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r2/decode.c | 18
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r2/openssl_utils.c | 14
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.c | 6
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.h | 4
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/LICENSE | 202
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes.h | 62
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.c | 97
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.h | 43
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_defs.h | 91
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_r3_kem.c | 288
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/cleanup.h | 63
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.c | 280
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.h | 12
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx2.c | 173
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx512.c | 167
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_internal.h | 86
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_portable.c | 126
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/defs.h | 107
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.c | 10
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.h | 33
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x.h | 29
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_internal.h | 177
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_inv.c | 156
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx2.c | 188
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx512.c | 135
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_portable.c | 48
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul.c | 113
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx2.c | 109
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx512.c | 109
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_pclmul.c | 155
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_portable.c | 77
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_vpclmul.c | 135
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_portable.c | 103
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.c | 170
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.h | 40
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx2.c | 123
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx512.c | 123
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_internal.h | 66
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_portable.c | 60
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/sha.h | 43
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/types.h | 120
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.c | 24
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.h | 139
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/bike_r3/x86_64_intrinsic.h | 132
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/indcpa.c | 2
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/kyber_90s_r2_kem.c | 10
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/ntt.h | 4
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r2/indcpa.c | 2
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r2/kyber_r2_kem.c | 10
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r2/ntt.h | 4
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SIMD256_avx2.c | 1284
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SnP_avx2.h | 63
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-SIMD256-config_avx2.h | 3
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-align_avx2.h | 31
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-brg_endian_avx2.h | 139
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_align_avx2.h | 19
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_basemul_avx2.S | 105
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.c | 104
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.h | 11
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.c | 137
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.h | 15
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.c | 122
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.h | 43
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.c (renamed from contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.c) | 141
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.h | 68
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.c | 210
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.h | 70
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fq_avx2.S | 122
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.c | 323
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.h | 15
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.c | 363
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.h | 25
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_invntt_avx2.S | 255
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_kem.c | 158
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.c | 122
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.h | 19
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.S | 218
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.h | 28
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_params.h | 31
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.c | 300
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.h | 61
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.c | 453
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.h | 80
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.c | 186
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.h | 40
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.c | 227
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.h | 39
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.c | 60
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.h | 15
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce_avx2.h | 13
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.c | 420
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.h | 14
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_shuffle_avx2.S | 272
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric-shake.c | 49
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric.h | 17
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/s2n_pq.c | 232
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/s2n_pq.h | 21
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/s2n_pq_random.c | 8
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r1/P503_internal_r1.h | 2
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r1/fips202_r1.h | 2
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r1/sidh_r1.c | 2
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r1/sike_r1_kem.c | 22
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434.c | 117
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_api.h | 70
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_internal.h | 225
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r2/config.h | 218
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r2/ec_isogeny.c | 313
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.h | 14
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r2/fp.c | 241
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r2/fpx.c | 387
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r2/sidh.c | 286
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r2/sike_r2_kem.c | 120
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r2/sikep434r2_fp_x64_asm.S | 962
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.c | 146
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.h | 181
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_api.h | 78
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.c | 348
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.h | 46
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.c | 417
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.h | 23
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.c | 297
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.h | 39
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.S | 1054
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.h | 38
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.c | 478
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.h | 65
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_kem.c | 112
-rw-r--r--  contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_sidh.c | 310
136 files changed, 14877 insertions(+), 3147 deletions(-)
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/aes_ctr_prf.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/aes_ctr_prf.c
index 26c99bc80d..2f211010df 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/aes_ctr_prf.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/aes_ctr_prf.c
@@ -27,7 +27,7 @@ init_aes_ctr_prf_state(OUT aes_ctr_prf_state_t *s,
bike_static_assert(sizeof(*seed) == sizeof(key.raw), seed_size_equals_ky_size);
memcpy(key.raw, seed->raw, sizeof(key.raw));
- GUARD(aes256_key_expansion(&s->ks_ptr, &key));
+ POSIX_GUARD(aes256_key_expansion(&s->ks_ptr, &key));
// Initialize buffer and counter
s->ctr.u.qw[0] = 0;
@@ -59,7 +59,7 @@ perform_aes(OUT uint8_t *ct, IN OUT aes_ctr_prf_state_t *s)
BIKE_ERROR(E_AES_OVER_USED);
}
- GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks_ptr));
+ POSIX_GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks_ptr));
s->ctr.u.qw[0]++;
s->rem_invokations--;
@@ -91,11 +91,11 @@ aes_ctr_prf(OUT uint8_t *a, IN OUT aes_ctr_prf_state_t *s, IN const uint32_t len
// Copy full AES blocks
while((len - idx) >= AES256_BLOCK_SIZE)
{
- GUARD(perform_aes(&a[idx], s));
+ POSIX_GUARD(perform_aes(&a[idx], s));
idx += AES256_BLOCK_SIZE;
}
- GUARD(perform_aes(s->buffer.u.bytes, s));
+ POSIX_GUARD(perform_aes(s->buffer.u.bytes, s));
// Copy the tail
s->pos = len - idx;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/bike_r1_kem.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/bike_r1_kem.c
index 21b0b6f5a3..ba43098837 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/bike_r1_kem.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/bike_r1_kem.c
@@ -78,18 +78,18 @@ encrypt(OUT ct_t *ct,
p_pk[1].val = pk->val[1];
DMSG(" Sampling m.\n");
- GUARD(sample_uniform_r_bits(&m.val, seed, NO_RESTRICTION));
+ POSIX_GUARD(sample_uniform_r_bits(&m.val, seed, NO_RESTRICTION));
DMSG(" Calculating the ciphertext.\n");
- GUARD(gf2x_mod_mul((uint64_t *)&p_ct[0], (uint64_t *)&m, (uint64_t *)&p_pk[0]));
- GUARD(gf2x_mod_mul((uint64_t *)&p_ct[1], (uint64_t *)&m, (uint64_t *)&p_pk[1]));
+ POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_ct[0], (uint64_t *)&m, (uint64_t *)&p_pk[0]));
+ POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_ct[1], (uint64_t *)&m, (uint64_t *)&p_pk[1]));
DMSG(" Addding Error to the ciphertext.\n");
- GUARD(
+ POSIX_GUARD(
gf2x_add(p_ct[0].val.raw, p_ct[0].val.raw, splitted_e->val[0].raw, R_SIZE));
- GUARD(
+ POSIX_GUARD(
gf2x_add(p_ct[1].val.raw, p_ct[1].val.raw, splitted_e->val[1].raw, R_SIZE));
// Copy the data outside
@@ -113,12 +113,12 @@ calc_pk(OUT pk_t *pk, IN const seed_t *g_seed, IN const pad_sk_t p_sk)
// Intialized padding to zero
DEFER_CLEANUP(padded_r_t g = {0}, padded_r_cleanup);
- GUARD(sample_uniform_r_bits(&g.val, g_seed, MUST_BE_ODD));
+ POSIX_GUARD(sample_uniform_r_bits(&g.val, g_seed, MUST_BE_ODD));
// Calculate (g0, g1) = (g*h1, g*h0)
- GUARD(gf2x_mod_mul((uint64_t *)&p_pk[0], (const uint64_t *)&g,
+ POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_pk[0], (const uint64_t *)&g,
(const uint64_t *)&p_sk[1]));
- GUARD(gf2x_mod_mul((uint64_t *)&p_pk[1], (const uint64_t *)&g,
+ POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_pk[1], (const uint64_t *)&g,
(const uint64_t *)&p_sk[0]));
// Copy the data to the output parameters.
@@ -156,7 +156,7 @@ get_ss(OUT ss_t *out, IN const e_t *e)
int
BIKE1_L1_R1_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk)
{
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
// Convert to this implementation types
pk_t *l_pk = (pk_t *)pk;
@@ -177,14 +177,14 @@ BIKE1_L1_R1_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk)
DMSG(" Calculating the secret key.\n");
// h0 and h1 use the same context
- GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0]));
+ POSIX_GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0]));
- GUARD(generate_sparse_rep((uint64_t *)&p_sk[0], l_sk.wlist[0].val, DV, R_BITS,
+ POSIX_GUARD(generate_sparse_rep((uint64_t *)&p_sk[0], l_sk.wlist[0].val, DV, R_BITS,
sizeof(p_sk[0]), &h_prf_state));
// Copy data
l_sk.bin[0] = p_sk[0].val;
- GUARD(generate_sparse_rep((uint64_t *)&p_sk[1], l_sk.wlist[1].val, DV, R_BITS,
+ POSIX_GUARD(generate_sparse_rep((uint64_t *)&p_sk[1], l_sk.wlist[1].val, DV, R_BITS,
sizeof(p_sk[1]), &h_prf_state));
// Copy data
@@ -192,7 +192,7 @@ BIKE1_L1_R1_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk)
DMSG(" Calculating the public key.\n");
- GUARD(calc_pk(l_pk, &seeds.seed[1], p_sk));
+ POSIX_GUARD(calc_pk(l_pk, &seeds.seed[1], p_sk));
memcpy(sk, &l_sk, sizeof(l_sk));
@@ -214,7 +214,7 @@ BIKE1_L1_R1_crypto_kem_enc(OUT unsigned char * ct,
IN const unsigned char *pk)
{
DMSG(" Enter crypto_kem_enc.\n");
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
// Convert to this implementation types
const pk_t *l_pk = (const pk_t *)pk;
@@ -231,11 +231,11 @@ BIKE1_L1_R1_crypto_kem_enc(OUT unsigned char * ct,
// Random data generator
// Using first seed
- GUARD(init_aes_ctr_prf_state(&e_prf_state, MAX_AES_INVOKATION, &seeds.seed[0]));
+ POSIX_GUARD(init_aes_ctr_prf_state(&e_prf_state, MAX_AES_INVOKATION, &seeds.seed[0]));
DMSG(" Generating error.\n");
ALIGN(8) compressed_idx_t_t dummy;
- GUARD(generate_sparse_rep((uint64_t *)&e, dummy.val, T1, N_BITS, sizeof(e),
+ POSIX_GUARD(generate_sparse_rep((uint64_t *)&e, dummy.val, T1, N_BITS, sizeof(e),
&e_prf_state));
print("e: ", (uint64_t *)&e.val, sizeof(e) * 8);
@@ -250,7 +250,7 @@ BIKE1_L1_R1_crypto_kem_enc(OUT unsigned char * ct,
// Computing ct = enc(pk, e)
// Using second seed
DMSG(" Encrypting.\n");
- GUARD(encrypt(l_ct, l_pk, &seeds.seed[1], &splitted_e));
+ POSIX_GUARD(encrypt(l_ct, l_pk, &seeds.seed[1], &splitted_e));
DMSG(" Generating shared secret.\n");
get_ss(l_ss, &e.val);
@@ -269,7 +269,7 @@ BIKE1_L1_R1_crypto_kem_dec(OUT unsigned char * ss,
IN const unsigned char *sk)
{
DMSG(" Enter crypto_kem_dec.\n");
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
// Convert to this implementation types
const ct_t *l_ct = (const ct_t *)ct;
@@ -284,10 +284,10 @@ BIKE1_L1_R1_crypto_kem_dec(OUT unsigned char * ss,
DEFER_CLEANUP(e_t merged_e = {0}, e_cleanup);
DMSG(" Computing s.\n");
- GUARD(compute_syndrome(&syndrome, l_ct, &l_sk));
+ POSIX_GUARD(compute_syndrome(&syndrome, l_ct, &l_sk));
DMSG(" Decoding.\n");
- GUARD(decode(&e, &syndrome, l_ct, &l_sk));
+ POSIX_GUARD(decode(&e, &syndrome, l_ct, &l_sk));
// Check if the error weight equals T1
if(T1 != r_bits_vector_weight(&e.val[0]) + r_bits_vector_weight(&e.val[1]))
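
The DEFER_CLEANUP(...) declarations in the hunks above pair a stack variable with a zeroizing cleanup routine that runs when the variable leaves scope, on every return path. A minimal sketch of the mechanism, assuming the GCC/Clang cleanup attribute that BIKE's cleanup.h builds on (helper names here are hypothetical):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative only: run fn(&var) when var goes out of scope. */
    #define DEFER_CLEANUP(decl, fn) __attribute__((cleanup(fn))) decl

    static void buf_cleanup(uint8_t (*b)[32])
    {
        memset(*b, 0, sizeof(*b)); /* stand-in for BIKE's secure_clean() */
    }

    void deferred_cleanup_example(void)
    {
        DEFER_CLEANUP(uint8_t secret[32] = {0}, buf_cleanup);
        /* ... use secret; buf_cleanup(&secret) runs on every return,
         * including the early returns hidden inside POSIX_GUARD. */
    }
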
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/decode.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/decode.c
index 404c6377da..b455cd7e82 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/decode.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/decode.c
@@ -96,12 +96,12 @@ compute_syndrome(OUT syndrome_t *syndrome, IN const ct_t *ct, IN const sk_t *sk)
pad_ct[1].val = ct->val[1];
// Compute s = c0*h0 + c1*h1:
- GUARD(gf2x_mod_mul((uint64_t *)&pad_s[0], (uint64_t *)&pad_ct[0],
+ POSIX_GUARD(gf2x_mod_mul((uint64_t *)&pad_s[0], (uint64_t *)&pad_ct[0],
(uint64_t *)&pad_sk[0]));
- GUARD(gf2x_mod_mul((uint64_t *)&pad_s[1], (uint64_t *)&pad_ct[1],
+ POSIX_GUARD(gf2x_mod_mul((uint64_t *)&pad_s[1], (uint64_t *)&pad_ct[1],
(uint64_t *)&pad_sk[1]));
- GUARD(gf2x_add(pad_s[0].val.raw, pad_s[0].val.raw, pad_s[1].val.raw, R_SIZE));
+ POSIX_GUARD(gf2x_add(pad_s[0].val.raw, pad_s[0].val.raw, pad_s[1].val.raw, R_SIZE));
memcpy((uint8_t *)syndrome->qw, pad_s[0].val.raw, R_SIZE);
dup(syndrome);
@@ -118,13 +118,13 @@ recompute_syndrome(OUT syndrome_t *syndrome,
ct_t tmp_ct = *ct;
// Adapt the ciphertext
- GUARD(gf2x_add(tmp_ct.val[0].raw, tmp_ct.val[0].raw, splitted_e->val[0].raw,
+ POSIX_GUARD(gf2x_add(tmp_ct.val[0].raw, tmp_ct.val[0].raw, splitted_e->val[0].raw,
R_SIZE));
- GUARD(gf2x_add(tmp_ct.val[1].raw, tmp_ct.val[1].raw, splitted_e->val[1].raw,
+ POSIX_GUARD(gf2x_add(tmp_ct.val[1].raw, tmp_ct.val[1].raw, splitted_e->val[1].raw,
R_SIZE));
// Recompute the syndrome
- GUARD(compute_syndrome(syndrome, &tmp_ct, sk));
+ POSIX_GUARD(compute_syndrome(syndrome, &tmp_ct, sk));
return SUCCESS;
}
@@ -334,7 +334,7 @@ decode(OUT split_e_t *e,
DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw));
find_err1(e, &black_e, &gray_e, &s, sk->wlist, threshold);
- GUARD(recompute_syndrome(&s, ct, sk, e));
+ POSIX_GUARD(recompute_syndrome(&s, ct, sk, e));
#ifdef BGF_DECODER
if(iter >= 1)
{
@@ -346,14 +346,14 @@ decode(OUT split_e_t *e,
DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw));
find_err2(e, &black_e, &s, sk->wlist, ((DV + 1) / 2) + 1);
- GUARD(recompute_syndrome(&s, ct, sk, e));
+ POSIX_GUARD(recompute_syndrome(&s, ct, sk, e));
DMSG(" Weight of e: %lu\n",
r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1]));
DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw));
find_err2(e, &gray_e, &s, sk->wlist, ((DV + 1) / 2) + 1);
- GUARD(recompute_syndrome(&s, ct, sk, e));
+ POSIX_GUARD(recompute_syndrome(&s, ct, sk, e));
}
if(r_bits_vector_weight((r_t *)s.qw) > 0)
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/openssl_utils.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/openssl_utils.c
index 09e0af3fde..c80d3365cb 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/openssl_utils.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/openssl_utils.c
@@ -108,15 +108,15 @@ ossl_add(OUT uint8_t res_bin[R_SIZE],
BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL);
}
- GUARD(ossl_bin2bn(a, a_bin, R_SIZE));
- GUARD(ossl_bin2bn(b, b_bin, R_SIZE));
+ POSIX_GUARD(ossl_bin2bn(a, a_bin, R_SIZE));
+ POSIX_GUARD(ossl_bin2bn(b, b_bin, R_SIZE));
if(BN_GF2m_add(r, a, b) == 0)
{
BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL);
}
- GUARD(ossl_bn2bin(res_bin, r, R_SIZE));
+ POSIX_GUARD(ossl_bn2bin(res_bin, r, R_SIZE));
return SUCCESS;
}
@@ -176,10 +176,10 @@ cyclic_product(OUT uint8_t res_bin[R_SIZE],
BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL);
}
- GUARD(ossl_bin2bn(a, a_bin, R_SIZE));
- GUARD(ossl_bin2bn(b, b_bin, R_SIZE));
- GUARD(ossl_cyclic_product(r, a, b, bn_ctx));
- GUARD(ossl_bn2bin(res_bin, r, R_SIZE));
+ POSIX_GUARD(ossl_bin2bn(a, a_bin, R_SIZE));
+ POSIX_GUARD(ossl_bin2bn(b, b_bin, R_SIZE));
+ POSIX_GUARD(ossl_cyclic_product(r, a, b, bn_ctx));
+ POSIX_GUARD(ossl_bn2bin(res_bin, r, R_SIZE));
return SUCCESS;
}
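
In GF(2)[x], polynomial addition is carry-free XOR, which is all that ossl_add/BN_GF2m_add compute on the R_SIZE-byte operands above. A portable sketch of the same operation (illustrative; the real code routes through OpenSSL BIGNUMs for uniformity with the multiplication path):

    #include <stddef.h>
    #include <stdint.h>

    /* res = a + b over GF(2)[x]; res may alias a or b. */
    static void gf2x_add_sketch(uint8_t *res, const uint8_t *a,
                                const uint8_t *b, size_t size)
    {
        for(size_t i = 0; i < size; i++) {
            res[i] = a[i] ^ b[i];
        }
    }
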
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.c
index 3686338fad..d08fa5eea7 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.c
@@ -20,7 +20,7 @@ get_rand_mod_len(OUT uint32_t * rand_pos,
do
{
// Generate 128bit of random numbers
- GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos)));
+ POSIX_GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos)));
// Mask only relevant bits
(*rand_pos) &= mask;
@@ -56,7 +56,7 @@ sample_uniform_r_bits_with_fixed_prf_context(OUT r_t *r,
IN const must_be_odd_t must_be_odd)
{
// Generate random data
- GUARD(aes_ctr_prf(r->raw, prf_state, R_SIZE));
+ POSIX_GUARD(aes_ctr_prf(r->raw, prf_state, R_SIZE));
// Mask upper bits of the MSByte
r->raw[R_SIZE - 1] &= MASK(R_BITS + 8 - (R_SIZE * 8));
@@ -104,7 +104,7 @@ generate_sparse_rep(OUT uint64_t * a,
// Generate weight rand numbers
do
{
- GUARD(get_rand_mod_len(&wlist[ctr], len, prf_state));
+ POSIX_GUARD(get_rand_mod_len(&wlist[ctr], len, prf_state));
ctr += is_new(wlist, ctr);
} while(ctr < weight);
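
get_rand_mod_len() above draws random bits from the AES-CTR PRF, masks them down to the relevant width, and redraws until the value falls below len; generate_sparse_rep() repeats that until `weight` distinct positions are collected. A self-contained sketch of the same masked rejection loop, with a stand-in RNG (assumption: rand_u32() models the aes_ctr_prf call):

    #include <stdint.h>

    extern uint32_t rand_u32(void); /* stand-in for the AES-CTR PRF */

    /* Return a uniform value in [0, len) by masked rejection sampling. */
    static uint32_t rand_mod_len_sketch(uint32_t len)
    {
        /* An all-ones mask wide enough to cover every value below len. */
        uint32_t mask = 1;
        while(mask < len) { mask = (mask << 1) | 1; }

        uint32_t r;
        do {
            r = rand_u32() & mask; /* keep only the relevant bits */
        } while(r >= len);         /* reject out-of-range draws */
        return r;
    }
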
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.h
index 1ffd56f34a..4ec60683de 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.h
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.h
@@ -53,9 +53,9 @@ sample_uniform_r_bits(OUT r_t *r,
// For the seedexpander
DEFER_CLEANUP(aes_ctr_prf_state_t prf_state = {0}, aes_ctr_prf_state_cleanup);
- GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed));
+ POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed));
- GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd));
+ POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd));
return SUCCESS;
}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/aes_ctr_prf.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/aes_ctr_prf.c
index 26c99bc80d..2f211010df 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/aes_ctr_prf.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/aes_ctr_prf.c
@@ -27,7 +27,7 @@ init_aes_ctr_prf_state(OUT aes_ctr_prf_state_t *s,
bike_static_assert(sizeof(*seed) == sizeof(key.raw), seed_size_equals_ky_size);
memcpy(key.raw, seed->raw, sizeof(key.raw));
- GUARD(aes256_key_expansion(&s->ks_ptr, &key));
+ POSIX_GUARD(aes256_key_expansion(&s->ks_ptr, &key));
// Initialize buffer and counter
s->ctr.u.qw[0] = 0;
@@ -59,7 +59,7 @@ perform_aes(OUT uint8_t *ct, IN OUT aes_ctr_prf_state_t *s)
BIKE_ERROR(E_AES_OVER_USED);
}
- GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks_ptr));
+ POSIX_GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks_ptr));
s->ctr.u.qw[0]++;
s->rem_invokations--;
@@ -91,11 +91,11 @@ aes_ctr_prf(OUT uint8_t *a, IN OUT aes_ctr_prf_state_t *s, IN const uint32_t len
// Copy full AES blocks
while((len - idx) >= AES256_BLOCK_SIZE)
{
- GUARD(perform_aes(&a[idx], s));
+ POSIX_GUARD(perform_aes(&a[idx], s));
idx += AES256_BLOCK_SIZE;
}
- GUARD(perform_aes(s->buffer.u.bytes, s));
+ POSIX_GUARD(perform_aes(s->buffer.u.bytes, s));
// Copy the tail
s->pos = len - idx;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/bike_r2_kem.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/bike_r2_kem.c
index 8f29f3add9..e7797848a0 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/bike_r2_kem.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/bike_r2_kem.c
@@ -61,12 +61,12 @@ calc_pk(OUT pk_t *pk, IN const seed_t *g_seed, IN const pad_sk_t p_sk)
// Intialized padding to zero
DEFER_CLEANUP(padded_r_t g = {0}, padded_r_cleanup);
- GUARD(sample_uniform_r_bits(&g.val, g_seed, MUST_BE_ODD));
+ POSIX_GUARD(sample_uniform_r_bits(&g.val, g_seed, MUST_BE_ODD));
// Calculate (g0, g1) = (g*h1, g*h0)
- GUARD(gf2x_mod_mul((uint64_t *)&p_pk[0], (const uint64_t *)&g,
+ POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_pk[0], (const uint64_t *)&g,
(const uint64_t *)&p_sk[1]));
- GUARD(gf2x_mod_mul((uint64_t *)&p_pk[1], (const uint64_t *)&g,
+ POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_pk[1], (const uint64_t *)&g,
(const uint64_t *)&p_sk[0]));
// Copy the data to the output parameters.
@@ -102,12 +102,12 @@ function_h(OUT split_e_t *splitted_e, IN const r_t *in0, IN const r_t *in1)
// Use the seed to generate a sparse error vector e:
DMSG(" Generating random error.\n");
- GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, &seed_for_hash));
+ POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, &seed_for_hash));
DEFER_CLEANUP(padded_e_t e, padded_e_cleanup);
DEFER_CLEANUP(ALIGN(8) compressed_idx_t_t dummy, compressed_idx_t_cleanup);
- GUARD(generate_sparse_rep((uint64_t *)&e, dummy.val, T1, N_BITS, sizeof(e),
+ POSIX_GUARD(generate_sparse_rep((uint64_t *)&e, dummy.val, T1, N_BITS, sizeof(e),
&prf_state));
split_e(splitted_e, &e.val);
@@ -120,7 +120,7 @@ encrypt(OUT ct_t *ct, OUT split_e_t *mf, IN const pk_t *pk, IN const seed_t *see
DEFER_CLEANUP(padded_r_t m = {0}, padded_r_cleanup);
DMSG(" Sampling m.\n");
- GUARD(sample_uniform_r_bits(&m.val, seed, NO_RESTRICTION));
+ POSIX_GUARD(sample_uniform_r_bits(&m.val, seed, NO_RESTRICTION));
// Pad the public key
pad_pk_t p_pk = {0};
@@ -135,20 +135,20 @@ encrypt(OUT ct_t *ct, OUT split_e_t *mf, IN const pk_t *pk, IN const seed_t *see
DEFER_CLEANUP(dbl_pad_ct_t mf_int = {0}, dbl_pad_ct_cleanup);
DMSG(" Computing m*f0 and m*f1.\n");
- GUARD(
+ POSIX_GUARD(
gf2x_mod_mul((uint64_t *)&mf_int[0], (uint64_t *)&m, (uint64_t *)&p_pk[0]));
- GUARD(
+ POSIX_GUARD(
gf2x_mod_mul((uint64_t *)&mf_int[1], (uint64_t *)&m, (uint64_t *)&p_pk[1]));
DEFER_CLEANUP(split_e_t splitted_e, split_e_cleanup);
DMSG(" Computing the hash function e <- H(m*f0, m*f1).\n");
- GUARD(function_h(&splitted_e, &mf_int[0].val, &mf_int[1].val));
+ POSIX_GUARD(function_h(&splitted_e, &mf_int[0].val, &mf_int[1].val));
DMSG(" Addding Error to the ciphertext.\n");
- GUARD(gf2x_add(p_ct[0].val.raw, mf_int[0].val.raw, splitted_e.val[0].raw,
+ POSIX_GUARD(gf2x_add(p_ct[0].val.raw, mf_int[0].val.raw, splitted_e.val[0].raw,
R_SIZE));
- GUARD(gf2x_add(p_ct[1].val.raw, mf_int[1].val.raw, splitted_e.val[1].raw,
+ POSIX_GUARD(gf2x_add(p_ct[1].val.raw, mf_int[1].val.raw, splitted_e.val[1].raw,
R_SIZE));
// Copy the data to the output parameters.
@@ -174,11 +174,11 @@ reencrypt(OUT pad_ct_t ce,
IN const ct_t *l_ct)
{
// Compute (c0 + e0') and (c1 + e1')
- GUARD(gf2x_add(ce[0].val.raw, l_ct->val[0].raw, e->val[0].raw, R_SIZE));
- GUARD(gf2x_add(ce[1].val.raw, l_ct->val[1].raw, e->val[1].raw, R_SIZE));
+ POSIX_GUARD(gf2x_add(ce[0].val.raw, l_ct->val[0].raw, e->val[0].raw, R_SIZE));
+ POSIX_GUARD(gf2x_add(ce[1].val.raw, l_ct->val[1].raw, e->val[1].raw, R_SIZE));
// (e0'', e1'') <-- H(c0 + e0', c1 + e1')
- GUARD(function_h(e2, &ce[0].val, &ce[1].val));
+ POSIX_GUARD(function_h(e2, &ce[0].val, &ce[1].val));
return SUCCESS;
}
@@ -212,10 +212,10 @@ get_ss(OUT ss_t *out, IN const r_t *in0, IN const r_t *in1, IN const ct_t *ct)
int
BIKE1_L1_R2_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk)
{
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
- notnull_check(sk);
- notnull_check(pk);
+ POSIX_ENSURE_REF(sk);
+ POSIX_ENSURE_REF(pk);
// Convert to this implementation types
pk_t *l_pk = (pk_t *)pk;
@@ -232,27 +232,27 @@ BIKE1_L1_R2_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk)
DEFER_CLEANUP(pad_sk_t p_sk = {0}, pad_sk_cleanup);
// Get the entropy seeds.
- GUARD(get_seeds(&seeds));
+ POSIX_GUARD(get_seeds(&seeds));
DMSG(" Enter crypto_kem_keypair.\n");
DMSG(" Calculating the secret key.\n");
// h0 and h1 use the same context
- GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0]));
+ POSIX_GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0]));
// sigma0/1/2 use the same context.
- GUARD(init_aes_ctr_prf_state(&s_prf_state, MAX_AES_INVOKATION, &seeds.seed[2]));
+ POSIX_GUARD(init_aes_ctr_prf_state(&s_prf_state, MAX_AES_INVOKATION, &seeds.seed[2]));
- GUARD(generate_sparse_rep((uint64_t *)&p_sk[0], l_sk.wlist[0].val, DV, R_BITS,
+ POSIX_GUARD(generate_sparse_rep((uint64_t *)&p_sk[0], l_sk.wlist[0].val, DV, R_BITS,
sizeof(p_sk[0]), &h_prf_state));
// Sample the sigmas
- GUARD(sample_uniform_r_bits_with_fixed_prf_context(&l_sk.sigma0, &s_prf_state,
+ POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(&l_sk.sigma0, &s_prf_state,
NO_RESTRICTION));
- GUARD(sample_uniform_r_bits_with_fixed_prf_context(&l_sk.sigma1, &s_prf_state,
+ POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(&l_sk.sigma1, &s_prf_state,
NO_RESTRICTION));
- GUARD(generate_sparse_rep((uint64_t *)&p_sk[1], l_sk.wlist[1].val, DV, R_BITS,
+ POSIX_GUARD(generate_sparse_rep((uint64_t *)&p_sk[1], l_sk.wlist[1].val, DV, R_BITS,
sizeof(p_sk[1]), &h_prf_state));
// Copy data
@@ -261,7 +261,7 @@ BIKE1_L1_R2_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk)
DMSG(" Calculating the public key.\n");
- GUARD(calc_pk(l_pk, &seeds.seed[1], p_sk));
+ POSIX_GUARD(calc_pk(l_pk, &seeds.seed[1], p_sk));
memcpy(sk, &l_sk, sizeof(l_sk));
@@ -286,29 +286,29 @@ BIKE1_L1_R2_crypto_kem_enc(OUT unsigned char * ct,
IN const unsigned char *pk)
{
DMSG(" Enter crypto_kem_enc.\n");
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
// Convert to the types that are used by this implementation
const pk_t *l_pk = (const pk_t *)pk;
ct_t * l_ct = (ct_t *)ct;
ss_t * l_ss = (ss_t *)ss;
- notnull_check(pk);
- notnull_check(ct);
- notnull_check(ss);
+ POSIX_ENSURE_REF(pk);
+ POSIX_ENSURE_REF(ct);
+ POSIX_ENSURE_REF(ss);
// For NIST DRBG_CTR
DEFER_CLEANUP(seeds_t seeds = {0}, seeds_cleanup);
// Get the entropy seeds.
- GUARD(get_seeds(&seeds));
+ POSIX_GUARD(get_seeds(&seeds));
DMSG(" Encrypting.\n");
// In fact, seed[0] should be used.
// Here, we stay consistent with BIKE's reference code
// that chooses the seconde seed.
DEFER_CLEANUP(split_e_t mf, split_e_cleanup);
- GUARD(encrypt(l_ct, &mf, l_pk, &seeds.seed[1]));
+ POSIX_GUARD(encrypt(l_ct, &mf, l_pk, &seeds.seed[1]));
DMSG(" Generating shared secret.\n");
get_ss(l_ss, &mf.val[0], &mf.val[1], l_ct);
@@ -327,14 +327,14 @@ BIKE1_L1_R2_crypto_kem_dec(OUT unsigned char * ss,
IN const unsigned char *sk)
{
DMSG(" Enter crypto_kem_dec.\n");
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
// Convert to the types used by this implementation
const ct_t *l_ct = (const ct_t *)ct;
ss_t * l_ss = (ss_t *)ss;
- notnull_check(sk);
- notnull_check(ct);
- notnull_check(ss);
+ POSIX_ENSURE_REF(sk);
+ POSIX_ENSURE_REF(ct);
+ POSIX_ENSURE_REF(ss);
DEFER_CLEANUP(ALIGN(8) sk_t l_sk, sk_cleanup);
memcpy(&l_sk, sk, sizeof(l_sk));
@@ -344,14 +344,14 @@ BIKE1_L1_R2_crypto_kem_dec(OUT unsigned char * ss,
DEFER_CLEANUP(split_e_t e, split_e_cleanup);
DMSG(" Computing s.\n");
- GUARD(compute_syndrome(&syndrome, l_ct, &l_sk));
+ POSIX_GUARD(compute_syndrome(&syndrome, l_ct, &l_sk));
DMSG(" Decoding.\n");
uint32_t dec_ret = decode(&e, &syndrome, l_ct, &l_sk) != SUCCESS ? 0 : 1;
DEFER_CLEANUP(split_e_t e2, split_e_cleanup);
DEFER_CLEANUP(pad_ct_t ce, pad_ct_cleanup);
- GUARD(reencrypt(ce, &e2, &e, l_ct));
+ POSIX_GUARD(reencrypt(ce, &e2, &e, l_ct));
// Check if the decoding is successful.
// Check if the error weight equals T1.
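
The reencrypt() step above relies on GF(2) addition (XOR) being its own inverse: decapsulation adds the decoded error back onto the ciphertext, ce = ct + e' = (mf + e) + e', so when e' = e the error cancels and ce recovers mf exactly. A one-byte illustration of that identity (not part of the commit):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint8_t mf = 0x5A, e = 0x33;
        uint8_t ct = mf ^ e;      /* encrypt: add the error vector     */
        assert((ct ^ e) == mf);   /* re-add the same error: it cancels */
        return 0;
    }
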
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/decode.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/decode.c
index 404c6377da..b455cd7e82 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/decode.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/decode.c
@@ -96,12 +96,12 @@ compute_syndrome(OUT syndrome_t *syndrome, IN const ct_t *ct, IN const sk_t *sk)
pad_ct[1].val = ct->val[1];
// Compute s = c0*h0 + c1*h1:
- GUARD(gf2x_mod_mul((uint64_t *)&pad_s[0], (uint64_t *)&pad_ct[0],
+ POSIX_GUARD(gf2x_mod_mul((uint64_t *)&pad_s[0], (uint64_t *)&pad_ct[0],
(uint64_t *)&pad_sk[0]));
- GUARD(gf2x_mod_mul((uint64_t *)&pad_s[1], (uint64_t *)&pad_ct[1],
+ POSIX_GUARD(gf2x_mod_mul((uint64_t *)&pad_s[1], (uint64_t *)&pad_ct[1],
(uint64_t *)&pad_sk[1]));
- GUARD(gf2x_add(pad_s[0].val.raw, pad_s[0].val.raw, pad_s[1].val.raw, R_SIZE));
+ POSIX_GUARD(gf2x_add(pad_s[0].val.raw, pad_s[0].val.raw, pad_s[1].val.raw, R_SIZE));
memcpy((uint8_t *)syndrome->qw, pad_s[0].val.raw, R_SIZE);
dup(syndrome);
@@ -118,13 +118,13 @@ recompute_syndrome(OUT syndrome_t *syndrome,
ct_t tmp_ct = *ct;
// Adapt the ciphertext
- GUARD(gf2x_add(tmp_ct.val[0].raw, tmp_ct.val[0].raw, splitted_e->val[0].raw,
+ POSIX_GUARD(gf2x_add(tmp_ct.val[0].raw, tmp_ct.val[0].raw, splitted_e->val[0].raw,
R_SIZE));
- GUARD(gf2x_add(tmp_ct.val[1].raw, tmp_ct.val[1].raw, splitted_e->val[1].raw,
+ POSIX_GUARD(gf2x_add(tmp_ct.val[1].raw, tmp_ct.val[1].raw, splitted_e->val[1].raw,
R_SIZE));
// Recompute the syndrome
- GUARD(compute_syndrome(syndrome, &tmp_ct, sk));
+ POSIX_GUARD(compute_syndrome(syndrome, &tmp_ct, sk));
return SUCCESS;
}
@@ -334,7 +334,7 @@ decode(OUT split_e_t *e,
DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw));
find_err1(e, &black_e, &gray_e, &s, sk->wlist, threshold);
- GUARD(recompute_syndrome(&s, ct, sk, e));
+ POSIX_GUARD(recompute_syndrome(&s, ct, sk, e));
#ifdef BGF_DECODER
if(iter >= 1)
{
@@ -346,14 +346,14 @@ decode(OUT split_e_t *e,
DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw));
find_err2(e, &black_e, &s, sk->wlist, ((DV + 1) / 2) + 1);
- GUARD(recompute_syndrome(&s, ct, sk, e));
+ POSIX_GUARD(recompute_syndrome(&s, ct, sk, e));
DMSG(" Weight of e: %lu\n",
r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1]));
DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw));
find_err2(e, &gray_e, &s, sk->wlist, ((DV + 1) / 2) + 1);
- GUARD(recompute_syndrome(&s, ct, sk, e));
+ POSIX_GUARD(recompute_syndrome(&s, ct, sk, e));
}
if(r_bits_vector_weight((r_t *)s.qw) > 0)
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/openssl_utils.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/openssl_utils.c
index 09e0af3fde..c80d3365cb 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/openssl_utils.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/openssl_utils.c
@@ -108,15 +108,15 @@ ossl_add(OUT uint8_t res_bin[R_SIZE],
BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL);
}
- GUARD(ossl_bin2bn(a, a_bin, R_SIZE));
- GUARD(ossl_bin2bn(b, b_bin, R_SIZE));
+ POSIX_GUARD(ossl_bin2bn(a, a_bin, R_SIZE));
+ POSIX_GUARD(ossl_bin2bn(b, b_bin, R_SIZE));
if(BN_GF2m_add(r, a, b) == 0)
{
BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL);
}
- GUARD(ossl_bn2bin(res_bin, r, R_SIZE));
+ POSIX_GUARD(ossl_bn2bin(res_bin, r, R_SIZE));
return SUCCESS;
}
@@ -176,10 +176,10 @@ cyclic_product(OUT uint8_t res_bin[R_SIZE],
BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL);
}
- GUARD(ossl_bin2bn(a, a_bin, R_SIZE));
- GUARD(ossl_bin2bn(b, b_bin, R_SIZE));
- GUARD(ossl_cyclic_product(r, a, b, bn_ctx));
- GUARD(ossl_bn2bin(res_bin, r, R_SIZE));
+ POSIX_GUARD(ossl_bin2bn(a, a_bin, R_SIZE));
+ POSIX_GUARD(ossl_bin2bn(b, b_bin, R_SIZE));
+ POSIX_GUARD(ossl_cyclic_product(r, a, b, bn_ctx));
+ POSIX_GUARD(ossl_bn2bin(res_bin, r, R_SIZE));
return SUCCESS;
}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.c
index 3686338fad..d08fa5eea7 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.c
@@ -20,7 +20,7 @@ get_rand_mod_len(OUT uint32_t * rand_pos,
do
{
// Generate 128bit of random numbers
- GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos)));
+ POSIX_GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos)));
// Mask only relevant bits
(*rand_pos) &= mask;
@@ -56,7 +56,7 @@ sample_uniform_r_bits_with_fixed_prf_context(OUT r_t *r,
IN const must_be_odd_t must_be_odd)
{
// Generate random data
- GUARD(aes_ctr_prf(r->raw, prf_state, R_SIZE));
+ POSIX_GUARD(aes_ctr_prf(r->raw, prf_state, R_SIZE));
// Mask upper bits of the MSByte
r->raw[R_SIZE - 1] &= MASK(R_BITS + 8 - (R_SIZE * 8));
@@ -104,7 +104,7 @@ generate_sparse_rep(OUT uint64_t * a,
// Generate weight rand numbers
do
{
- GUARD(get_rand_mod_len(&wlist[ctr], len, prf_state));
+ POSIX_GUARD(get_rand_mod_len(&wlist[ctr], len, prf_state));
ctr += is_new(wlist, ctr);
} while(ctr < weight);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.h
index 1ffd56f34a..4ec60683de 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.h
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.h
@@ -53,9 +53,9 @@ sample_uniform_r_bits(OUT r_t *r,
// For the seedexpander
DEFER_CLEANUP(aes_ctr_prf_state_t prf_state = {0}, aes_ctr_prf_state_cleanup);
- GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed));
+ POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed));
- GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd));
+ POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd));
return SUCCESS;
}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/LICENSE b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/LICENSE
new file mode 100644
index 0000000000..7a4a3ea242
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
\ No newline at end of file
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes.h
new file mode 100644
index 0000000000..b8b04c3655
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes.h
@@ -0,0 +1,62 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0"
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+#include <openssl/evp.h>
+
+#include "cleanup.h"
+
+#define MAX_AES_INVOKATION (MASK(32))
+
+#define AES256_KEY_BYTES (32U)
+#define AES256_KEY_BITS (AES256_KEY_BYTES * 8)
+#define AES256_BLOCK_BYTES (16U)
+#define AES256_ROUNDS (14U)
+
+typedef ALIGN(16) struct aes256_key_s {
+ uint8_t raw[AES256_KEY_BYTES];
+} aes256_key_t;
+
+CLEANUP_FUNC(aes256_key, aes256_key_t)
+
+// Using OpenSSL structures
+typedef EVP_CIPHER_CTX *aes256_ks_t;
+
+_INLINE_ ret_t aes256_key_expansion(OUT aes256_ks_t *ks,
+ IN const aes256_key_t *key)
+{
+ *ks = EVP_CIPHER_CTX_new();
+ if(*ks == NULL) {
+ BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL);
+ }
+ if(0 == EVP_EncryptInit_ex(*ks, EVP_aes_256_ecb(), NULL, key->raw, NULL)) {
+ EVP_CIPHER_CTX_free(*ks);
+ BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL);
+ }
+
+ EVP_CIPHER_CTX_set_padding(*ks, 0);
+
+ return SUCCESS;
+}
+
+_INLINE_ ret_t aes256_enc(OUT uint8_t *ct,
+ IN const uint8_t *pt,
+ IN const aes256_ks_t *ks)
+{
+ int outlen = 0;
+ if(0 == EVP_EncryptUpdate(*ks, ct, &outlen, pt, AES256_BLOCK_BYTES)) {
+ BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL);
+ }
+ return SUCCESS;
+}
+
+_INLINE_ void aes256_free_ks(OUT aes256_ks_t *ks)
+{
+ EVP_CIPHER_CTX_free(*ks);
+ ks = NULL;
+}
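
A minimal illustrative caller for the wrappers above (not part of the commit; error paths simplified). aes256_enc() performs exactly one ECB block, which is what the CTR construction in aes_ctr_prf.c needs:

    ret_t encrypt_one_block_example(OUT uint8_t ct[AES256_BLOCK_BYTES],
                                    IN const uint8_t pt[AES256_BLOCK_BYTES],
                                    IN const aes256_key_t *key)
    {
        aes256_ks_t ks = NULL;

        POSIX_GUARD(aes256_key_expansion(&ks, key));
        ret_t ret = aes256_enc(ct, pt, &ks); /* one 16-byte ECB block */
        aes256_free_ks(&ks);

        return ret;
    }
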
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.c
new file mode 100644
index 0000000000..9b50469ef1
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.c
@@ -0,0 +1,97 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0"
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#include "aes_ctr_prf.h"
+#include "utilities.h"
+
+ret_t init_aes_ctr_prf_state(OUT aes_ctr_prf_state_t *s,
+ IN const uint32_t max_invokations,
+ IN const seed_t *seed)
+{
+ if(0 == max_invokations) {
+ BIKE_ERROR(E_AES_CTR_PRF_INIT_FAIL);
+ }
+
+ // Set the key schedule (from seed).
+ // Make sure the size matches the AES256 key size.
+ DEFER_CLEANUP(aes256_key_t key, aes256_key_cleanup);
+
+ bike_static_assert(sizeof(*seed) == sizeof(key.raw), seed_size_equals_ky_size);
+ bike_memcpy(key.raw, seed->raw, sizeof(key.raw));
+
+ POSIX_GUARD(aes256_key_expansion(&s->ks, &key));
+
+ // Initialize buffer and counter
+ s->ctr.u.qw[0] = 0;
+ s->ctr.u.qw[1] = 0;
+ s->buffer.u.qw[0] = 0;
+ s->buffer.u.qw[1] = 0;
+
+ s->pos = AES256_BLOCK_BYTES;
+ s->rem_invokations = max_invokations;
+
+ DMSG(" Init aes_prf_ctr state:\n");
+ DMSG(" s.pos = %d\n", s->pos);
+ DMSG(" s.rem_invokations = %u\n", s->rem_invokations);
+
+ return SUCCESS;
+}
+
+_INLINE_ ret_t perform_aes(OUT uint8_t *ct, IN OUT aes_ctr_prf_state_t *s)
+{
+ // Ensure that the CTR is large enough
+ bike_static_assert(
+ ((sizeof(s->ctr.u.qw[0]) == 8) && (BIT(33) >= MAX_AES_INVOKATION)),
+ ctr_size_is_too_small);
+
+ if(0 == s->rem_invokations) {
+ BIKE_ERROR(E_AES_OVER_USED);
+ }
+
+ POSIX_GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks));
+
+ s->ctr.u.qw[0]++;
+ s->rem_invokations--;
+
+ return SUCCESS;
+}
+
+ret_t aes_ctr_prf(OUT uint8_t *a,
+ IN OUT aes_ctr_prf_state_t *s,
+ IN const uint32_t len)
+{
+ // When Len is smaller than use what's left in the buffer,
+ // there is no need for additional AES invocations.
+ if((len + s->pos) <= AES256_BLOCK_BYTES) {
+ bike_memcpy(a, &s->buffer.u.bytes[s->pos], len);
+ s->pos += len;
+
+ return SUCCESS;
+ }
+
+ // If s.pos != AES256_BLOCK_BYTES then copy what's left in the buffer.
+ // Else copy zero bytes
+ uint32_t idx = AES256_BLOCK_BYTES - s->pos;
+ bike_memcpy(a, &s->buffer.u.bytes[s->pos], idx);
+
+ // Init s.pos
+ s->pos = 0;
+
+ // Copy full AES blocks
+ while((len - idx) >= AES256_BLOCK_BYTES) {
+ POSIX_GUARD(perform_aes(&a[idx], s));
+ idx += AES256_BLOCK_BYTES;
+ }
+
+ POSIX_GUARD(perform_aes(s->buffer.u.bytes, s));
+
+ // Copy the tail
+ s->pos = len - idx;
+ bike_memcpy(&a[idx], s->buffer.u.bytes, s->pos);
+
+ return SUCCESS;
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.h
new file mode 100644
index 0000000000..684a52a6fc
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.h
@@ -0,0 +1,43 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0"
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+#include "aes.h"
+
+//////////////////////////////
+// Types
+/////////////////////////////
+
+typedef struct aes_ctr_prf_state_s {
+ uint128_t ctr;
+ uint128_t buffer;
+ aes256_ks_t ks;
+ uint32_t rem_invokations;
+ uint8_t pos;
+} aes_ctr_prf_state_t;
+
+//////////////////////////////
+// Methods
+/////////////////////////////
+
+ret_t init_aes_ctr_prf_state(OUT aes_ctr_prf_state_t *s,
+ IN uint32_t max_invokations,
+ IN const seed_t *seed);
+
+ret_t aes_ctr_prf(OUT uint8_t *a, IN OUT aes_ctr_prf_state_t *s, IN uint32_t len);
+
+_INLINE_ void finalize_aes_ctr_prf(IN OUT aes_ctr_prf_state_t *s)
+{
+ aes256_free_ks(&s->ks);
+ secure_clean((uint8_t *)s, sizeof(*s));
+}
+
+_INLINE_ void aes_ctr_prf_state_cleanup(IN OUT aes_ctr_prf_state_t *s)
+{
+ finalize_aes_ctr_prf(s);
+}
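
Putting the two new files above together, a sketch of how a caller streams pseudorandom bytes out of the PRF (illustrative only; seed handling elided, all names taken from the headers shown):

    ret_t prf_usage_sketch(IN const seed_t *seed)
    {
        DEFER_CLEANUP(aes_ctr_prf_state_t s = {0}, aes_ctr_prf_state_cleanup);
        uint8_t out[100];

        POSIX_GUARD(init_aes_ctr_prf_state(&s, MAX_AES_INVOKATION, seed));

        /* Draws may be any length; partial blocks are served from s.buffer,
         * and the AES counter only advances when the buffer is refilled. */
        POSIX_GUARD(aes_ctr_prf(out, &s, sizeof(out)));

        return SUCCESS;
    }
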
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_defs.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_defs.h
new file mode 100644
index 0000000000..697efd0627
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_defs.h
@@ -0,0 +1,91 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0"
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+#include "defs.h"
+
+////////////////////////////////////////////
+// BIKE Parameters
+///////////////////////////////////////////
+#define N0 2
+
+#if !defined(LEVEL)
+# define LEVEL 1
+#endif
+
+#if(LEVEL == 3)
+# define R_BITS 24659
+# define DV 103
+# define T1 199
+
+# define THRESHOLD_COEFF0 15.2588
+# define THRESHOLD_COEFF1 0.005265
+# define THRESHOLD_MIN 52
+
+// The gf2m code is optimized to a block in this case:
+# define BLOCK_BITS 32768
+#elif(LEVEL == 1)
+// 64-bits of post-quantum security parameters (BIKE paper):
+# define R_BITS 12323
+# define DV 71
+# define T1 134
+
+# define THRESHOLD_COEFF0 13.530
+# define THRESHOLD_COEFF1 0.0069722
+# define THRESHOLD_MIN 36
+
+// The gf2x code is optimized to a block in this case:
+# define BLOCK_BITS (16384)
+#else
+# error "Bad level, choose one of 1/3/5"
+#endif
+
+#define NUM_OF_SEEDS 2
+
+// Round the size to the nearest byte.
+// SIZE suffix, is the number of bytes (uint8_t).
+#define N_BITS (R_BITS * N0)
+#define R_BYTES DIVIDE_AND_CEIL(R_BITS, 8)
+#define R_QWORDS DIVIDE_AND_CEIL(R_BITS, 8 * BYTES_IN_QWORD)
+#define R_XMM DIVIDE_AND_CEIL(R_BITS, 8 * BYTES_IN_XMM)
+#define R_YMM DIVIDE_AND_CEIL(R_BITS, 8 * BYTES_IN_YMM)
+#define R_ZMM DIVIDE_AND_CEIL(R_BITS, 8 * BYTES_IN_ZMM)
+
+#define R_BLOCKS DIVIDE_AND_CEIL(R_BITS, BLOCK_BITS)
+#define R_PADDED (R_BLOCKS * BLOCK_BITS)
+#define R_PADDED_BYTES (R_PADDED / 8)
+#define R_PADDED_QWORDS (R_PADDED / 64)
+
+#define LAST_R_QWORD_LEAD (R_BITS & MASK(6))
+#define LAST_R_QWORD_TRAIL (64 - LAST_R_QWORD_LEAD)
+#define LAST_R_QWORD_MASK MASK(LAST_R_QWORD_LEAD)
+
+#define LAST_R_BYTE_LEAD (R_BITS & MASK(3))
+#define LAST_R_BYTE_TRAIL (8 - LAST_R_BYTE_LEAD)
+#define LAST_R_BYTE_MASK MASK(LAST_R_BYTE_LEAD)
+
+// Data alignment
+#define ALIGN_BYTES (BYTES_IN_ZMM)
+
+#define M_BITS 256
+#define M_BYTES (M_BITS / 8)
+
+#define SS_BITS 256
+#define SS_BYTES (SS_BITS / 8)
+
+#define SEED_BYTES (256 / 8)
+
+//////////////////////////////////
+// Parameters for the BGF decoder.
+//////////////////////////////////
+#define BGF_DECODER
+#define DELTA 3
+#define SLICES (LOG2_MSB(DV) + 1)
+
+// GF2X inversion can only handle R < 32768
+bike_static_assert((R_BITS < 32768), r_too_large_for_inversion);
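+
+// Worked example of the derived sizes for LEVEL == 1 (R_BITS = 12323):
+//   R_BYTES           = ceil(12323 / 8)     = 1541
+//   R_QWORDS          = ceil(12323 / 64)    = 193
+//   R_BLOCKS          = ceil(12323 / 16384) = 1, so R_PADDED = 16384
+//   LAST_R_QWORD_LEAD = 12323 % 64 = 35, LAST_R_QWORD_TRAIL = 29
+//   LAST_R_BYTE_LEAD  = 12323 % 8  = 3,  LAST_R_BYTE_TRAIL  = 5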
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_r3_kem.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_r3_kem.c
new file mode 100644
index 0000000000..328bb52db8
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_r3_kem.c
@@ -0,0 +1,288 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron, and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#include "decode.h"
+#include "gf2x.h"
+#include "sampling.h"
+#include "sha.h"
+#include "tls/s2n_kem.h"
+#include "pq-crypto/s2n_pq.h"
+
+// m_t and seed_t have the same size and thus can be considered
+// to be of the same type. However, for security reasons we distinguish
+// these types, even at the cost of a little extra complexity.
+_INLINE_ void convert_seed_to_m_type(OUT m_t *m, IN const seed_t *seed)
+{
+ bike_static_assert(sizeof(*m) == sizeof(*seed), m_size_eq_seed_size);
+ bike_memcpy(m->raw, seed->raw, sizeof(*m));
+}
+
+_INLINE_ void convert_m_to_seed_type(OUT seed_t *seed, IN const m_t *m)
+{
+ bike_static_assert(sizeof(*m) == sizeof(*seed), m_size_eq_seed_size);
+ bike_memcpy(seed->raw, m->raw, sizeof(*seed));
+}
+
+// (e0, e1) = H(m)
+_INLINE_ ret_t function_h(OUT pad_e_t *e, IN const m_t *m)
+{
+ DEFER_CLEANUP(seed_t seed = {0}, seed_cleanup);
+
+ convert_m_to_seed_type(&seed, m);
+ return generate_error_vector(e, &seed);
+}
+
+// out = L(e)
+_INLINE_ ret_t function_l(OUT m_t *out, IN const pad_e_t *e)
+{
+ DEFER_CLEANUP(sha_dgst_t dgst = {0}, sha_dgst_cleanup);
+ DEFER_CLEANUP(e_t tmp, e_cleanup);
+
+ // Take the padding away
+ tmp.val[0] = e->val[0].val;
+ tmp.val[1] = e->val[1].val;
+
+ POSIX_GUARD(sha(&dgst, sizeof(tmp), (uint8_t *)&tmp));
+
+  // Truncate the SHA384 digest to a 256-bit m_t
+ bike_static_assert(sizeof(dgst) >= sizeof(*out), dgst_size_lt_m_size);
+ bike_memcpy(out->raw, dgst.u.raw, sizeof(*out));
+
+ return SUCCESS;
+}
+
+// Generate the Shared Secret K(m, c0, c1)
+_INLINE_ ret_t function_k(OUT ss_t *out, IN const m_t *m, IN const ct_t *ct)
+{
+ DEFER_CLEANUP(func_k_t tmp, func_k_cleanup);
+ DEFER_CLEANUP(sha_dgst_t dgst = {0}, sha_dgst_cleanup);
+
+ // Copy every element, padded to the nearest byte
+ tmp.m = *m;
+ tmp.c0 = ct->c0;
+ tmp.c1 = ct->c1;
+
+ POSIX_GUARD(sha(&dgst, sizeof(tmp), (uint8_t *)&tmp));
+
+  // Truncate the SHA384 digest to a 256-bit value
+ // to subsequently use it as a seed.
+ bike_static_assert(sizeof(dgst) >= sizeof(*out), dgst_size_lt_out_size);
+ bike_memcpy(out->raw, dgst.u.raw, sizeof(*out));
+
+ return SUCCESS;
+}
+
+_INLINE_ ret_t encrypt(OUT ct_t *ct,
+ IN const pad_e_t *e,
+ IN const pk_t *pk,
+ IN const m_t *m)
+{
+ // Pad the public key and the ciphertext
+ pad_r_t p_ct = {0};
+ pad_r_t p_pk = {0};
+ p_pk.val = *pk;
+
+ // Generate the ciphertext
+ // ct = pk * e1 + e0
+ gf2x_mod_mul(&p_ct, &e->val[1], &p_pk);
+ gf2x_mod_add(&p_ct, &p_ct, &e->val[0]);
+
+ ct->c0 = p_ct.val;
+
+ // c1 = L(e0, e1)
+ POSIX_GUARD(function_l(&ct->c1, e));
+
+ // m xor L(e0, e1)
+ for(size_t i = 0; i < sizeof(*m); i++) {
+ ct->c1.raw[i] ^= m->raw[i];
+ }
+
+ return SUCCESS;
+}
+
+_INLINE_ ret_t reencrypt(OUT m_t *m, IN const pad_e_t *e, IN const ct_t *l_ct)
+{
+ DEFER_CLEANUP(m_t tmp, m_cleanup);
+
+ POSIX_GUARD(function_l(&tmp, e));
+
+ // m' = c1 ^ L(e')
+ for(size_t i = 0; i < sizeof(*m); i++) {
+ m->raw[i] = tmp.raw[i] ^ l_ct->c1.raw[i];
+ }
+
+ return SUCCESS;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// The three APIs below (keypair, encapsulate, decapsulate) are defined by NIST:
+////////////////////////////////////////////////////////////////////////////////
+int BIKE_L1_R3_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk)
+{
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE_REF(sk);
+ POSIX_ENSURE_REF(pk);
+
+ DEFER_CLEANUP(aligned_sk_t l_sk = {0}, sk_cleanup);
+
+ // The secret key is (h0, h1),
+  // and the public key is h = (h0^-1 * h1).
+ // Padded structures are used internally, and are required by the
+ // decoder and the gf2x multiplication.
+ DEFER_CLEANUP(pad_r_t h0 = {0}, pad_r_cleanup);
+ DEFER_CLEANUP(pad_r_t h1 = {0}, pad_r_cleanup);
+ DEFER_CLEANUP(pad_r_t h0inv = {0}, pad_r_cleanup);
+ DEFER_CLEANUP(pad_r_t h = {0}, pad_r_cleanup);
+
+ // The randomness of the key generation
+ DEFER_CLEANUP(seeds_t seeds = {0}, seeds_cleanup);
+
+ // An AES_PRF state for the secret key
+ DEFER_CLEANUP(aes_ctr_prf_state_t h_prf_state = {0}, aes_ctr_prf_state_cleanup);
+
+ POSIX_GUARD(get_seeds(&seeds));
+ POSIX_GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0]));
+
+ // Generate the secret key (h0, h1) with weight w/2
+ POSIX_GUARD(generate_sparse_rep(&h0, l_sk.wlist[0].val, &h_prf_state));
+ POSIX_GUARD(generate_sparse_rep(&h1, l_sk.wlist[1].val, &h_prf_state));
+
+ // Generate sigma
+ convert_seed_to_m_type(&l_sk.sigma, &seeds.seed[1]);
+
+ // Calculate the public key
+ gf2x_mod_inv(&h0inv, &h0);
+ gf2x_mod_mul(&h, &h1, &h0inv);
+
+  // Fill the secret key data structure, discarding the padding
+ l_sk.bin[0] = h0.val;
+ l_sk.bin[1] = h1.val;
+ l_sk.pk = h.val;
+
+ // Copy the data to the output buffers
+ bike_memcpy(sk, &l_sk, sizeof(l_sk));
+ bike_memcpy(pk, &l_sk.pk, sizeof(l_sk.pk));
+
+ return SUCCESS;
+}
+
+// Encapsulate - pk is the public key,
+// ct is a key encapsulation message (ciphertext),
+// ss is the shared secret.
+int BIKE_L1_R3_crypto_kem_enc(OUT unsigned char * ct,
+ OUT unsigned char * ss,
+ IN const unsigned char *pk)
+{
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE_REF(pk);
+ POSIX_ENSURE_REF(ct);
+ POSIX_ENSURE_REF(ss);
+
+ // Public values (they do not require cleanup on exit).
+ pk_t l_pk;
+ ct_t l_ct;
+
+ DEFER_CLEANUP(m_t m, m_cleanup);
+ DEFER_CLEANUP(ss_t l_ss, ss_cleanup);
+ DEFER_CLEANUP(seeds_t seeds = {0}, seeds_cleanup);
+ DEFER_CLEANUP(pad_e_t e, pad_e_cleanup);
+
+ // Copy the data from the input buffer. This is required in order to avoid
+ // alignment issues on non x86_64 processors.
+ bike_memcpy(&l_pk, pk, sizeof(l_pk));
+
+ POSIX_GUARD(get_seeds(&seeds));
+
+ // e = H(m) = H(seed[0])
+ convert_seed_to_m_type(&m, &seeds.seed[0]);
+ POSIX_GUARD(function_h(&e, &m));
+
+ // Calculate the ciphertext
+ POSIX_GUARD(encrypt(&l_ct, &e, &l_pk, &m));
+
+ // Generate the shared secret
+ POSIX_GUARD(function_k(&l_ss, &m, &l_ct));
+
+ // Copy the data to the output buffers
+ bike_memcpy(ct, &l_ct, sizeof(l_ct));
+ bike_memcpy(ss, &l_ss, sizeof(l_ss));
+
+ return SUCCESS;
+}
+
+// Decapsulate - ct is a key encapsulation message (ciphertext),
+// sk is the private key,
+// ss is the shared secret
+int BIKE_L1_R3_crypto_kem_dec(OUT unsigned char * ss,
+ IN const unsigned char *ct,
+ IN const unsigned char *sk)
+{
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE_REF(sk);
+ POSIX_ENSURE_REF(ct);
+ POSIX_ENSURE_REF(ss);
+
+  // Public values (they do not require cleanup on exit)
+ ct_t l_ct;
+
+ DEFER_CLEANUP(seeds_t seeds = {0}, seeds_cleanup);
+
+ DEFER_CLEANUP(ss_t l_ss, ss_cleanup);
+ DEFER_CLEANUP(aligned_sk_t l_sk, sk_cleanup);
+ DEFER_CLEANUP(e_t e, e_cleanup);
+ DEFER_CLEANUP(m_t m_prime, m_cleanup);
+ DEFER_CLEANUP(pad_e_t e_tmp, pad_e_cleanup);
+ DEFER_CLEANUP(pad_e_t e_prime, pad_e_cleanup);
+
+ // Copy the data from the input buffers. This is required in order to avoid
+ // alignment issues on non x86_64 processors.
+ bike_memcpy(&l_ct, ct, sizeof(l_ct));
+ bike_memcpy(&l_sk, sk, sizeof(l_sk));
+
+  // Generate a random error vector to be used in case of decoding failure
+  // (note: a fixed, zeroed error vector might suffice as well).
+ POSIX_GUARD(get_seeds(&seeds));
+ POSIX_GUARD(generate_error_vector(&e_prime, &seeds.seed[0]));
+
+ // Decode and on success check if |e|=T (all in constant-time)
+ volatile uint32_t success_cond = (decode(&e, &l_ct, &l_sk) == SUCCESS);
+ success_cond &= secure_cmp32(T1, r_bits_vector_weight(&e.val[0]) +
+ r_bits_vector_weight(&e.val[1]));
+
+ // Set appropriate error based on the success condition
+ uint8_t mask = ~secure_l32_mask(0, success_cond);
+ for(size_t i = 0; i < R_BYTES; i++) {
+ PE0_RAW(&e_prime)[i] &= u8_barrier(~mask);
+ PE0_RAW(&e_prime)[i] |= (u8_barrier(mask) & E0_RAW(&e)[i]);
+ PE1_RAW(&e_prime)[i] &= u8_barrier(~mask);
+ PE1_RAW(&e_prime)[i] |= (u8_barrier(mask) & E1_RAW(&e)[i]);
+ }
+
+ POSIX_GUARD(reencrypt(&m_prime, &e_prime, &l_ct));
+
+ // Check if H(m') is equal to (e0', e1')
+ // (in constant-time)
+ POSIX_GUARD(function_h(&e_tmp, &m_prime));
+ success_cond = secure_cmp(PE0_RAW(&e_prime), PE0_RAW(&e_tmp), R_BYTES);
+ success_cond &= secure_cmp(PE1_RAW(&e_prime), PE1_RAW(&e_tmp), R_BYTES);
+
+ // Compute either K(m', C) or K(sigma, C) based on the success condition
+ mask = secure_l32_mask(0, success_cond);
+ for(size_t i = 0; i < M_BYTES; i++) {
+ m_prime.raw[i] &= u8_barrier(~mask);
+ m_prime.raw[i] |= (u8_barrier(mask) & l_sk.sigma.raw[i]);
+ }
+
+ // Generate the shared secret
+ POSIX_GUARD(function_k(&l_ss, &m_prime, &l_ct));
+
+ // Copy the data into the output buffer
+ bike_memcpy(ss, &l_ss, sizeof(l_ss));
+
+ return SUCCESS;
+}
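+
+// Illustrative round trip of the three NIST APIs above (a sketch, not part
+// of the implementation). The BIKE_*_BYTES buffer-size macros are
+// hypothetical placeholders for the sizes registered with s2n's KEM layer;
+// SS_BYTES (32) is defined in bike_defs.h.
+//
+//   unsigned char pk[BIKE_PK_BYTES], sk[BIKE_SK_BYTES], ct[BIKE_CT_BYTES];
+//   unsigned char ss1[SS_BYTES], ss2[SS_BYTES];
+//   POSIX_GUARD(BIKE_L1_R3_crypto_kem_keypair(pk, sk));
+//   POSIX_GUARD(BIKE_L1_R3_crypto_kem_enc(ct, ss1, pk));
+//   POSIX_GUARD(BIKE_L1_R3_crypto_kem_dec(ss2, ct, sk));
+//   // On success, ss1 and ss2 hold the same 32-byte shared secret.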
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/cleanup.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/cleanup.h
new file mode 100644
index 0000000000..22e8c44250
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/cleanup.h
@@ -0,0 +1,63 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+#include "utilities.h"
+
+/* Runs the _thecleanup function on _thealloc once _thealloc goes out of scope */
+#define DEFER_CLEANUP(_thealloc, _thecleanup) \
+ __attribute__((cleanup(_thecleanup))) _thealloc
+
+// len is the length of p in bytes
+_INLINE_ void secure_clean(OUT uint8_t *p, IN const uint32_t len)
+{
+#if defined(_WIN32)
+ SecureZeroMemory(p, len);
+#else
+ typedef void *(*memset_t)(void *, int, size_t);
+ static volatile memset_t memset_func = bike_memset;
+ memset_func(p, 0, len);
+#endif
+}
+
+#define CLEANUP_FUNC(name, type) \
+ _INLINE_ void name##_cleanup(IN OUT type *o) \
+ { \
+ secure_clean((uint8_t *)o, sizeof(*o)); \
+ }
+
+CLEANUP_FUNC(r, r_t)
+CLEANUP_FUNC(m, m_t)
+CLEANUP_FUNC(e, e_t)
+CLEANUP_FUNC(sk, sk_t)
+CLEANUP_FUNC(ss, ss_t)
+CLEANUP_FUNC(ct, ct_t)
+CLEANUP_FUNC(pad_r, pad_r_t)
+CLEANUP_FUNC(pad_e, pad_e_t)
+CLEANUP_FUNC(seed, seed_t)
+CLEANUP_FUNC(syndrome, syndrome_t)
+CLEANUP_FUNC(upc, upc_t)
+CLEANUP_FUNC(func_k, func_k_t)
+CLEANUP_FUNC(dbl_pad_r, dbl_pad_r_t)
+
+// The functions below require special handling because we deal
+// with arrays and not structures.
+
+_INLINE_ void compressed_idx_d_ar_cleanup(IN OUT compressed_idx_d_ar_t *o)
+{
+ for(int i = 0; i < N0; i++) {
+ secure_clean((uint8_t *)&(*o)[i], sizeof((*o)[0]));
+ }
+}
+
+_INLINE_ void seeds_cleanup(IN OUT seeds_t *o)
+{
+ for(int i = 0; i < NUM_OF_SEEDS; i++) {
+ seed_cleanup(&(o->seed[i]));
+ }
+}
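+
+// For illustration, the macros above compose as follows:
+//   DEFER_CLEANUP(m_t m, m_cleanup);
+// expands to
+//   __attribute__((cleanup(m_cleanup))) m_t m;
+// so the compiler invokes m_cleanup(&m) when m goes out of scope, and
+//   CLEANUP_FUNC(m, m_t)
+// defines that m_cleanup() as a secure_clean() over the whole object.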
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.c
new file mode 100644
index 0000000000..c280b95f03
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.c
@@ -0,0 +1,280 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ *
+ * [1] The optimizations are based on the description developed in the paper:
+ * Drucker, Nir, and Shay Gueron. 2019. “A Toolbox for Software Optimization
+ * of QC-MDPC Code-Based Cryptosystems.” Journal of Cryptographic Engineering,
+ * January, 1–17. https://doi.org/10.1007/s13389-018-00200-4.
+ *
+ * [2] The decoder algorithm is the Black-Gray decoder in
+ * the early submission of CAKE (due to N. Sendrier and R. Misoczki).
+ *
+ * [3] The analysis for the constant time implementation is given in
+ * Drucker, Nir, Shay Gueron, and Dusan Kostic. 2019.
+ * “On Constant-Time QC-MDPC Decoding with Negligible Failure Rate.”
+ * Cryptology EPrint Archive, 2019. https://eprint.iacr.org/2019/1289.
+ *
+ * [4] It was adapted to BGF in:
+ * Drucker, Nir, Shay Gueron, and Dusan Kostic. 2019.
+ * “QC-MDPC decoders with several shades of gray.”
+ * Cryptology EPrint Archive, 2019. To be published.
+ *
+ * [5] Chou, T.: QcBits: Constant-Time Small-Key Code-Based Cryptography.
+ * In: Gierlichs, B., Poschmann, A.Y. (eds.) Cryptographic Hardware
+ * and Embedded Systems – CHES 2016. pp. 280–300. Springer Berlin Heidelberg,
+ * Berlin, Heidelberg (2016)
+ *
+ * [6] The rotate512_small function is a derivative of the code described in:
+ * Guimarães, Antonio, Diego F Aranha, and Edson Borin. 2019.
+ * “Optimized Implementation of QC-MDPC Code-Based Cryptography.”
+ * Concurrency and Computation: Practice and Experience 31 (18):
+ * e5089. https://doi.org/10.1002/cpe.5089.
+ */
+
+#include "decode.h"
+#include "cleanup.h"
+#include "decode_internal.h"
+#include "gf2x.h"
+#include "utilities.h"
+
+// Decoding (bit-flipping) parameter
+#if defined(BG_DECODER)
+# if(LEVEL == 1)
+# define MAX_IT 3
+# elif(LEVEL == 3)
+# define MAX_IT 4
+# else
+# error "Level can only be 1/3"
+# endif
+#elif defined(BGF_DECODER)
+# if(LEVEL == 1)
+# define MAX_IT 5
+# elif(LEVEL == 3)
+# define MAX_IT 5
+# else
+# error "Level can only be 1/3"
+# endif
+#endif
+
+ret_t compute_syndrome(OUT syndrome_t *syndrome,
+ IN const pad_r_t *c0,
+ IN const pad_r_t *h0,
+ IN const decode_ctx *ctx)
+{
+ DEFER_CLEANUP(pad_r_t pad_s, pad_r_cleanup);
+
+ gf2x_mod_mul(&pad_s, c0, h0);
+
+ bike_memcpy((uint8_t *)syndrome->qw, pad_s.val.raw, R_BYTES);
+ ctx->dup(syndrome);
+
+ return SUCCESS;
+}
+
+_INLINE_ ret_t recompute_syndrome(OUT syndrome_t *syndrome,
+ IN const pad_r_t *c0,
+ IN const pad_r_t *h0,
+ IN const pad_r_t *pk,
+ IN const e_t *e,
+ IN const decode_ctx *ctx)
+{
+ DEFER_CLEANUP(pad_r_t tmp_c0, pad_r_cleanup);
+ DEFER_CLEANUP(pad_r_t e0 = {0}, pad_r_cleanup);
+ DEFER_CLEANUP(pad_r_t e1 = {0}, pad_r_cleanup);
+
+ e0.val = e->val[0];
+ e1.val = e->val[1];
+
+ // tmp_c0 = pk * e1 + c0 + e0
+ gf2x_mod_mul(&tmp_c0, &e1, pk);
+ gf2x_mod_add(&tmp_c0, &tmp_c0, c0);
+ gf2x_mod_add(&tmp_c0, &tmp_c0, &e0);
+
+ // Recompute the syndrome using the updated ciphertext
+ POSIX_GUARD(compute_syndrome(syndrome, &tmp_c0, h0, ctx));
+
+ return SUCCESS;
+}
+
+_INLINE_ uint8_t get_threshold(IN const syndrome_t *s)
+{
+ bike_static_assert(sizeof(*s) >= sizeof(r_t), syndrome_is_large_enough);
+
+ const uint32_t syndrome_weight = r_bits_vector_weight((const r_t *)s->qw);
+
+ // The equations below are defined in BIKE's specification p. 16, Section 5.2
+ uint32_t thr = THRESHOLD_COEFF0 + (THRESHOLD_COEFF1 * syndrome_weight);
+ const uint32_t mask = secure_l32_mask(thr, THRESHOLD_MIN);
+ thr = (u32_barrier(mask) & thr) | (u32_barrier(~mask) & THRESHOLD_MIN);
+
+ DMSG(" Threshold: %d\n", thr);
+ return thr;
+}
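+
+// Worked example for LEVEL == 1 (THRESHOLD_COEFF0 = 13.530,
+// THRESHOLD_COEFF1 = 0.0069722, THRESHOLD_MIN = 36):
+//   syndrome_weight = 4000: thr = 13.530 + 27.889 = 41.4 -> 41 (kept)
+//   syndrome_weight = 2500: thr = 13.530 + 17.431 = 30.9 -> 30 -> clamped to 36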
+
+// Calculate the Unsatisfied Parity Checks (UPCs) and update the errors
+// vector (e) accordingly. In addition, update the black and gray errors vector
+// with the relevant values.
+_INLINE_ void find_err1(OUT e_t *e,
+ OUT e_t *black_e,
+ OUT e_t *gray_e,
+ IN const syndrome_t * syndrome,
+ IN const compressed_idx_d_ar_t wlist,
+ IN const uint8_t threshold,
+ IN const decode_ctx *ctx)
+{
+ // This function uses the bit-slice-adder methodology of [5]:
+ DEFER_CLEANUP(syndrome_t rotated_syndrome = {0}, syndrome_cleanup);
+ DEFER_CLEANUP(upc_t upc, upc_cleanup);
+
+ for(uint32_t i = 0; i < N0; i++) {
+ // UPC must start from zero at every iteration
+ bike_memset(&upc, 0, sizeof(upc));
+
+    // 1) Right-rotate the syndrome for every set-bit index of the secret key,
+    //    then slice-add it to the UPC array.
+ for(size_t j = 0; j < DV; j++) {
+ ctx->rotate_right(&rotated_syndrome, syndrome, wlist[i].val[j]);
+ ctx->bit_sliced_adder(&upc, &rotated_syndrome, LOG2_MSB(j + 1));
+ }
+
+ // 2) Subtract the threshold from the UPC counters
+ ctx->bit_slice_full_subtract(&upc, threshold);
+
+ // 3) Update the errors and the black errors vectors.
+ // The last slice of the UPC array holds the MSB of the accumulated values
+ // minus the threshold. Every zero bit indicates a potential error bit.
+    // The error values are stored in the black array and xored with the
+    // errors of the previous iteration.
+ const r_t *last_slice = &(upc.slice[SLICES - 1].u.r.val);
+ for(size_t j = 0; j < R_BYTES; j++) {
+ const uint8_t sum_msb = (~last_slice->raw[j]);
+ black_e->val[i].raw[j] = sum_msb;
+ e->val[i].raw[j] ^= sum_msb;
+ }
+
+ // Ensure that the padding bits (upper bits of the last byte) are zero so
+ // they will not be included in the multiplication and in the hash function.
+ e->val[i].raw[R_BYTES - 1] &= LAST_R_BYTE_MASK;
+
+ // 4) Calculate the gray error array by adding "DELTA" to the UPC array.
+    // For that, we reuse the rotated_syndrome variable, setting it to all "1"s.
+ for(size_t l = 0; l < DELTA; l++) {
+ bike_memset((uint8_t *)rotated_syndrome.qw, 0xff, R_BYTES);
+ ctx->bit_sliced_adder(&upc, &rotated_syndrome, SLICES);
+ }
+
+ // 5) Update the gray list with the relevant bits that are not
+ // set in the black list.
+ for(size_t j = 0; j < R_BYTES; j++) {
+ const uint8_t sum_msb = (~last_slice->raw[j]);
+ gray_e->val[i].raw[j] = (~(black_e->val[i].raw[j])) & sum_msb;
+ }
+ }
+}
+
+// Recalculate the UPCs and update the errors vector (e) according to it
+// and to the black/gray vectors.
+_INLINE_ void find_err2(OUT e_t *e,
+ IN e_t * pos_e,
+ IN const syndrome_t * syndrome,
+ IN const compressed_idx_d_ar_t wlist,
+ IN const uint8_t threshold,
+ IN const decode_ctx *ctx)
+{
+ DEFER_CLEANUP(syndrome_t rotated_syndrome = {0}, syndrome_cleanup);
+ DEFER_CLEANUP(upc_t upc, upc_cleanup);
+
+ for(uint32_t i = 0; i < N0; i++) {
+ // UPC must start from zero at every iteration
+ bike_memset(&upc, 0, sizeof(upc));
+
+ // 1) Right-rotate the syndrome, for every index of a set bit in the secret
+ // key. Then slice-add it to the UPC array.
+ for(size_t j = 0; j < DV; j++) {
+ ctx->rotate_right(&rotated_syndrome, syndrome, wlist[i].val[j]);
+ ctx->bit_sliced_adder(&upc, &rotated_syndrome, LOG2_MSB(j + 1));
+ }
+
+ // 2) Subtract the threshold from the UPC counters
+ ctx->bit_slice_full_subtract(&upc, threshold);
+
+ // 3) Update the errors vector.
+ // The last slice of the UPC array holds the MSB of the accumulated values
+ // minus the threshold. Every zero bit indicates a potential error bit.
+ const r_t *last_slice = &(upc.slice[SLICES - 1].u.r.val);
+ for(size_t j = 0; j < R_BYTES; j++) {
+ const uint8_t sum_msb = (~last_slice->raw[j]);
+ e->val[i].raw[j] ^= (pos_e->val[i].raw[j] & sum_msb);
+ }
+
+ // Ensure that the padding bits (upper bits of the last byte) are zero, so
+ // they are not included in the multiplication, and in the hash function.
+ e->val[i].raw[R_BYTES - 1] &= LAST_R_BYTE_MASK;
+ }
+}
+
+ret_t decode(OUT e_t *e, IN const ct_t *ct, IN const sk_t *sk)
+{
+ // Initialize the decode methods struct
+ decode_ctx ctx;
+ decode_ctx_init(&ctx);
+
+ DEFER_CLEANUP(e_t black_e = {0}, e_cleanup);
+ DEFER_CLEANUP(e_t gray_e = {0}, e_cleanup);
+
+ DEFER_CLEANUP(pad_r_t c0 = {0}, pad_r_cleanup);
+ DEFER_CLEANUP(pad_r_t h0 = {0}, pad_r_cleanup);
+ pad_r_t pk = {0};
+
+ // Pad ciphertext (c0), secret key (h0), and public key (h)
+ c0.val = ct->c0;
+ h0.val = sk->bin[0];
+ pk.val = sk->pk;
+
+ DEFER_CLEANUP(syndrome_t s = {0}, syndrome_cleanup);
+ DMSG(" Computing s.\n");
+ POSIX_GUARD(compute_syndrome(&s, &c0, &h0, &ctx));
+ ctx.dup(&s);
+
+ // Reset (init) the error because it is xored in the find_err functions.
+ bike_memset(e, 0, sizeof(*e));
+
+ for(uint32_t iter = 0; iter < MAX_IT; iter++) {
+ const uint8_t threshold = get_threshold(&s);
+
+ DMSG(" Iteration: %d\n", iter);
+ DMSG(" Weight of e: %lu\n",
+ r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1]));
+ DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw));
+
+ find_err1(e, &black_e, &gray_e, &s, sk->wlist, threshold, &ctx);
+ POSIX_GUARD(recompute_syndrome(&s, &c0, &h0, &pk, e, &ctx));
+#if defined(BGF_DECODER)
+ if(iter >= 1) {
+ continue;
+ }
+#endif
+ DMSG(" Weight of e: %lu\n",
+ r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1]));
+ DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw));
+
+ find_err2(e, &black_e, &s, sk->wlist, ((DV + 1) / 2) + 1, &ctx);
+ POSIX_GUARD(recompute_syndrome(&s, &c0, &h0, &pk, e, &ctx));
+
+ DMSG(" Weight of e: %lu\n",
+ r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1]));
+ DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw));
+
+ find_err2(e, &gray_e, &s, sk->wlist, ((DV + 1) / 2) + 1, &ctx);
+ POSIX_GUARD(recompute_syndrome(&s, &c0, &h0, &pk, e, &ctx));
+ }
+
+ if(r_bits_vector_weight((r_t *)s.qw) > 0) {
+ BIKE_ERROR(E_DECODING_FAILURE);
+ }
+
+ return SUCCESS;
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.h
new file mode 100644
index 0000000000..8e405ea12e
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.h
@@ -0,0 +1,12 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+#include "types.h"
+
+ret_t decode(OUT e_t *e, IN const ct_t *ct, IN const sk_t *sk);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx2.c
new file mode 100644
index 0000000000..ea8b91a499
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx2.c
@@ -0,0 +1,173 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ *
+ * The rotate functions are based on the Barrel shifter described in [1] and
+ * some code snippets from [2]:
+ *
+ * [1] Chou, T.: QcBits: Constant-Time Small-Key Code-Based Cryptography.
+ * In: Gierlichs, B., Poschmann, A.Y. (eds.) Cryptographic Hardware
+ * and Embedded Systems – CHES 2016. pp. 280–300. Springer Berlin Heidelberg,
+ * Berlin, Heidelberg (2016)
+ *
+ * [2] Guimarães, Antonio, Diego F Aranha, and Edson Borin. 2019.
+ * “Optimized Implementation of QC-MDPC Code-Based Cryptography.”
+ * Concurrency and Computation: Practice and Experience 31 (18):
+ * e5089. https://doi.org/10.1002/cpe.5089.
+ */
+
+#if defined(S2N_BIKE_R3_AVX2)
+
+#include "decode.h"
+#include "decode_internal.h"
+#include "utilities.h"
+
+#define AVX2_INTERNAL
+#include "x86_64_intrinsic.h"
+
+#define R_YMM_HALF_LOG2 UPTOPOW2(R_YMM / 2)
+
+_INLINE_ void
+rotate256_big(OUT syndrome_t *out, IN const syndrome_t *in, IN size_t ymm_num)
+{
+ // For preventing overflows (comparison in bytes)
+ bike_static_assert(sizeof(*out) >
+ (BYTES_IN_YMM * (R_YMM + (2 * R_YMM_HALF_LOG2))),
+ rotr_big_err);
+
+ *out = *in;
+
+ for(uint32_t idx = R_YMM_HALF_LOG2; idx >= 1; idx >>= 1) {
+ const uint8_t mask = secure_l32_mask(ymm_num, idx);
+ const __m256i blend_mask = SET1_I8(mask);
+ ymm_num = ymm_num - (idx & mask);
+
+ for(size_t i = 0; i < (R_YMM + idx); i++) {
+ __m256i a = LOAD(&out->qw[4 * (i + idx)]);
+ __m256i b = LOAD(&out->qw[4 * i]);
+ b = BLENDV_I8(b, a, blend_mask);
+ STORE(&out->qw[4 * i], b);
+ }
+ }
+}
+
+_INLINE_ void
+rotate256_small(OUT syndrome_t *out, IN const syndrome_t *in, size_t count)
+{
+ __m256i carry_in = SET_ZERO;
+ const int count64 = (int)count & 0x3f;
+ const uint64_t count_mask = (count >> 5) & 0xe;
+
+ __m256i idx = SET_I32(7, 6, 5, 4, 3, 2, 1, 0);
+ const __m256i zero_mask = SET_I64(-1, -1, -1, 0);
+ const __m256i count_vet = SET1_I8(count_mask);
+
+ ALIGN(ALIGN_BYTES)
+ const uint8_t zero_mask2_buf[] = {
+ 0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x84, 0x84, 0x84,
+ 0x84, 0x84, 0x84, 0x84, 0x84, 0x82, 0x82, 0x82, 0x82, 0x82, 0x82,
+ 0x82, 0x82, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+ __m256i zero_mask2 = LOAD(zero_mask2_buf);
+
+ zero_mask2 = SUB_I8(zero_mask2, count_vet);
+ idx = ADD_I8(idx, count_vet);
+
+ for(int i = R_YMM; i >= 0; i--) {
+ // Load the next 256 bits
+ __m256i in256 = LOAD(&in->qw[4 * i]);
+
+    // Rotate the current and previous 256-bit registers so that their
+    // quadwords end up in the right positions.
+ __m256i carry_out = PERMVAR_I32(in256, idx);
+ in256 = BLENDV_I8(carry_in, carry_out, zero_mask2);
+
+ // Shift less than 64 (quadwords internal)
+ __m256i inner_carry = BLENDV_I8(carry_in, in256, zero_mask);
+ inner_carry = PERM_I64(inner_carry, 0x39);
+ const __m256i out256 =
+ SRLI_I64(in256, count64) | SLLI_I64(inner_carry, (int)64 - count64);
+
+ // Store the rotated value
+ STORE(&out->qw[4 * i], out256);
+ carry_in = carry_out;
+ }
+}
+
+void rotate_right_avx2(OUT syndrome_t *out,
+ IN const syndrome_t *in,
+ IN const uint32_t bitscount)
+{
+ // 1) Rotate in granularity of 256 bits blocks, using YMMs
+ rotate256_big(out, in, (bitscount / BITS_IN_YMM));
+ // 2) Rotate in smaller granularity (less than 256 bits), using YMMs
+ rotate256_small(out, out, (bitscount % BITS_IN_YMM));
+}
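+
+// For example (illustrative): with bitscount = 1000, rotate256_big rotates by
+// 1000 / 256 = 3 whole YMM blocks, and rotate256_small by the remaining
+// 1000 % 256 = 232 bits.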
+
+// Duplicates the first R_BITS of the syndrome three times
+// |------------------------------------------|
+// | Third copy | Second copy | first R_BITS |
+// |------------------------------------------|
+// This is required by the rotate functions.
+void dup_avx2(IN OUT syndrome_t *s)
+{
+ s->qw[R_QWORDS - 1] =
+ (s->qw[0] << LAST_R_QWORD_LEAD) | (s->qw[R_QWORDS - 1] & LAST_R_QWORD_MASK);
+
+ for(size_t i = 0; i < (2 * R_QWORDS) - 1; i++) {
+ s->qw[R_QWORDS + i] =
+ (s->qw[i] >> LAST_R_QWORD_TRAIL) | (s->qw[i + 1] << LAST_R_QWORD_LEAD);
+ }
+}
+
+// Use half-adder as described in [1].
+void bit_sliced_adder_avx2(OUT upc_t *upc,
+ IN OUT syndrome_t *rotated_syndrome,
+ IN const size_t num_of_slices)
+{
+  // From a cache-memory perspective, this should be the outer loop
+ for(size_t j = 0; j < num_of_slices; j++) {
+ for(size_t i = 0; i < R_QWORDS; i++) {
+ const uint64_t carry = (upc->slice[j].u.qw[i] & rotated_syndrome->qw[i]);
+ upc->slice[j].u.qw[i] ^= rotated_syndrome->qw[i];
+ rotated_syndrome->qw[i] = carry;
+ }
+ }
+}
+
+void bit_slice_full_subtract_avx2(OUT upc_t *upc, IN uint8_t val)
+{
+ // Borrow
+ uint64_t br[R_QWORDS] = {0};
+
+ for(size_t j = 0; j < SLICES; j++) {
+
+ const uint64_t lsb_mask = 0 - (val & 0x1);
+ val >>= 1;
+
+ // Perform a - b with c as the input/output carry
+ // br = 0 0 0 0 1 1 1 1
+ // a = 0 0 1 1 0 0 1 1
+ // b = 0 1 0 1 0 1 0 1
+ // -------------------
+      // o = 0 1 1 0 1 0 0 1
+ // c = 0 1 0 0 1 1 0 1
+ //
+ // o = a^b^c
+      // br = (~a)b(~c) + (~a)(~b)c + (~a)bc + abc = (~a)b + ((~a)+b)c
+
+ for(size_t i = 0; i < R_QWORDS; i++) {
+ const uint64_t a = upc->slice[j].u.qw[i];
+ const uint64_t b = lsb_mask;
+ const uint64_t tmp = ((~a) & b & (~br[i])) | ((((~a) | b) & br[i]));
+ upc->slice[j].u.qw[i] = a ^ b ^ br[i];
+ br[i] = tmp;
+ }
+ }
+}
+
+#endif
+
+typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx512.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx512.c
new file mode 100644
index 0000000000..ef7f6d29d5
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx512.c
@@ -0,0 +1,167 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ *
+ * The rotation functions are based on the Barrel shifter described in [1]
+ * and some modified code snippets from [2]:
+ * [1] Chou, T.: QcBits: Constant-Time Small-Key Code-Based Cryptography.
+ * In: Gierlichs, B., Poschmann, A.Y. (eds.) Cryptographic Hardware
+ * and Embedded Systems – CHES 2016. pp. 280–300. Springer Berlin Heidelberg,
+ * Berlin, Heidelberg (2016)
+ *
+ * [2] Guimarães, Antonio, Diego F Aranha, and Edson Borin. 2019.
+ * “Optimized Implementation of QC-MDPC Code-Based Cryptography.”
+ * Concurrency and Computation: Practice and Experience 31 (18):
+ * e5089. https://doi.org/10.1002/cpe.5089.
+ */
+
+#if defined(S2N_BIKE_R3_AVX512)
+
+#include "decode.h"
+#include "decode_internal.h"
+#include "utilities.h"
+
+#define AVX512_INTERNAL
+#include "x86_64_intrinsic.h"
+
+#define R_ZMM_HALF_LOG2 UPTOPOW2(R_ZMM / 2)
+
+_INLINE_ void
+rotate512_big(OUT syndrome_t *out, IN const syndrome_t *in, size_t zmm_num)
+{
+ // For preventing overflows (comparison in bytes)
+ bike_static_assert(sizeof(*out) >
+ (BYTES_IN_ZMM * (R_ZMM + (2 * R_ZMM_HALF_LOG2))),
+ rotr_big_err);
+ *out = *in;
+
+ for(uint32_t idx = R_ZMM_HALF_LOG2; idx >= 1; idx >>= 1) {
+ const uint8_t mask = secure_l32_mask(zmm_num, idx);
+ zmm_num = zmm_num - (idx & mask);
+
+ for(size_t i = 0; i < (R_ZMM + idx); i++) {
+ const __m512i a = LOAD(&out->qw[8 * (i + idx)]);
+ MSTORE(&out->qw[8 * i], mask, a);
+ }
+ }
+}
+
+// The rotate512_small function is a derivative of the code described in [2]
+_INLINE_ void
+rotate512_small(OUT syndrome_t *out, IN const syndrome_t *in, size_t bitscount)
+{
+ __m512i previous = SET_ZERO;
+ const int count64 = (int)bitscount & 0x3f;
+ const __m512i count64_512 = SET1_I64(count64);
+ const __m512i count64_512r = SET1_I64((int)64 - count64);
+
+ const __m512i num_full_qw = SET1_I64(bitscount >> 6);
+ const __m512i one = SET1_I64(1);
+ __m512i a0, a1;
+
+ __m512i idx = SET_I64(7, 6, 5, 4, 3, 2, 1, 0);
+
+ // Positions above 7 are taken from the second register in
+ // _mm512_permutex2var_epi64
+ idx = ADD_I64(idx, num_full_qw);
+ __m512i idx1 = ADD_I64(idx, one);
+
+ for(int i = R_ZMM; i >= 0; i--) {
+ // Load the next 512 bits
+ const __m512i in512 = LOAD(&in->qw[8 * i]);
+
+    // Rotate the current and previous 512-bit registers so that their
+    // quadwords end up in the right positions.
+ a0 = PERMX2VAR_I64(in512, idx, previous);
+ a1 = PERMX2VAR_I64(in512, idx1, previous);
+
+ a0 = SRLV_I64(a0, count64_512);
+ a1 = SLLV_I64(a1, count64_512r);
+
+ // Shift less than 64 (quadwords internal)
+ const __m512i out512 = a0 | a1;
+
+ // Store the rotated value
+ STORE(&out->qw[8 * i], out512);
+ previous = in512;
+ }
+}
+
+void rotate_right_avx512(OUT syndrome_t *out,
+ IN const syndrome_t *in,
+ IN const uint32_t bitscount)
+{
+ // 1) Rotate in granularity of 512 bits blocks, using ZMMs
+ rotate512_big(out, in, (bitscount / BITS_IN_ZMM));
+ // 2) Rotate in smaller granularity (less than 512 bits), using ZMMs
+ rotate512_small(out, out, (bitscount % BITS_IN_ZMM));
+}
+
+// Duplicates the first R_BITS of the syndrome three times
+// |------------------------------------------|
+// | Third copy | Second copy | first R_BITS |
+// |------------------------------------------|
+// This is required by the rotate functions.
+void dup_avx512(IN OUT syndrome_t *s)
+{
+ s->qw[R_QWORDS - 1] =
+ (s->qw[0] << LAST_R_QWORD_LEAD) | (s->qw[R_QWORDS - 1] & LAST_R_QWORD_MASK);
+
+ for(size_t i = 0; i < (2 * R_QWORDS) - 1; i++) {
+ s->qw[R_QWORDS + i] =
+ (s->qw[i] >> LAST_R_QWORD_TRAIL) | (s->qw[i + 1] << LAST_R_QWORD_LEAD);
+ }
+}
+
+// Use half-adder as described in [1].
+void bit_sliced_adder_avx512(OUT upc_t *upc,
+ IN OUT syndrome_t *rotated_syndrome,
+ IN const size_t num_of_slices)
+{
+  // From a cache-memory perspective, this should be the outer loop
+ for(size_t j = 0; j < num_of_slices; j++) {
+ for(size_t i = 0; i < R_QWORDS; i++) {
+ const uint64_t carry = (upc->slice[j].u.qw[i] & rotated_syndrome->qw[i]);
+ upc->slice[j].u.qw[i] ^= rotated_syndrome->qw[i];
+ rotated_syndrome->qw[i] = carry;
+ }
+ }
+}
+
+void bit_slice_full_subtract_avx512(OUT upc_t *upc, IN uint8_t val)
+{
+ // Borrow
+ uint64_t br[R_QWORDS] = {0};
+
+ for(size_t j = 0; j < SLICES; j++) {
+
+ const uint64_t lsb_mask = 0 - (val & 0x1);
+ val >>= 1;
+
+ // Perform a - b with c as the input/output carry
+ // br = 0 0 0 0 1 1 1 1
+ // a = 0 0 1 1 0 0 1 1
+ // b = 0 1 0 1 0 1 0 1
+ // -------------------
+      // o = 0 1 1 0 1 0 0 1
+ // c = 0 1 0 0 1 1 0 1
+ //
+ // o = a^b^c
+      // br = (~a)b(~c) + (~a)(~b)c + (~a)bc + abc = (~a)b + ((~a)+b)c
+
+ for(size_t i = 0; i < R_QWORDS; i++) {
+ const uint64_t a = upc->slice[j].u.qw[i];
+ const uint64_t b = lsb_mask;
+ const uint64_t tmp = ((~a) & b & (~br[i])) | ((((~a) | b) & br[i]));
+ upc->slice[j].u.qw[i] = a ^ b ^ br[i];
+ br[i] = tmp;
+ }
+ }
+}
+
+#endif
+
+typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_internal.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_internal.h
new file mode 100644
index 0000000000..817cc4603a
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_internal.h
@@ -0,0 +1,86 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+#include "pq-crypto/s2n_pq.h"
+#include "defs.h"
+#include "types.h"
+
+// Rotate the first R_BITS of a syndrome to the right.
+// On input, the syndrome is stored as three copies of its first R_BITS
+// (the triplication makes the rotation easier to implement).
+// On output, only the first R_BITS hold the rotated value; the remaining
+// (2 * R_BITS) bits are undefined.
+void rotate_right_port(OUT syndrome_t *out,
+ IN const syndrome_t *in,
+ IN uint32_t bitscount);
+void dup_port(IN OUT syndrome_t *s);
+void bit_sliced_adder_port(OUT upc_t *upc,
+ IN OUT syndrome_t *rotated_syndrome,
+ IN const size_t num_of_slices);
+void bit_slice_full_subtract_port(OUT upc_t *upc, IN uint8_t val);
+
+#if defined(S2N_BIKE_R3_AVX2)
+void rotate_right_avx2(OUT syndrome_t *out,
+ IN const syndrome_t *in,
+ IN uint32_t bitscount);
+void dup_avx2(IN OUT syndrome_t *s);
+void bit_sliced_adder_avx2(OUT upc_t *upc,
+ IN OUT syndrome_t *rotated_syndrome,
+ IN const size_t num_of_slices);
+void bit_slice_full_subtract_avx2(OUT upc_t *upc, IN uint8_t val);
+#endif
+
+#if defined(S2N_BIKE_R3_AVX512)
+void rotate_right_avx512(OUT syndrome_t *out,
+ IN const syndrome_t *in,
+ IN uint32_t bitscount);
+void dup_avx512(IN OUT syndrome_t *s);
+void bit_sliced_adder_avx512(OUT upc_t *upc,
+ IN OUT syndrome_t *rotated_syndrome,
+ IN const size_t num_of_slices);
+void bit_slice_full_subtract_avx512(OUT upc_t *upc, IN uint8_t val);
+#endif
+
+// Decode methods struct
+typedef struct decode_ctx_st {
+ void (*rotate_right)(OUT syndrome_t *out,
+ IN const syndrome_t *in,
+ IN uint32_t bitscount);
+ void (*dup)(IN OUT syndrome_t *s);
+ void (*bit_sliced_adder)(OUT upc_t *upc,
+                           IN OUT syndrome_t *rotated_syndrome,
+ IN const size_t num_of_slices);
+ void (*bit_slice_full_subtract)(OUT upc_t *upc, IN uint8_t val);
+} decode_ctx;
+
+_INLINE_ void decode_ctx_init(decode_ctx *ctx)
+{
+#if defined(S2N_BIKE_R3_AVX512)
+ if(s2n_bike_r3_is_avx512_enabled()) {
+ ctx->rotate_right = rotate_right_avx512;
+ ctx->dup = dup_avx512;
+ ctx->bit_sliced_adder = bit_sliced_adder_avx512;
+ ctx->bit_slice_full_subtract = bit_slice_full_subtract_avx512;
+ } else
+#endif
+#if defined(S2N_BIKE_R3_AVX2)
+ if(s2n_bike_r3_is_avx2_enabled()) {
+ ctx->rotate_right = rotate_right_avx2;
+ ctx->dup = dup_avx2;
+ ctx->bit_sliced_adder = bit_sliced_adder_avx2;
+ ctx->bit_slice_full_subtract = bit_slice_full_subtract_avx2;
+ } else
+#endif
+ {
+ ctx->rotate_right = rotate_right_port;
+ ctx->dup = dup_port;
+ ctx->bit_sliced_adder = bit_sliced_adder_port;
+ ctx->bit_slice_full_subtract = bit_slice_full_subtract_port;
+ }
+}
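+
+// A sketch of the intended call pattern (cf. decode.c): the context is
+// initialized once per decode, and the best available implementation is then
+// reached through the function pointers.
+//
+//   decode_ctx ctx;
+//   decode_ctx_init(&ctx);
+//   ctx.rotate_right(&rotated, &syndrome, bitscount); // AVX-512/AVX2/portable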
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_portable.c
new file mode 100644
index 0000000000..846818386d
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_portable.c
@@ -0,0 +1,126 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#include "decode.h"
+#include "decode_internal.h"
+#include "utilities.h"
+
+#define R_QWORDS_HALF_LOG2 UPTOPOW2(R_QWORDS / 2)
+
+_INLINE_ void
+rotr_big(OUT syndrome_t *out, IN const syndrome_t *in, IN size_t qw_num)
+{
+ // For preventing overflows (comparison in bytes)
+ bike_static_assert(sizeof(*out) > 8 * (R_QWORDS + (2 * R_QWORDS_HALF_LOG2)),
+ rotr_big_err);
+
+ *out = *in;
+
+ for(uint32_t idx = R_QWORDS_HALF_LOG2; idx >= 1; idx >>= 1) {
+    // Convert the 32-bit mask to a 64-bit mask
+ const uint64_t mask = ((uint32_t)secure_l32_mask(qw_num, idx) + 1U) - 1ULL;
+ qw_num = qw_num - (idx & u64_barrier(mask));
+
+ // Rotate R_QWORDS quadwords and another idx quadwords,
+ // as needed by the next iteration.
+ for(size_t i = 0; i < (R_QWORDS + idx); i++) {
+ out->qw[i] = (out->qw[i] & u64_barrier(~mask)) |
+ (out->qw[i + idx] & u64_barrier(mask));
+ }
+ }
+}
+
+_INLINE_ void
+rotr_small(OUT syndrome_t *out, IN const syndrome_t *in, IN const size_t bits)
+{
+ bike_static_assert(bits < 64, rotr_small_err);
+ bike_static_assert(sizeof(*out) > (8 * R_QWORDS), rotr_small_qw_err);
+
+  // Convert |bits| to 0/1 by using !!bits; then create a mask of 0 or
+  // 0xffffffffffffffff. Use high_shift to avoid undefined behavior when
+  // computing x << 64.
+ const uint64_t mask = (0 - (!!bits));
+ const uint64_t high_shift = (64 - bits) & u64_barrier(mask);
+
+ for(size_t i = 0; i < R_QWORDS; i++) {
+ const uint64_t low_part = in->qw[i] >> bits;
+ const uint64_t high_part = (in->qw[i + 1] << high_shift) & u64_barrier(mask);
+ out->qw[i] = low_part | high_part;
+ }
+}
+
+void rotate_right_port(OUT syndrome_t *out,
+ IN const syndrome_t *in,
+ IN const uint32_t bitscount)
+{
+ // Rotate (64-bit) quad-words
+ rotr_big(out, in, (bitscount / 64));
+ // Rotate bits (less than 64)
+ rotr_small(out, out, (bitscount % 64));
+}
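+
+// For example (illustrative): with bitscount = 200, rotr_big rotates by
+// 200 / 64 = 3 whole quadwords and rotr_small by the remaining 200 % 64 = 8 bits.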
+
+// Duplicates the first R_BITS of the syndrome three times
+// |------------------------------------------|
+// | Third copy | Second copy | first R_BITS |
+// |------------------------------------------|
+// This is required by the rotate functions.
+void dup_port(IN OUT syndrome_t *s)
+{
+ s->qw[R_QWORDS - 1] =
+ (s->qw[0] << LAST_R_QWORD_LEAD) | (s->qw[R_QWORDS - 1] & LAST_R_QWORD_MASK);
+
+ for(size_t i = 0; i < (2 * R_QWORDS) - 1; i++) {
+ s->qw[R_QWORDS + i] =
+ (s->qw[i] >> LAST_R_QWORD_TRAIL) | (s->qw[i + 1] << LAST_R_QWORD_LEAD);
+ }
+}
+
+// Use the half-adder technique described in QcBits (Chou, CHES 2016).
+void bit_sliced_adder_port(OUT upc_t *upc,
+ IN OUT syndrome_t *rotated_syndrome,
+ IN const size_t num_of_slices)
+{
+  // From a cache-memory perspective, this should be the outer loop
+ for(size_t j = 0; j < num_of_slices; j++) {
+ for(size_t i = 0; i < R_QWORDS; i++) {
+ const uint64_t carry = (upc->slice[j].u.qw[i] & rotated_syndrome->qw[i]);
+ upc->slice[j].u.qw[i] ^= rotated_syndrome->qw[i];
+ rotated_syndrome->qw[i] = carry;
+ }
+ }
+}
+
+void bit_slice_full_subtract_port(OUT upc_t *upc, IN uint8_t val)
+{
+ // Borrow
+ uint64_t br[R_QWORDS] = {0};
+
+ for(size_t j = 0; j < SLICES; j++) {
+
+ const uint64_t lsb_mask = 0 - (val & 0x1);
+ val >>= 1;
+
+ // Perform a - b with c as the input/output carry
+ // br = 0 0 0 0 1 1 1 1
+ // a = 0 0 1 1 0 0 1 1
+ // b = 0 1 0 1 0 1 0 1
+ // -------------------
+      // o = 0 1 1 0 1 0 0 1
+ // c = 0 1 0 0 1 1 0 1
+ //
+ // o = a^b^c
+      // br = (~a)b(~c) + (~a)(~b)c + (~a)bc + abc = (~a)b + ((~a)+b)c
+
+ for(size_t i = 0; i < R_QWORDS; i++) {
+ const uint64_t a = upc->slice[j].u.qw[i];
+ const uint64_t b = lsb_mask;
+ const uint64_t tmp = ((~a) & b & (~br[i])) | ((((~a) | b) & br[i]));
+ upc->slice[j].u.qw[i] = a ^ b ^ br[i];
+ br[i] = tmp;
+ }
+ }
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/defs.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/defs.h
new file mode 100644
index 0000000000..ab3f5c7a32
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/defs.h
@@ -0,0 +1,107 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+////////////////////////////////////////////
+// Basic defs
+///////////////////////////////////////////
+
+// For code clarity.
+#define IN
+#define OUT
+
+#define ALIGN(n) __attribute__((aligned(n)))
+#define BIKE_UNUSED_ATT __attribute__((unused))
+
+#define _INLINE_ static inline
+
+// In asm, the symbols '==' and '?' are not allowed. Therefore, if
+// DIVIDE_AND_CEIL is used in asm files, its validity must be ensured with a
+// static_assert.
+#if(__cplusplus >= 201103L) || defined(static_assert)
+# define bike_static_assert(COND, MSG) static_assert(COND, #MSG)
+#else
+# define bike_static_assert(COND, MSG) \
+ typedef char static_assertion_##MSG[(COND) ? 1 : -1] BIKE_UNUSED_ATT
+#endif
+
+// Divide by the divisor and round up to the next integer
+#define DIVIDE_AND_CEIL(x, divider) (((x) + (divider) - 1) / (divider))
+
+// Bit manipulations
+// Assemblers on Linux distributions other than Ubuntu may not understand the
+// ULL suffix. In that case, len must be smaller than 31.
+#define BIT(len) (1ULL << (len))
+#define MASK(len) (BIT(len) - 1)
+#define SIZEOF_BITS(b) (sizeof(b) * 8)
+
+#define BYTES_IN_QWORD 0x8
+#define BYTES_IN_XMM 0x10
+#define BYTES_IN_YMM 0x20
+#define BYTES_IN_ZMM 0x40
+
+#define BITS_IN_YMM (BYTES_IN_YMM * 8)
+#define BITS_IN_ZMM (BYTES_IN_ZMM * 8)
+
+#define WORDS_IN_YMM (BYTES_IN_YMM / sizeof(uint16_t))
+#define WORDS_IN_ZMM (BYTES_IN_ZMM / sizeof(uint16_t))
+
+#define QWORDS_IN_XMM (BYTES_IN_XMM / sizeof(uint64_t))
+#define QWORDS_IN_YMM (BYTES_IN_YMM / sizeof(uint64_t))
+#define QWORDS_IN_ZMM (BYTES_IN_ZMM / sizeof(uint64_t))
+
+// Copied from Kaz's answer at:
+// https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2
+#define UPTOPOW2_0(v) ((v)-1)
+#define UPTOPOW2_1(v) (UPTOPOW2_0(v) | (UPTOPOW2_0(v) >> 1))
+#define UPTOPOW2_2(v) (UPTOPOW2_1(v) | (UPTOPOW2_1(v) >> 2))
+#define UPTOPOW2_3(v) (UPTOPOW2_2(v) | (UPTOPOW2_2(v) >> 4))
+#define UPTOPOW2_4(v) (UPTOPOW2_3(v) | (UPTOPOW2_3(v) >> 8))
+#define UPTOPOW2_5(v) (UPTOPOW2_4(v) | (UPTOPOW2_4(v) >> 16))
+
+#define UPTOPOW2(v) (UPTOPOW2_5(v) + 1)
+
+// Works only for 0 < v < 512
+#define LOG2_MSB(v) \
+ ((v) == 0 \
+ ? 0 \
+ : ((v) < 2 \
+ ? 1 \
+ : ((v) < 4 \
+ ? 2 \
+ : ((v) < 8 \
+ ? 3 \
+ : ((v) < 16 \
+ ? 4 \
+ : ((v) < 32 \
+ ? 5 \
+ : ((v) < 64 \
+ ? 6 \
+ : ((v) < 128 ? 7 \
+ : ((v) < 256 ? 8 : 9)))))))))
+
+////////////////////////////////////////////
+// Debug
+///////////////////////////////////////////
+
+#if defined(VERBOSE)
+# include <stdio.h>
+
+# define DMSG(...) \
+ { \
+ printf(__VA_ARGS__); \
+ }
+#else
+# define DMSG(...)
+#endif
+
+////////////////////////////////////////////
+// Printing
+///////////////////////////////////////////
+//#define PRINT_IN_BE
+//#define NO_SPACE
+//#define NO_NEWLINE
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.c
new file mode 100644
index 0000000000..9f779b7df9
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.c
@@ -0,0 +1,10 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#include "error.h"
+
+__thread _bike_err_t bike_errno;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.h
new file mode 100644
index 0000000000..b1b9db6d5e
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.h
@@ -0,0 +1,33 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+#include "utils/s2n_safety.h"
+
+#define SUCCESS 0
+#define FAIL (-1)
+
+#define ret_t int __attribute__((warn_unused_result))
+
+enum _bike_err
+{
+ E_DECODING_FAILURE = 1,
+ E_AES_CTR_PRF_INIT_FAIL = 2,
+ E_AES_OVER_USED = 3,
+ EXTERNAL_LIB_ERROR_OPENSSL = 4,
+ E_FAIL_TO_GET_SEED = 5
+};
+
+typedef enum _bike_err _bike_err_t;
+
+extern __thread _bike_err_t bike_errno;
+#define BIKE_ERROR(x) \
+ do { \
+ bike_errno = (x); \
+ return FAIL; \
+ } while(0)
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x.h
new file mode 100644
index 0000000000..f4cdb53a80
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x.h
@@ -0,0 +1,29 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+#include "types.h"
+
+// c = a+b mod (x^r - 1)
+_INLINE_ void
+gf2x_mod_add(OUT pad_r_t *c, IN const pad_r_t *a, IN const pad_r_t *b)
+{
+ const uint64_t *a_qwords = (const uint64_t *)a;
+ const uint64_t *b_qwords = (const uint64_t *)b;
+ uint64_t * c_qwords = (uint64_t *)c;
+
+ for(size_t i = 0; i < R_PADDED_QWORDS; i++) {
+ c_qwords[i] = a_qwords[i] ^ b_qwords[i];
+ }
+}
+
+// c = a*b mod (x^r - 1)
+void gf2x_mod_mul(OUT pad_r_t *c, IN const pad_r_t *a, IN const pad_r_t *b);
+
+// c = a^-1 mod (x^r - 1)
+void gf2x_mod_inv(OUT pad_r_t *c, IN const pad_r_t *a);
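+
+// A self-consistency check built from this API (a sketch, not part of the
+// implementation): multiplying an invertible element by its inverse must
+// yield the identity polynomial "1".
+//
+//   pad_r_t inv = {0}, prod = {0};
+//   gf2x_mod_inv(&inv, &a);        // inv  = a^-1 mod (x^r - 1)
+//   gf2x_mod_mul(&prod, &a, &inv); // prod = a * a^-1 = 1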
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_internal.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_internal.h
new file mode 100644
index 0000000000..a87478aba1
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_internal.h
@@ -0,0 +1,177 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+// For size_t
+#include <stdlib.h>
+
+#include "pq-crypto/s2n_pq.h"
+#include "types.h"
+
+// The size in quadwords of the operands in the gf2x_mul_base function
+// for different implementations.
+#define GF2X_PORT_BASE_QWORDS (1)
+#define GF2X_PCLMUL_BASE_QWORDS (8)
+#define GF2X_VPCLMUL_BASE_QWORDS (16)
+
+// ------------------ FUNCTIONS NEEDED FOR GF2X MULTIPLICATION ------------------
+// GF2X multiplication of a and b of size GF2X_*_BASE_QWORDS, c = a * b
+void gf2x_mul_base_port(OUT uint64_t *c,
+ IN const uint64_t *a,
+ IN const uint64_t *b);
+void karatzuba_add1_port(OUT uint64_t *alah,
+ OUT uint64_t *blbh,
+ IN const uint64_t *a,
+ IN const uint64_t *b,
+ IN const size_t qwords_len);
+void karatzuba_add2_port(OUT uint64_t *z,
+ IN const uint64_t *x,
+ IN const uint64_t *y,
+ IN const size_t qwords_len);
+void karatzuba_add3_port(OUT uint64_t *c,
+ IN const uint64_t *mid,
+ IN const size_t qwords_len);
+
+// -------------------- FUNCTIONS NEEDED FOR GF2X INVERSION --------------------
+// c = a^2
+void gf2x_sqr_port(OUT dbl_pad_r_t *c, IN const pad_r_t *a);
+// The k-squaring function computes c = a^(2^k) % (x^r - 1).
+// It is required by the inversion, where l_param is derived from k.
+void k_sqr_port(OUT pad_r_t *c, IN const pad_r_t *a, IN size_t l_param);
+// c = a mod (x^r - 1)
+void gf2x_red_port(OUT pad_r_t *c, IN const dbl_pad_r_t *a);
+
+// AVX2 versions of the functions
+#if defined(S2N_BIKE_R3_AVX2)
+void karatzuba_add1_avx2(OUT uint64_t *alah,
+ OUT uint64_t *blbh,
+ IN const uint64_t *a,
+ IN const uint64_t *b,
+ IN const size_t qwords_len);
+void karatzuba_add2_avx2(OUT uint64_t *z,
+ IN const uint64_t *x,
+ IN const uint64_t *y,
+ IN const size_t qwords_len);
+void karatzuba_add3_avx2(OUT uint64_t *c,
+ IN const uint64_t *mid,
+ IN const size_t qwords_len);
+void k_sqr_avx2(OUT pad_r_t *c, IN const pad_r_t *a, IN size_t l_param);
+void gf2x_red_avx2(OUT pad_r_t *c, IN const dbl_pad_r_t *a);
+#endif
+
+// AVX512 versions of the functions
+#if defined(S2N_BIKE_R3_AVX512)
+void karatzuba_add1_avx512(OUT uint64_t *alah,
+ OUT uint64_t *blbh,
+ IN const uint64_t *a,
+ IN const uint64_t *b,
+ IN const size_t qwords_len);
+void karatzuba_add2_avx512(OUT uint64_t *z,
+ IN const uint64_t *x,
+ IN const uint64_t *y,
+ IN const size_t qwords_len);
+void karatzuba_add3_avx512(OUT uint64_t *c,
+ IN const uint64_t *mid,
+ IN const size_t qwords_len);
+void k_sqr_avx512(OUT pad_r_t *c, IN const pad_r_t *a, IN size_t l_param);
+void gf2x_red_avx512(OUT pad_r_t *c, IN const dbl_pad_r_t *a);
+#endif
+
+// PCLMUL based multiplication
+#if defined(S2N_BIKE_R3_PCLMUL)
+void gf2x_mul_base_pclmul(OUT uint64_t *c,
+ IN const uint64_t *a,
+ IN const uint64_t *b);
+void gf2x_sqr_pclmul(OUT dbl_pad_r_t *c, IN const pad_r_t *a);
+#endif
+
+// VPCLMUL based multiplication
+#if defined(S2N_BIKE_R3_VPCLMUL)
+void gf2x_mul_base_vpclmul(OUT uint64_t *c,
+ IN const uint64_t *a,
+ IN const uint64_t *b);
+void gf2x_sqr_vpclmul(OUT dbl_pad_r_t *c, IN const pad_r_t *a);
+#endif
+
+// GF2X methods struct
+typedef struct gf2x_ctx_st {
+ size_t mul_base_qwords;
+ void (*mul_base)(OUT uint64_t *c, IN const uint64_t *a, IN const uint64_t *b);
+ void (*karatzuba_add1)(OUT uint64_t *alah,
+ OUT uint64_t *blbh,
+ IN const uint64_t *a,
+ IN const uint64_t *b,
+ IN const size_t qwords_len);
+ void (*karatzuba_add2)(OUT uint64_t *z,
+ IN const uint64_t *x,
+ IN const uint64_t *y,
+ IN const size_t qwords_len);
+ void (*karatzuba_add3)(OUT uint64_t *c,
+ IN const uint64_t *mid,
+ IN const size_t qwords_len);
+
+ void (*sqr)(OUT dbl_pad_r_t *c, IN const pad_r_t *a);
+ void (*k_sqr)(OUT pad_r_t *c, IN const pad_r_t *a, IN size_t l_param);
+
+ void (*red)(OUT pad_r_t *c, IN const dbl_pad_r_t *a);
+} gf2x_ctx;
+
+// Used in gf2x_inv.c to avoid initializing the context many times.
+void gf2x_mod_mul_with_ctx(OUT pad_r_t *c,
+ IN const pad_r_t *a,
+ IN const pad_r_t *b,
+ IN const gf2x_ctx *ctx);
+
+_INLINE_ void gf2x_ctx_init(gf2x_ctx *ctx)
+{
+#if defined(S2N_BIKE_R3_AVX512)
+ if(s2n_bike_r3_is_avx512_enabled()) {
+ ctx->karatzuba_add1 = karatzuba_add1_avx512;
+ ctx->karatzuba_add2 = karatzuba_add2_avx512;
+ ctx->karatzuba_add3 = karatzuba_add3_avx512;
+ ctx->k_sqr = k_sqr_avx512;
+ ctx->red = gf2x_red_avx512;
+ } else
+#endif
+#if defined(S2N_BIKE_R3_AVX2)
+ if(s2n_bike_r3_is_avx2_enabled()) {
+ ctx->karatzuba_add1 = karatzuba_add1_avx2;
+ ctx->karatzuba_add2 = karatzuba_add2_avx2;
+ ctx->karatzuba_add3 = karatzuba_add3_avx2;
+ ctx->k_sqr = k_sqr_avx2;
+ ctx->red = gf2x_red_avx2;
+ } else
+#endif
+ {
+ ctx->karatzuba_add1 = karatzuba_add1_port;
+ ctx->karatzuba_add2 = karatzuba_add2_port;
+ ctx->karatzuba_add3 = karatzuba_add3_port;
+ ctx->k_sqr = k_sqr_port;
+ ctx->red = gf2x_red_port;
+ }
+
+#if defined(S2N_BIKE_R3_VPCLMUL)
+ if(s2n_bike_r3_is_vpclmul_enabled()) {
+ ctx->mul_base_qwords = GF2X_VPCLMUL_BASE_QWORDS;
+ ctx->mul_base = gf2x_mul_base_vpclmul;
+ ctx->sqr = gf2x_sqr_vpclmul;
+ } else
+#endif
+#if defined(S2N_BIKE_R3_PCLMUL)
+ if(s2n_bike_r3_is_pclmul_enabled()) {
+ ctx->mul_base_qwords = GF2X_PCLMUL_BASE_QWORDS;
+ ctx->mul_base = gf2x_mul_base_pclmul;
+ ctx->sqr = gf2x_sqr_pclmul;
+ } else
+#endif
+ {
+ ctx->mul_base_qwords = GF2X_PORT_BASE_QWORDS;
+ ctx->mul_base = gf2x_mul_base_port;
+ ctx->sqr = gf2x_sqr_port;
+ }
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_inv.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_inv.c
new file mode 100644
index 0000000000..bea7ee84b1
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_inv.c
@@ -0,0 +1,156 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ *
+ * The inversion algorithm in this file is based on:
+ * [1] Nir Drucker, Shay Gueron, and Dusan Kostic. 2020. "Fast polynomial
+ * inversion for post quantum QC-MDPC cryptography". Cryptology ePrint Archive,
+ * 2020. https://eprint.iacr.org/2020/298.pdf
+ */
+
+#include "cleanup.h"
+#include "gf2x.h"
+#include "gf2x_internal.h"
+
+// a = a^2 mod (x^r - 1)
+_INLINE_ void gf2x_mod_sqr_in_place(IN OUT pad_r_t *a,
+ OUT dbl_pad_r_t *secure_buffer,
+ IN const gf2x_ctx *ctx)
+{
+ ctx->sqr(secure_buffer, a);
+ ctx->red(a, secure_buffer);
+}
+
+// c = a^2^2^num_sqrs
+_INLINE_ void repeated_squaring(OUT pad_r_t *c,
+ IN pad_r_t * a,
+ IN const size_t num_sqrs,
+ OUT dbl_pad_r_t *sec_buf,
+ IN const gf2x_ctx *ctx)
+{
+ c->val = a->val;
+
+ for(size_t i = 0; i < num_sqrs; i++) {
+ gf2x_mod_sqr_in_place(c, sec_buf, ctx);
+ }
+}
+
+// The gf2x_mod_inv function implements inversion in F_2[x]/(x^R - 1)
+// based on [1](Algorithm 2).
+
+// In every iteration, [1](Algorithm 2) performs two exponentiations:
+// exponentiation 0 (exp0) and exponentiation 1 (exp1) of the form f^(2^k).
+// These exponentiations are computed either by repeated squaring of f, k times,
+// or by a single k-squaring of f. The method for a specific value of k
+// is chosen based on the performance of squaring and k-squaring.
+//
+// Benchmarks on several platforms indicate that a good threshold
+// for switching from repeated squaring to k-squaring is k = 64.
+#define K_SQR_THR (64)
+
+// k-squaring is computed by a permutation of bits of the input polynomial,
+// as defined in [1](Observation 1). The required parameter for the permutation
+// is l = (2^k)^-1 % R.
+// Therefore, there are two sets of parameters for every exponentiation:
+// - exp0_k and exp1_k
+// - exp0_l and exp1_l
+
+// Exponentiation 0 computes f^(2^(2^(i-1))) for 0 < i < MAX_I.
+// Exponentiation 1 computes f^(2^((r-2) % 2^i)) for 0 < i < MAX_I,
+// only when the i-th bit of (r-2) is 1. Therefore, the value 0 in
+// exp1_k[i] and exp1_l[i] means that exp1 is skipped in the i-th iteration.
+
+// To quickly generate all the required parameters in Sage:
+// r = DESIRED_R
+// max_i = floor(log(r-2, 2)) + 1
+// exp0_k = [2^i for i in range(max_i)]
+// exp0_l = [inverse_mod((2^k) % r, r) for k in exp0_k]
+// exp1_k = [(r-2)%(2^i) if ((r-2) & (1<<i)) else 0 for i in range(max_i)]
+// exp1_l = [inverse_mod((2^k) % r, r) if k != 0 else 0 for k in exp1_k]
+
+#if(LEVEL == 1)
+// The parameters below are hard-coded for R=12323
+bike_static_assert((R_BITS == 12323), gf2x_inv_r_doesnt_match_parameters);
+
+// MAX_I = floor(log2(r-2)) + 1
+# define MAX_I (14)
+# define EXP0_K_VALS \
+ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192
+# define EXP0_L_VALS \
+ 6162, 3081, 3851, 5632, 22, 484, 119, 1838, 1742, 3106, 10650, 1608, 10157, \
+ 8816
+# define EXP1_K_VALS 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 33, 4129
+# define EXP1_L_VALS 0, 0, 0, 0, 0, 6162, 0, 0, 0, 0, 0, 0, 242, 5717
+
+#else
+// The parameters below are hard-coded for R=24659
+bike_static_assert((R_BITS == 24659), gf2x_inv_r_doesnt_match_parameters);
+
+// MAX_I = floor(log2(r-2)) + 1
+# define MAX_I (15)
+# define EXP0_K_VALS \
+ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384
+# define EXP0_L_VALS \
+ 12330, 6165, 7706, 3564, 2711, 1139, 15053, 1258, 4388, 20524, 9538, 6393, \
+ 10486, 1715, 6804
+# define EXP1_K_VALS 0, 0, 0, 0, 1, 0, 17, 0, 0, 0, 0, 0, 0, 81, 8273
+# define EXP1_L_VALS 0, 0, 0, 0, 12330, 0, 13685, 0, 0, 0, 0, 0, 0, 23678, 19056
+
+#endif
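+
+// Sanity check of the hard-coded parameters (illustrative, R = 12323):
+// exp0_k[0] = 1 gives exp0_l[0] = (2^1)^-1 mod 12323 = 6162, and indeed
+// 2 * 6162 = 12324 == 1 (mod 12323). Likewise, 4 * 3081 = 12324 == 1.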
+
+// Inversion in F_2[x]/(x^R - 1), [1](Algorithm 2).
+// c = a^{-1} mod (x^r - 1), computed as c = a^(2^(r-1) - 2) via an
+// Itoh-Tsujii-style addition chain over the binary digits of (r-2).
+void gf2x_mod_inv(OUT pad_r_t *c, IN const pad_r_t *a)
+{
+ // Initialize gf2x methods struct
+ gf2x_ctx ctx = {0};
+ gf2x_ctx_init(&ctx);
+
+  // Note that exp0/1_k/l are predefined constants that depend only on the value
+  // of R. This value is public. Therefore, branches in this function, which
+  // depend only on R, are also "public". Revealing these branches
+  // (taken/not-taken) does not leak secret information.
+ const size_t exp0_k[MAX_I] = {EXP0_K_VALS};
+ const size_t exp0_l[MAX_I] = {EXP0_L_VALS};
+ const size_t exp1_k[MAX_I] = {EXP1_K_VALS};
+ const size_t exp1_l[MAX_I] = {EXP1_L_VALS};
+
+ DEFER_CLEANUP(pad_r_t f = {0}, pad_r_cleanup);
+ DEFER_CLEANUP(pad_r_t g = {0}, pad_r_cleanup);
+ DEFER_CLEANUP(pad_r_t t = {0}, pad_r_cleanup);
+ DEFER_CLEANUP(dbl_pad_r_t sec_buf = {0}, dbl_pad_r_cleanup);
+
+ // Steps 2 and 3 in [1](Algorithm 2)
+ f.val = a->val;
+ t.val = a->val;
+
+ for(size_t i = 1; i < MAX_I; i++) {
+ // Step 5 in [1](Algorithm 2), exponentiation 0: g = f^2^2^(i-1)
+ if(exp0_k[i - 1] <= K_SQR_THR) {
+ repeated_squaring(&g, &f, exp0_k[i - 1], &sec_buf, &ctx);
+ } else {
+ ctx.k_sqr(&g, &f, exp0_l[i - 1]);
+ }
+
+ // Step 6, [1](Algorithm 2): f = f*g
+ gf2x_mod_mul_with_ctx(&f, &g, &f, &ctx);
+
+ if(exp1_k[i] != 0) {
+ // Step 8, [1](Algorithm 2), exponentiation 1: g = f^2^((r-2) % 2^i)
+ if(exp1_k[i] <= K_SQR_THR) {
+ repeated_squaring(&g, &f, exp1_k[i], &sec_buf, &ctx);
+ } else {
+ ctx.k_sqr(&g, &f, exp1_l[i]);
+ }
+
+ // Step 9, [1](Algorithm 2): t = t*g;
+ gf2x_mod_mul_with_ctx(&t, &g, &t, &ctx);
+ }
+ }
+
+ // Step 10, [1](Algorithm 2): c = t^2
+ gf2x_mod_sqr_in_place(&t, &sec_buf, &ctx);
+ c->val = t.val;
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx2.c
new file mode 100644
index 0000000000..91ed73d3f2
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx2.c
@@ -0,0 +1,188 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ *
+ * The k-squaring algorithm in this file is based on:
+ * [1] Nir Drucker, Shay Gueron, and Dusan Kostic. 2020. "Fast polynomial
+ * inversion for post quantum QC-MDPC cryptography". Cryptology ePrint Archive,
+ * 2020. https://eprint.iacr.org/2020/298.pdf
+ */
+
+#if defined(S2N_BIKE_R3_AVX2)
+
+#include "cleanup.h"
+#include "gf2x_internal.h"
+
+#define AVX2_INTERNAL
+#include "x86_64_intrinsic.h"
+
+#define NUM_YMMS (2)
+#define NUM_OF_VALS (NUM_YMMS * WORDS_IN_YMM)
+
+_INLINE_ void generate_map(OUT uint16_t *map, IN const uint16_t l_param)
+{
+ __m256i vmap[NUM_YMMS], vtmp[NUM_YMMS], vr, inc, zero;
+
+ // The permutation map is generated in the following way:
+ // 1. for i = 0 to map size:
+ // 2. map[i] = (i * l_param) % r
+ // However, to avoid the expensive multiplication and modulo operations
+ // we modify the algorithm to:
+ // 1. map[0] = l_param
+ // 2. for i = 1 to map size:
+ // 3. map[i] = map[i - 1] + l_param
+ // 4. if map[i] >= r:
+ // 5. map[i] = map[i] - r
+  // This algorithm is parallelized with vector instructions by processing
+  // a certain number of values (NUM_OF_VALS) in parallel. Therefore,
+  // we begin by initializing the first NUM_OF_VALS elements.
+ for(size_t i = 0; i < NUM_OF_VALS; i++) {
+ map[i] = (i * l_param) % R_BITS;
+ }
+
+ vr = SET1_I16(R_BITS);
+ zero = SET_ZERO;
+
+ // Set the increment vector such that adding it to vmap vectors
+ // gives the next NUM_OF_VALS elements of the map. AVX2 does not
+ // support comparison of vectors where vector elements are considered
+ // as unsigned integers. This is a problem when r > 2^14 because
+  // the sum of two values can be greater than 2^15, which would make it
+ // a negative number when considered as a signed 16-bit integer,
+ // and therefore, the condition in step 4 of the algorithm would be
+ // evaluated incorrectly. So, we use the following trick:
+ // we subtract R from the increment and modify the algorithm:
+ // 1. map[0] = l_param
+ // 2. for i = 1 to map size:
+ // 3. map[i] = map[i - 1] + (l_param - r)
+ // 4. if map[i] < 0:
+ // 5. map[i] = map[i] + r
+ inc = SET1_I16((l_param * NUM_OF_VALS) % R_BITS);
+ inc = SUB_I16(inc, vr);
+
+ // Load the first NUM_OF_VALS elements in the vmap vectors
+ for(size_t i = 0; i < NUM_YMMS; i++) {
+ vmap[i] = LOAD(&map[i * WORDS_IN_YMM]);
+ }
+
+ for(size_t i = NUM_YMMS; i < (R_PADDED / WORDS_IN_YMM); i += NUM_YMMS) {
+ for(size_t j = 0; j < NUM_YMMS; j++) {
+ vmap[j] = ADD_I16(vmap[j], inc);
+ vtmp[j] = CMPGT_I16(zero, vmap[j]);
+ vmap[j] = ADD_I16(vmap[j], vtmp[j] & vr);
+
+ STORE(&map[(i + j) * WORDS_IN_YMM], vmap[j]);
+ }
+ }
+}
+
+// Convert from byte representation, where every byte holds a single bit
+// of the polynomial, to binary representation, where every byte
+// holds 8 bits of the polynomial.
+_INLINE_ void bytes_to_bin(OUT pad_r_t *bin_buf, IN const uint8_t *bytes_buf)
+{
+ uint32_t *bin32 = (uint32_t *)bin_buf;
+
+ for(size_t i = 0; i < R_QWORDS * 2; i++) {
+ __m256i t = LOAD(&bytes_buf[i * BYTES_IN_YMM]);
+ bin32[i] = MOVEMASK(t);
+ }
+}
+
+// Convert from binary representation where every byte holds 8 bits
+// of the polynomial, to byte representation where
+// every byte holds a single bit of the polynomial.
+_INLINE_ void bin_to_bytes(OUT uint8_t *bytes_buf, IN const pad_r_t *bin_buf)
+{
+ // The algorithm works by taking every 32 bits of the input and converting
+ // them to 32 bytes where each byte holds one of the bits. The first step is
+ // to broadcast a 32-bit value (call it a) to all elements of vector t.
+ // Then t contains bytes of a in the following order:
+ // t = [ a3 a2 a1 a0 ... a3 a2 a1 a0 ]
+ // where a0 contains the first 8 bits of a, a1 the second 8 bits, etc.
+ // Let the output vector be [ out31 out30 ... out0 ]. We want to store
+  // bit 0 of a in the out0 byte, bit 1 of a in the out1 byte, etc. (note that
+  // we want to store the bit in the most significant position of a byte
+  // because this is required by the MOVEMASK instruction used in bytes_to_bin.)
+ //
+  // Ideally, we would shuffle the bytes of t such that the byte in the
+  // i-th position contains the i-th bit of a, shift t appropriately, and obtain
+  // the result. However, AVX2 doesn't support shift operations on bytes; only
+  // shifts of individual QWORDs (64-bit) and DWORDs (32-bit) are allowed.
+ // Consider the two least significant DWORDS of t:
+ // t = [ ... | a3 a2 a1 a0 | a3 a2 a1 a0 ]
+ // and shift them by 6 and 4 to the left, respectively, to obtain:
+ // t = [ ... | t7 t6 t5 t4 | t3 t2 t1 t0 ]
+ // where t3 = a3 << 6, t2 = a2 << 6, t1 = a1 << 6, t0 = a0 << 6,
+ // and t7 = a3 << 4, t6 = a2 << 4, t5 = a1 << 4, t4 = a0 << 4.
+ // Now we shuffle vector t to obtain vector p such that:
+ // p = [ ... | t12 t12 t8 t8 | t4 t4 t0 t0 ]
+ // Note that in every even position of the vector p we have the right byte
+ // of the input shifted by the required shift. The values in the odd
+ // positions contain the right bytes of the input but they need to be shifted
+ // one more time to the left by 1. By shifting each DWORD of p by 1 we get:
+ // q = [ ... | p7 p6 p5 p4 | p3 p2 p1 p0 ]
+  // where p1 = t0 << 1 = a0 << 7, p3 = t4 << 1 = a0 << 5, etc. Therefore, by
+ // blending p and q (taking even positions from p and odd positions from q)
+ // we obtain the desired result.
+
+ __m256i t, p, q;
+
+ const __m256i shift_mask = SET_I32(0, 2, 4, 6, 0, 2, 4, 6);
+
+ const __m256i shuffle_mask =
+ SET_I8(15, 15, 11, 11, 7, 7, 3, 3, 14, 14, 10, 10, 6, 6, 2, 2, 13, 13, 9, 9,
+ 5, 5, 1, 1, 12, 12, 8, 8, 4, 4, 0, 0);
+
+ const __m256i blend_mask = SET1_I16(0x00ff);
+
+ const uint32_t *bin32 = (const uint32_t *)bin_buf;
+
+ for(size_t i = 0; i < R_QWORDS * 2; i++) {
+ t = SET1_I32(bin32[i]);
+ t = SLLV_I32(t, shift_mask);
+
+ p = SHUF_I8(t, shuffle_mask);
+ q = SLLI_I32(p, 1);
+
+ STORE(&bytes_buf[i * 32], BLENDV_I8(p, q, blend_mask));
+ }
+}
+
+// The k-squaring function computes c = a^(2^k) % (x^r - 1).
+// By [1](Observation 1), if
+// a = sum_{j in supp(a)} x^j,
+// then
+// a^(2^k) % (x^r - 1) = sum_{j in supp(a)} x^((j * 2^k) % r).
+// Therefore, k-squaring can be computed as a permutation of the bits of "a":
+// pi0 : j --> (j * 2^k) % r.
+// For improved performance, we compute the result by the inverse permutation pi1:
+// pi1 : (j * 2^-k) % r --> j.
+// Input argument l_param is defined as the value (2^-k) % r.
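+//
+// Toy example (r = 13 is not a BIKE parameter): for k = 2, l_param =
+// (2^-2) % 13 = 10, since 4 * 10 = 40 = 1 (mod 13). If a = 1 + x, then
+// a^(2^2) % (x^13 - 1) = 1 + x^4; indeed, pi1 maps output bit 4 to input bit
+// (4 * 10) % 13 = 1, and output bit 0 to input bit 0.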
+void k_sqr_avx2(OUT pad_r_t *c, IN const pad_r_t *a, IN const size_t l_param)
+{
+ ALIGN(ALIGN_BYTES) uint16_t map[R_PADDED];
+ ALIGN(ALIGN_BYTES) uint8_t a_bytes[R_PADDED];
+ ALIGN(ALIGN_BYTES) uint8_t c_bytes[R_PADDED] = {0};
+
+ // Generate the permutation map defined by pi1 and l_param.
+ generate_map(map, l_param);
+
+ bin_to_bytes(a_bytes, a);
+
+ // Permute "a" using the generated permutation map.
+ for(size_t i = 0; i < R_BITS; i++) {
+ c_bytes[i] = a_bytes[map[i]];
+ }
+
+ bytes_to_bin(c, c_bytes);
+
+ secure_clean(a_bytes, sizeof(a_bytes));
+ secure_clean(c_bytes, sizeof(c_bytes));
+}
+
+#endif
+
+typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx512.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx512.c
new file mode 100644
index 0000000000..af2c5738a8
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx512.c
@@ -0,0 +1,135 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ *
+ * The k-squaring algorithm in this file is based on:
+ * [1] Nir Drucker, Shay Gueron, and Dusan Kostic. 2020. "Fast polynomial
+ * inversion for post quantum QC-MDPC cryptography". Cryptology ePrint Archive,
+ * 2020. https://eprint.iacr.org/2020/298.pdf
+ */
+
+#if defined(S2N_BIKE_R3_AVX512)
+
+#include "cleanup.h"
+#include "gf2x_internal.h"
+
+#define AVX512_INTERNAL
+#include "x86_64_intrinsic.h"
+
+#define NUM_ZMMS (2)
+#define NUM_OF_VALS (NUM_ZMMS * WORDS_IN_ZMM)
+
+// clang-3.9 doesn't recognize these two macros
+#if !defined(_MM_CMPINT_EQ)
+# define _MM_CMPINT_EQ (0)
+#endif
+
+#if !defined(_MM_CMPINT_NLT)
+# define _MM_CMPINT_NLT (5)
+#endif
+
+_INLINE_ void generate_map(OUT uint16_t *map, IN const size_t l_param)
+{
+ __m512i vmap[NUM_ZMMS], vr, inc;
+ __mmask32 mask[NUM_ZMMS];
+
+ // The permutation map is generated in the following way:
+ // 1. for i = 0 to map size:
+ // 2. map[i] = (i * l_param) % r
+ // However, to avoid the expensive multiplication and modulo operations
+ // we modify the algorithm to:
+ // 1. map[0] = l_param
+ // 2. for i = 1 to map size:
+ // 3. map[i] = map[i - 1] + l_param
+ // 4. if map[i] >= r:
+ // 5. map[i] = map[i] - r
+  // This algorithm is parallelized with vector instructions by processing
+  // a certain number of values (NUM_OF_VALS) in parallel. Therefore,
+  // we begin by initializing the first NUM_OF_VALS elements.
+ for(size_t i = 0; i < NUM_OF_VALS; i++) {
+ map[i] = (i * l_param) % R_BITS;
+ }
+
+ // Set the increment vector such that by adding it to vmap vectors
+ // we will obtain the next NUM_OF_VALS elements of the map.
+ inc = SET1_I16((l_param * NUM_OF_VALS) % R_BITS);
+ vr = SET1_I16(R_BITS);
+
+ // Load the first NUM_OF_VALS elements in the vmap vectors
+ for(size_t i = 0; i < NUM_ZMMS; i++) {
+ vmap[i] = LOAD(&map[i * WORDS_IN_ZMM]);
+ }
+
+ for(size_t i = NUM_ZMMS; i < (R_PADDED / WORDS_IN_ZMM); i += NUM_ZMMS) {
+ for(size_t j = 0; j < NUM_ZMMS; j++) {
+ vmap[j] = ADD_I16(vmap[j], inc);
+ mask[j] = CMPM_U16(vmap[j], vr, _MM_CMPINT_NLT);
+ vmap[j] = MSUB_I16(vmap[j], mask[j], vmap[j], vr);
+
+ STORE(&map[(i + j) * WORDS_IN_ZMM], vmap[j]);
+ }
+ }
+}
+
+// Convert from byte representation, where each byte holds a single bit,
+// to binary representation, where each byte holds 8 bits of the polynomial
+_INLINE_ void bytes_to_bin(OUT pad_r_t *bin_buf, IN const uint8_t *bytes_buf)
+{
+ uint64_t *bin64 = (uint64_t *)bin_buf;
+
+ __m512i first_bit_mask = SET1_I8(1);
+ for(size_t i = 0; i < R_QWORDS; i++) {
+ __m512i t = LOAD(&bytes_buf[i * BYTES_IN_ZMM]);
+ bin64[i] = CMPM_U8(t, first_bit_mask, _MM_CMPINT_EQ);
+ }
+}
+
+// Convert from binary representation where each byte holds 8 bits
+// to byte representation where each byte holds a single bit of the polynomial
+_INLINE_ void bin_to_bytes(OUT uint8_t *bytes_buf, IN const pad_r_t *bin_buf)
+{
+ const uint64_t *bin64 = (const uint64_t *)bin_buf;
+
+ for(size_t i = 0; i < R_QWORDS; i++) {
+ __m512i t = SET1MZ_I8(bin64[i], 1);
+ STORE(&bytes_buf[i * BYTES_IN_ZMM], t);
+ }
+}
+
+// The k-squaring function computes c = a^(2^k) % (x^r - 1).
+// By [1](Observation 1), if
+// a = sum_{j in supp(a)} x^j,
+// then
+// a^(2^k) % (x^r - 1) = sum_{j in supp(a)} x^((j * 2^k) % r).
+// Therefore, k-squaring can be computed as a permutation of the bits of "a":
+// pi0 : j --> (j * 2^k) % r.
+// For improved performance, we compute the result by the inverse permutation pi1:
+// pi1 : (j * 2^-k) % r --> j.
+// Input argument l_param is defined as the value (2^-k) % r.
+void k_sqr_avx512(OUT pad_r_t *c, IN const pad_r_t *a, IN const size_t l_param)
+{
+ ALIGN(ALIGN_BYTES) uint16_t map[R_PADDED];
+ ALIGN(ALIGN_BYTES) uint8_t a_bytes[R_PADDED];
+ ALIGN(ALIGN_BYTES) uint8_t c_bytes[R_PADDED] = {0};
+
+ // Generate the permutation map defined by pi1 and l_param.
+ generate_map(map, l_param);
+
+ bin_to_bytes(a_bytes, a);
+
+ // Permute "a" using the generated permutation map.
+ for(size_t i = 0; i < R_BITS; i++) {
+ c_bytes[i] = a_bytes[map[i]];
+ }
+
+ bytes_to_bin(c, c_bytes);
+
+ secure_clean(a_bytes, sizeof(a_bytes));
+ secure_clean(c_bytes, sizeof(c_bytes));
+}
+
+#endif
+
+typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_portable.c
new file mode 100644
index 0000000000..c757687f58
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_portable.c
@@ -0,0 +1,48 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ *
+ * The k-squaring algorithm in this file is based on:
+ * [1] Nir Drucker, Shay Gueron, and Dusan Kostic. 2020. "Fast polynomial
+ * inversion for post quantum QC-MDPC cryptography". Cryptology ePrint Archive,
+ * 2020. https://eprint.iacr.org/2020/298.pdf
+ */
+
+#include "gf2x_internal.h"
+#include "utilities.h"
+
+#define BITS_IN_BYTE (8)
+
+// The k-squaring function computes c = a^(2^k) % (x^r - 1).
+// By [1](Observation 1), if
+// a = sum_{j in supp(a)} x^j,
+// then
+// a^(2^k) % (x^r - 1) = sum_{j in supp(a)} x^((j * 2^k) % r).
+// Therefore, k-squaring can be computed as a permutation of the bits of "a":
+// pi0 : j --> (j * 2^k) % r.
+// For improved performance, we compute the result by the inverse permutation pi1:
+// pi1 : (j * 2^-k) % r --> j.
+// Input argument l_param is defined as the value (2^-k) % r.
+void k_sqr_port(OUT pad_r_t *c, IN const pad_r_t *a, IN const size_t l_param)
+{
+ bike_memset(c->val.raw, 0, sizeof(c->val));
+
+ // Compute the result byte by byte
+ size_t idx = 0;
+ for(size_t i = 0; i < R_BYTES; i++) {
+ for(size_t j = 0; j < BITS_IN_BYTE; j++, idx++) {
+ // Bit of "c" at position idx is set to the value of
+ // the bit of "a" at position pi1(idx) = (l_param * idx) % R_BITS.
+ size_t pos = (l_param * idx) % R_BITS;
+
+ size_t pos_byte = pos >> 3;
+ size_t pos_bit = pos & 7;
+ uint8_t bit = (a->val.raw[pos_byte] >> pos_bit) & 1;
+
+ c->val.raw[i] |= (bit << j);
+ }
+ }
+ c->val.raw[R_BYTES - 1] &= LAST_R_BYTE_MASK;
+}
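+
+// Sanity property (illustrative): with l_param = 1 (i.e., k = 0, since
+// (2^-0) % r = 1), pi1 is the identity permutation, and k_sqr_port(c, a, 1)
+// simply copies "a" into "c".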
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul.c
new file mode 100644
index 0000000000..ae1d7a510a
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul.c
@@ -0,0 +1,113 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#include <assert.h>
+
+#include "cleanup.h"
+#include "gf2x.h"
+#include "gf2x_internal.h"
+
+// The secure buffer size required for Karatsuba is given by the recursion:
+// size(n) = 3*n/2 + size(n/2) = 3*(n/2 + n/4 + ...) = 3*sum_{i>=1}{n/2^i} < 3n
+#define SECURE_BUFFER_QWORDS (3 * R_PADDED_QWORDS)
+
+// Karatsuba multiplication algorithm.
+// Input arguments a and b are padded with zeros; here:
+// - n: the real number of digits in a and b (R_QWORDS)
+// - n_padded: the padded number of digits of a and b (assumed to be a power of 2)
+// A buffer sec_buf is used for storing temporary data between recursion calls.
+// It might contain secrets, and therefore should be securely cleaned after
+// completion.
+_INLINE_ void karatzuba(OUT uint64_t *c,
+ IN const uint64_t *a,
+ IN const uint64_t *b,
+ IN const size_t qwords_len,
+ IN const size_t qwords_len_pad,
+ uint64_t * sec_buf,
+ IN const gf2x_ctx *ctx)
+{
+ if(qwords_len <= ctx->mul_base_qwords) {
+ ctx->mul_base(c, a, b);
+ return;
+ }
+
+ const size_t half_qw_len = qwords_len_pad >> 1;
+
+ // Split a and b into low and high parts of size n_padded/2
+ const uint64_t *a_lo = a;
+ const uint64_t *b_lo = b;
+ const uint64_t *a_hi = &a[half_qw_len];
+ const uint64_t *b_hi = &b[half_qw_len];
+
+ // Split c into 4 parts of size n_padded/2 (the last ptr is not needed)
+ uint64_t *c0 = c;
+ uint64_t *c1 = &c[half_qw_len];
+ uint64_t *c2 = &c[half_qw_len * 2];
+
+ // Allocate 3 ptrs of size n_padded/2 on sec_buf
+ uint64_t *alah = sec_buf;
+ uint64_t *blbh = &sec_buf[half_qw_len];
+ uint64_t *tmp = &sec_buf[half_qw_len * 2];
+
+ // Move sec_buf ptr to the first free location for the next recursion call
+ sec_buf = &sec_buf[half_qw_len * 3];
+
+ // Compute a_lo*b_lo and store the result in (c1|c0)
+ karatzuba(c0, a_lo, b_lo, half_qw_len, half_qw_len, sec_buf, ctx);
+
+  // If the real number of digits n is less than or equal to n_padded/2 then:
+ // a_hi = 0 and b_hi = 0
+ // and
+ // (a_hi|a_lo)*(b_hi|b_lo) = a_lo*b_lo
+ // so we can skip the remaining two multiplications
+ if(qwords_len > half_qw_len) {
+ // Compute a_hi*b_hi and store the result in (c3|c2)
+ karatzuba(c2, a_hi, b_hi, qwords_len - half_qw_len, half_qw_len, sec_buf,
+ ctx);
+
+ // Compute alah = (a_lo + a_hi) and blbh = (b_lo + b_hi)
+ ctx->karatzuba_add1(alah, blbh, a, b, half_qw_len);
+
+ // Compute (c1 + c2) and store the result in tmp
+ ctx->karatzuba_add2(tmp, c1, c2, half_qw_len);
+
+ // Compute alah*blbh and store the result in (c2|c1)
+ karatzuba(c1, alah, blbh, half_qw_len, half_qw_len, sec_buf, ctx);
+
+ // Add (tmp|tmp) and (c3|c0) to (c2|c1)
+ ctx->karatzuba_add3(c0, tmp, half_qw_len);
+ }
+}
+
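+// For reference, the identity implemented above, over GF(2)[x] where
+// addition is XOR and X = x^(64*half_qw_len):
+//   (a_hi*X + a_lo) * (b_hi*X + b_lo) =
+//       a_hi*b_hi * X^2
+//     + ((a_lo + a_hi)*(b_lo + b_hi) + a_hi*b_hi + a_lo*b_lo) * X
+//     + a_lo*b_lo
+// so each recursion level costs only three half-size multiplications.
+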
+void gf2x_mod_mul_with_ctx(OUT pad_r_t *c,
+ IN const pad_r_t *a,
+ IN const pad_r_t *b,
+ IN const gf2x_ctx *ctx)
+{
+ bike_static_assert((R_PADDED_BYTES % 2 == 0), karatzuba_n_is_odd);
+
+ DEFER_CLEANUP(dbl_pad_r_t t = {0}, dbl_pad_r_cleanup);
+ ALIGN(ALIGN_BYTES) uint64_t secure_buffer[SECURE_BUFFER_QWORDS];
+
+ karatzuba((uint64_t *)&t, (const uint64_t *)a, (const uint64_t *)b, R_QWORDS,
+ R_PADDED_QWORDS, secure_buffer, ctx);
+
+ ctx->red(c, &t);
+
+ secure_clean((uint8_t *)secure_buffer, sizeof(secure_buffer));
+}
+
+void gf2x_mod_mul(OUT pad_r_t *c, IN const pad_r_t *a, IN const pad_r_t *b)
+{
+ bike_static_assert((R_PADDED_BYTES % 2 == 0), karatzuba_n_is_odd);
+
+ // Initialize gf2x methods struct
+ gf2x_ctx ctx = {0};
+ gf2x_ctx_init(&ctx);
+
+ gf2x_mod_mul_with_ctx(c, a, b, &ctx);
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx2.c
new file mode 100644
index 0000000000..8f9c17dc09
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx2.c
@@ -0,0 +1,109 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#if defined(S2N_BIKE_R3_AVX2)
+
+#include <assert.h>
+
+#include "cleanup.h"
+#include "gf2x_internal.h"
+
+#define AVX2_INTERNAL
+#include "x86_64_intrinsic.h"
+
+void karatzuba_add1_avx2(OUT uint64_t *alah,
+ OUT uint64_t *blbh,
+ IN const uint64_t *a,
+ IN const uint64_t *b,
+ IN const size_t qwords_len)
+{
+ assert(qwords_len % REG_QWORDS == 0);
+
+ REG_T va0, va1, vb0, vb1;
+
+ for(size_t i = 0; i < qwords_len; i += REG_QWORDS) {
+ va0 = LOAD(&a[i]);
+ va1 = LOAD(&a[i + qwords_len]);
+ vb0 = LOAD(&b[i]);
+ vb1 = LOAD(&b[i + qwords_len]);
+
+ STORE(&alah[i], va0 ^ va1);
+ STORE(&blbh[i], vb0 ^ vb1);
+ }
+}
+
+void karatzuba_add2_avx2(OUT uint64_t *z,
+ IN const uint64_t *x,
+ IN const uint64_t *y,
+ IN const size_t qwords_len)
+{
+ assert(qwords_len % REG_QWORDS == 0);
+
+ REG_T vx, vy;
+
+ for(size_t i = 0; i < qwords_len; i += REG_QWORDS) {
+ vx = LOAD(&x[i]);
+ vy = LOAD(&y[i]);
+
+ STORE(&z[i], vx ^ vy);
+ }
+}
+
+void karatzuba_add3_avx2(OUT uint64_t *c,
+ IN const uint64_t *mid,
+ IN const size_t qwords_len)
+{
+ assert(qwords_len % REG_QWORDS == 0);
+
+ REG_T vr0, vr1, vr2, vr3, vt;
+
+ uint64_t *c0 = c;
+ uint64_t *c1 = &c[qwords_len];
+ uint64_t *c2 = &c[2 * qwords_len];
+ uint64_t *c3 = &c[3 * qwords_len];
+
+ for(size_t i = 0; i < qwords_len; i += REG_QWORDS) {
+ vr0 = LOAD(&c0[i]);
+ vr1 = LOAD(&c1[i]);
+ vr2 = LOAD(&c2[i]);
+ vr3 = LOAD(&c3[i]);
+ vt = LOAD(&mid[i]);
+
+ STORE(&c1[i], vt ^ vr0 ^ vr1);
+ STORE(&c2[i], vt ^ vr2 ^ vr3);
+ }
+}
+
+// c = a mod (x^r - 1)
+void gf2x_red_avx2(OUT pad_r_t *c, IN const dbl_pad_r_t *a)
+{
+ const uint64_t *a64 = (const uint64_t *)a;
+ uint64_t * c64 = (uint64_t *)c;
+
+ for(size_t i = 0; i < R_QWORDS; i += REG_QWORDS) {
+ REG_T vt0 = LOAD(&a64[i]);
+ REG_T vt1 = LOAD(&a64[i + R_QWORDS]);
+ REG_T vt2 = LOAD(&a64[i + R_QWORDS - 1]);
+
+ vt1 = SLLI_I64(vt1, LAST_R_QWORD_TRAIL);
+ vt2 = SRLI_I64(vt2, LAST_R_QWORD_LEAD);
+
+ vt0 ^= (vt1 | vt2);
+
+ STORE(&c64[i], vt0);
+ }
+
+ c64[R_QWORDS - 1] &= LAST_R_QWORD_MASK;
+
+ // Clean the secrets from the upper part of c
+ secure_clean((uint8_t *)&c64[R_QWORDS],
+ (R_PADDED_QWORDS - R_QWORDS) * sizeof(uint64_t));
+}
+
+#endif
+
+typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx512.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx512.c
new file mode 100644
index 0000000000..78ce9683ad
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx512.c
@@ -0,0 +1,109 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#if defined(S2N_BIKE_R3_AVX512)
+
+#include <assert.h>
+
+#include "cleanup.h"
+#include "gf2x_internal.h"
+
+#define AVX512_INTERNAL
+#include "x86_64_intrinsic.h"
+
+void karatzuba_add1_avx512(OUT uint64_t *alah,
+ OUT uint64_t *blbh,
+ IN const uint64_t *a,
+ IN const uint64_t *b,
+ IN const size_t qwords_len)
+{
+ assert(qwords_len % REG_QWORDS == 0);
+
+ REG_T va0, va1, vb0, vb1;
+
+ for(size_t i = 0; i < qwords_len; i += REG_QWORDS) {
+ va0 = LOAD(&a[i]);
+ va1 = LOAD(&a[i + qwords_len]);
+ vb0 = LOAD(&b[i]);
+ vb1 = LOAD(&b[i + qwords_len]);
+
+ STORE(&alah[i], va0 ^ va1);
+ STORE(&blbh[i], vb0 ^ vb1);
+ }
+}
+
+void karatzuba_add2_avx512(OUT uint64_t *z,
+ IN const uint64_t *x,
+ IN const uint64_t *y,
+ IN const size_t qwords_len)
+{
+ assert(qwords_len % REG_QWORDS == 0);
+
+ REG_T vx, vy;
+
+ for(size_t i = 0; i < qwords_len; i += REG_QWORDS) {
+ vx = LOAD(&x[i]);
+ vy = LOAD(&y[i]);
+
+ STORE(&z[i], vx ^ vy);
+ }
+}
+
+void karatzuba_add3_avx512(OUT uint64_t *c,
+ IN const uint64_t *mid,
+ IN const size_t qwords_len)
+{
+ assert(qwords_len % REG_QWORDS == 0);
+
+ REG_T vr0, vr1, vr2, vr3, vt;
+
+ uint64_t *c0 = c;
+ uint64_t *c1 = &c[qwords_len];
+ uint64_t *c2 = &c[2 * qwords_len];
+ uint64_t *c3 = &c[3 * qwords_len];
+
+ for(size_t i = 0; i < qwords_len; i += REG_QWORDS) {
+ vr0 = LOAD(&c0[i]);
+ vr1 = LOAD(&c1[i]);
+ vr2 = LOAD(&c2[i]);
+ vr3 = LOAD(&c3[i]);
+ vt = LOAD(&mid[i]);
+
+ STORE(&c1[i], vt ^ vr0 ^ vr1);
+ STORE(&c2[i], vt ^ vr2 ^ vr3);
+ }
+}
+
+// c = a mod (x^r - 1)
+void gf2x_red_avx512(OUT pad_r_t *c, IN const dbl_pad_r_t *a)
+{
+ const uint64_t *a64 = (const uint64_t *)a;
+ uint64_t * c64 = (uint64_t *)c;
+
+ for(size_t i = 0; i < R_QWORDS; i += REG_QWORDS) {
+ REG_T vt0 = LOAD(&a64[i]);
+ REG_T vt1 = LOAD(&a64[i + R_QWORDS]);
+ REG_T vt2 = LOAD(&a64[i + R_QWORDS - 1]);
+
+ vt1 = SLLI_I64(vt1, LAST_R_QWORD_TRAIL);
+ vt2 = SRLI_I64(vt2, LAST_R_QWORD_LEAD);
+
+ vt0 ^= (vt1 | vt2);
+
+ STORE(&c64[i], vt0);
+ }
+
+ c64[R_QWORDS - 1] &= LAST_R_QWORD_MASK;
+
+ // Clean the secrets from the upper part of c
+ secure_clean((uint8_t *)&c64[R_QWORDS],
+ (R_PADDED_QWORDS - R_QWORDS) * sizeof(uint64_t));
+}
+
+#endif
+
+typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_pclmul.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_pclmul.c
new file mode 100644
index 0000000000..1d4553997c
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_pclmul.c
@@ -0,0 +1,155 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#if defined(S2N_BIKE_R3_PCLMUL)
+
+#include <immintrin.h>
+
+#include "gf2x_internal.h"
+
+#define LOAD128(mem) _mm_loadu_si128((const void *)(mem))
+#define STORE128(mem, reg) _mm_storeu_si128((void *)(mem), (reg))
+#define UNPACKLO(x, y) _mm_unpacklo_epi64((x), (y))
+#define UNPACKHI(x, y) _mm_unpackhi_epi64((x), (y))
+#define CLMUL(x, y, imm) _mm_clmulepi64_si128((x), (y), (imm))
+#define BSRLI(x, imm) _mm_srli_si128((x), (imm))
+#define BSLLI(x, imm) _mm_slli_si128((x), (imm))
+
+// 4x4 Karatsuba multiplication
+_INLINE_ void gf2x_mul4_int(OUT __m128i c[4],
+ IN const __m128i a_lo,
+ IN const __m128i a_hi,
+ IN const __m128i b_lo,
+ IN const __m128i b_hi)
+{
+ // a_lo = [a1 | a0]; a_hi = [a3 | a2];
+ // b_lo = [b1 | b0]; b_hi = [b3 | b2];
+ // 4x4 Karatsuba requires three 2x2 multiplications:
+ // (1) a_lo * b_lo
+ // (2) a_hi * b_hi
+ // (3) aa * bb = (a_lo + a_hi) * (b_lo + b_hi)
+ // Each of the three 2x2 multiplications requires three 1x1 multiplications:
+ // (1) is computed by a0*b0, a1*b1, (a0+a1)*(b0+b1)
+ // (2) is computed by a2*b2, a3*b3, (a2+a3)*(b2+b3)
+ // (3) is computed by aa0*bb0, aa1*bb1, (aa0+aa1)*(bb0+bb1)
+ // All the required additions are performed in the end.
+
+ __m128i aa, bb;
+ __m128i xx, yy, uu, vv, m;
+ __m128i lo[2], hi[2], mi[2];
+ __m128i t[9];
+
+ aa = a_lo ^ a_hi;
+ bb = b_lo ^ b_hi;
+
+ // xx <-- [(a2+a3) | (a0+a1)]
+ // yy <-- [(b2+b3) | (b0+b1)]
+ xx = UNPACKLO(a_lo, a_hi);
+ yy = UNPACKLO(b_lo, b_hi);
+ xx = xx ^ UNPACKHI(a_lo, a_hi);
+ yy = yy ^ UNPACKHI(b_lo, b_hi);
+
+ // uu <-- [ 0 | (aa0+aa1)]
+ // vv <-- [ 0 | (bb0+bb1)]
+ uu = aa ^ BSRLI(aa, 8);
+ vv = bb ^ BSRLI(bb, 8);
+
+ // 9 multiplications
+ t[0] = CLMUL(a_lo, b_lo, 0x00);
+ t[1] = CLMUL(a_lo, b_lo, 0x11);
+ t[2] = CLMUL(a_hi, b_hi, 0x00);
+ t[3] = CLMUL(a_hi, b_hi, 0x11);
+ t[4] = CLMUL(xx, yy, 0x00);
+ t[5] = CLMUL(xx, yy, 0x11);
+ t[6] = CLMUL(aa, bb, 0x00);
+ t[7] = CLMUL(aa, bb, 0x11);
+ t[8] = CLMUL(uu, vv, 0x00);
+
+ t[4] ^= (t[0] ^ t[1]);
+ t[5] ^= (t[2] ^ t[3]);
+ t[8] ^= (t[6] ^ t[7]);
+
+ lo[0] = t[0] ^ BSLLI(t[4], 8);
+ lo[1] = t[1] ^ BSRLI(t[4], 8);
+ hi[0] = t[2] ^ BSLLI(t[5], 8);
+ hi[1] = t[3] ^ BSRLI(t[5], 8);
+ mi[0] = t[6] ^ BSLLI(t[8], 8);
+ mi[1] = t[7] ^ BSRLI(t[8], 8);
+
+ m = lo[1] ^ hi[0];
+
+ c[0] = lo[0];
+ c[1] = lo[0] ^ mi[0] ^ m;
+ c[2] = hi[1] ^ mi[1] ^ m;
+ c[3] = hi[1];
+}
+
+// 512x512-bit multiplication performed by the Karatsuba algorithm,
+// where a and b are considered as having 8 digits of 64 bits each.
+void gf2x_mul_base_pclmul(OUT uint64_t *c,
+ IN const uint64_t *a,
+ IN const uint64_t *b)
+{
+ __m128i va[4], vb[4];
+ __m128i aa[2], bb[2];
+ __m128i lo[4], hi[4], mi[4], m[2];
+
+ for(size_t i = 0; i < 4; i++) {
+ va[i] = LOAD128(&a[QWORDS_IN_XMM * i]);
+ vb[i] = LOAD128(&b[QWORDS_IN_XMM * i]);
+ }
+
+ // Multiply the low and the high halves of a and b
+ // lo <-- a_lo * b_lo
+ // hi <-- a_hi * b_hi
+ gf2x_mul4_int(lo, va[0], va[1], vb[0], vb[1]);
+ gf2x_mul4_int(hi, va[2], va[3], vb[2], vb[3]);
+
+ // Compute the middle multiplication
+ // aa <-- a_lo + a_hi
+ // bb <-- b_lo + b_hi
+ // mi <-- aa * bb
+ aa[0] = va[0] ^ va[2];
+ aa[1] = va[1] ^ va[3];
+ bb[0] = vb[0] ^ vb[2];
+ bb[1] = vb[1] ^ vb[3];
+ gf2x_mul4_int(mi, aa[0], aa[1], bb[0], bb[1]);
+
+ m[0] = lo[2] ^ hi[0];
+ m[1] = lo[3] ^ hi[1];
+
+ STORE128(&c[0 * QWORDS_IN_XMM], lo[0]);
+ STORE128(&c[1 * QWORDS_IN_XMM], lo[1]);
+ STORE128(&c[2 * QWORDS_IN_XMM], mi[0] ^ lo[0] ^ m[0]);
+ STORE128(&c[3 * QWORDS_IN_XMM], mi[1] ^ lo[1] ^ m[1]);
+ STORE128(&c[4 * QWORDS_IN_XMM], mi[2] ^ hi[2] ^ m[0]);
+ STORE128(&c[5 * QWORDS_IN_XMM], mi[3] ^ hi[3] ^ m[1]);
+ STORE128(&c[6 * QWORDS_IN_XMM], hi[2]);
+ STORE128(&c[7 * QWORDS_IN_XMM], hi[3]);
+}
+
+void gf2x_sqr_pclmul(OUT dbl_pad_r_t *c, IN const pad_r_t *a)
+{
+ __m128i va, vr0, vr1;
+
+ const uint64_t *a64 = (const uint64_t *)a;
+ uint64_t * c64 = (uint64_t *)c;
+
+ for(size_t i = 0; i < (R_XMM * QWORDS_IN_XMM); i += QWORDS_IN_XMM) {
+ va = LOAD128(&a64[i]);
+
+ vr0 = CLMUL(va, va, 0x00);
+ vr1 = CLMUL(va, va, 0x11);
+
+ STORE128(&c64[i * 2], vr0);
+ STORE128(&c64[i * 2 + QWORDS_IN_XMM], vr1);
+ }
+}
+
+#endif
+
+typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_portable.c
new file mode 100644
index 0000000000..86c21a1e28
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_portable.c
@@ -0,0 +1,77 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#include "gf2x_internal.h"
+#include "utilities.h"
+
+#define LSB3(x) ((x)&7)
+
+// 64x64-bit carry-less multiplication.
+// The algorithm is based on the windowing method, for example as in:
+// Brent, R. P., Gaudry, P., Thomé, E., & Zimmermann, P. (2008, May), "Faster
+// multiplication in GF(2)[x]". In: International Algorithmic Number Theory
+// Symposium (pp. 153-166). Springer, Berlin, Heidelberg. In this implementation,
+// the last three bits are multiplied using schoolbook multiplication.
+void gf2x_mul_base_port(OUT uint64_t *c,
+ IN const uint64_t *a,
+ IN const uint64_t *b)
+{
+ uint64_t h = 0, l = 0, g1, g2, u[8];
+ const uint64_t w = 64;
+ const uint64_t s = 3;
+ const uint64_t a0 = a[0];
+ const uint64_t b0 = b[0];
+
+  // Multiplying 64 bits by 7 can result in an overflow of 3 bits.
+ // Therefore, these bits are masked out, and are treated in step 3.
+ const uint64_t b0m = b0 & MASK(61);
+
+ // Step 1: Calculate a multiplication table with 8 entries.
+ u[0] = 0;
+ u[1] = b0m;
+ u[2] = u[1] << 1;
+ u[3] = u[2] ^ b0m;
+ u[4] = u[2] << 1;
+ u[5] = u[4] ^ b0m;
+ u[6] = u[3] << 1;
+ u[7] = u[6] ^ b0m;
+
+ // Step 2: Multiply two elements in parallel in positions i, i+s
+ l = u[LSB3(a0)] ^ (u[LSB3(a0 >> 3)] << 3);
+ h = (u[LSB3(a0 >> 3)] >> 61);
+
+ for(size_t i = (2 * s); i < w; i += (2 * s)) {
+ const size_t i2 = (i + s);
+
+ g1 = u[LSB3(a0 >> i)];
+ g2 = u[LSB3(a0 >> i2)];
+
+ l ^= (g1 << i) ^ (g2 << i2);
+ h ^= (g1 >> (w - i)) ^ (g2 >> (w - i2));
+ }
+
+ // Step 3: Multiply the last three bits.
+ for(size_t i = 61; i < 64; i++) {
+ uint64_t mask = (-((b0 >> i) & 1));
+ l ^= ((a0 << i) & mask);
+ h ^= ((a0 >> (w - i)) & mask);
+ }
+
+ c[0] = l;
+ c[1] = h;
+}
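+
+// Worked example: for a[0] = 0x5 (x^2 + 1) and b[0] = 0x3 (x + 1), the
+// carry-less product is (x^2 + 1)(x + 1) = x^3 + x^2 + x + 1, so
+// c[0] = 0xf and c[1] = 0.
+//
+// A minimal schoolbook sketch (hypothetical helper, not used by BIKE;
+// handy only as a reference for cross-checking gf2x_mul_base_port):
+_INLINE_ void gf2x_mul_base_schoolbook(OUT uint64_t *c,
+                                       IN const uint64_t *a,
+                                       IN const uint64_t *b)
+{
+  // For every set bit i of b[0], XOR in a copy of a[0] shifted left by i
+  uint64_t l = a[0] & (-(b[0] & 1));
+  uint64_t h = 0;
+
+  for(size_t i = 1; i < 64; i++) {
+    const uint64_t mask = -((b[0] >> i) & 1);
+    l ^= (a[0] << i) & mask;
+    h ^= (a[0] >> (64 - i)) & mask;
+  }
+
+  c[0] = l;
+  c[1] = h;
+}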
+
+// c = a^2
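+// Squaring is linear over GF(2): (sum_i a_i x^i)^2 = sum_i a_i x^(2i), i.e.
+// the bits of "a" are spread out with a zero bit in between. For example,
+// (x^2 + x + 1)^2 = x^4 + x^2 + 1, so 0b111 squares to 0b10101.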
+void gf2x_sqr_port(OUT dbl_pad_r_t *c, IN const pad_r_t *a)
+{
+ const uint64_t *a64 = (const uint64_t *)a;
+ uint64_t * c64 = (uint64_t *)c;
+
+ for(size_t i = 0; i < R_QWORDS; i++) {
+ gf2x_mul_base_port(&c64[2 * i], &a64[i], &a64[i]);
+ }
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_vpclmul.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_vpclmul.c
new file mode 100644
index 0000000000..c321bf355f
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_vpclmul.c
@@ -0,0 +1,135 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#if defined(S2N_BIKE_R3_VPCLMUL)
+
+#include "gf2x_internal.h"
+
+#define AVX512_INTERNAL
+#include "x86_64_intrinsic.h"
+
+#define CLMUL(x, y, imm) _mm512_clmulepi64_epi128((x), (y), (imm))
+
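+// Lane-wise 128x128-bit carry-less multiplication: for each of the four
+// 128-bit lanes, the 256-bit product a_i * b_i is computed with three CLMULs
+// (2x2 Karatsuba); *l collects the low 128 bits of every lane and *h the
+// high 128 bits.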
+_INLINE_ void
+mul2_512(OUT __m512i *h, OUT __m512i *l, IN const __m512i a, IN const __m512i b)
+{
+ const __m512i mask_abq = SET_I64(6, 7, 4, 5, 2, 3, 0, 1);
+ const __m512i s1 = a ^ PERMX_I64(a, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m512i s2 = b ^ PERMX_I64(b, _MM_SHUFFLE(2, 3, 0, 1));
+
+ __m512i lq = CLMUL(a, b, 0x00);
+ __m512i hq = CLMUL(a, b, 0x11);
+ __m512i abq = lq ^ hq ^ CLMUL(s1, s2, 0x00);
+ abq = PERMXVAR_I64(mask_abq, abq);
+ *l = MXOR_I64(lq, 0xaa, lq, abq);
+ *h = MXOR_I64(hq, 0x55, hq, abq);
+}
+
+// 8x8 Karatsuba multiplication
+_INLINE_ void gf2x_mul8_512_int(OUT __m512i *zh,
+ OUT __m512i * zl,
+ IN const __m512i a,
+ IN const __m512i b)
+{
+ const __m512i mask0 = SET_I64(13, 12, 5, 4, 9, 8, 1, 0);
+ const __m512i mask1 = SET_I64(15, 14, 7, 6, 11, 10, 3, 2);
+ const __m512i mask2 = SET_I64(3, 2, 1, 0, 7, 6, 5, 4);
+ const __m512i mask3 = SET_I64(11, 10, 9, 8, 3, 2, 1, 0);
+ const __m512i mask4 = SET_I64(15, 14, 13, 12, 7, 6, 5, 4);
+ const __m512i mask_s1 = SET_I64(7, 6, 5, 4, 1, 0, 3, 2);
+ const __m512i mask_s2 = SET_I64(3, 2, 7, 6, 5, 4, 1, 0);
+
+ __m512i xl, xh, xabl, xabh, xab, xab1, xab2;
+ __m512i yl, yh, yabl, yabh, yab;
+ __m512i t[4];
+
+ // Calculate:
+  // AX1^AX3 || AX2^AX3 || AX0^AX2 || AX0^AX1
+  // BX1^BX3 || BX2^BX3 || BX0^BX2 || BX0^BX1
+ // Where (AX1^AX3 || AX0^AX2) stands for (AX1 || AX0)^(AX3 || AX2) = AY0^AY1
+ t[0] = PERMXVAR_I64(mask_s1, a) ^ PERMXVAR_I64(mask_s2, a);
+ t[1] = PERMXVAR_I64(mask_s1, b) ^ PERMXVAR_I64(mask_s2, b);
+
+ // Calculate:
+ // Don't care || AX1^AX3^AX0^AX2
+ // Don't care || BX1^BX3^BX0^BX2
+ t[2] = t[0] ^ VALIGN(t[0], t[0], 4);
+ t[3] = t[1] ^ VALIGN(t[1], t[1], 4);
+
+ mul2_512(&xh, &xl, a, b);
+ mul2_512(&xabh, &xabl, t[0], t[1]);
+ mul2_512(&yabh, &yabl, t[2], t[3]);
+
+ xab = xl ^ xh ^ PERMX2VAR_I64(xabl, mask0, xabh);
+ yl = PERMX2VAR_I64(xl, mask3, xh);
+ yh = PERMX2VAR_I64(xl, mask4, xh);
+ xab1 = VALIGN(xab, xab, 6);
+ xab2 = VALIGN(xab, xab, 2);
+ yl = MXOR_I64(yl, 0x3c, yl, xab1);
+ yh = MXOR_I64(yh, 0x3c, yh, xab2);
+
+ __m512i oxh = PERMX2VAR_I64(xabl, mask1, xabh);
+ __m512i oxl = VALIGN(oxh, oxh, 4);
+ yab = oxl ^ oxh ^ PERMX2VAR_I64(yabl, mask0, yabh);
+ yab = MXOR_I64(oxh, 0x3c, oxh, VALIGN(yab, yab, 2));
+ yab ^= yl ^ yh;
+
+ // Z0 (yl) + Z1 (yab) + Z2 (yh)
+ yab = PERMXVAR_I64(mask2, yab);
+ *zl = MXOR_I64(yl, 0xf0, yl, yab);
+ *zh = MXOR_I64(yh, 0x0f, yh, yab);
+}
+
+// 1024x1024-bit multiplication performed by the Karatsuba algorithm.
+// Here, a and b are considered as having 16 digits of 64 bits each.
+void gf2x_mul_base_vpclmul(OUT uint64_t *c,
+ IN const uint64_t *a,
+ IN const uint64_t *b)
+{
+ const __m512i a0 = LOAD(a);
+ const __m512i a1 = LOAD(&a[QWORDS_IN_ZMM]);
+ const __m512i b0 = LOAD(b);
+ const __m512i b1 = LOAD(&b[QWORDS_IN_ZMM]);
+
+ __m512i hi[2], lo[2], mi[2];
+
+ gf2x_mul8_512_int(&lo[1], &lo[0], a0, b0);
+ gf2x_mul8_512_int(&hi[1], &hi[0], a1, b1);
+ gf2x_mul8_512_int(&mi[1], &mi[0], a0 ^ a1, b0 ^ b1);
+
+ __m512i m = lo[1] ^ hi[0];
+
+ STORE(&c[0 * QWORDS_IN_ZMM], lo[0]);
+ STORE(&c[1 * QWORDS_IN_ZMM], mi[0] ^ lo[0] ^ m);
+ STORE(&c[2 * QWORDS_IN_ZMM], mi[1] ^ hi[1] ^ m);
+ STORE(&c[3 * QWORDS_IN_ZMM], hi[1]);
+}
+
+void gf2x_sqr_vpclmul(OUT dbl_pad_r_t *c, IN const pad_r_t *a)
+{
+ __m512i va, vm, vr0, vr1;
+
+ const uint64_t *a64 = (const uint64_t *)a;
+ uint64_t * c64 = (uint64_t *)c;
+
+ vm = SET_I64(7, 3, 6, 2, 5, 1, 4, 0);
+
+ for(size_t i = 0; i < (R_ZMM * QWORDS_IN_ZMM); i += QWORDS_IN_ZMM) {
+ va = LOAD(&a64[i]);
+ va = PERMXVAR_I64(vm, va);
+
+ vr0 = CLMUL(va, va, 0x00);
+ vr1 = CLMUL(va, va, 0x11);
+
+ STORE(&c64[i * 2], vr0);
+ STORE(&c64[i * 2 + QWORDS_IN_ZMM], vr1);
+ }
+}
+
+#endif
+
+typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_portable.c
new file mode 100644
index 0000000000..187042d44c
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_portable.c
@@ -0,0 +1,103 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#include <assert.h>
+
+#include "cleanup.h"
+#include "gf2x_internal.h"
+
+#define PORTABLE_INTERNAL
+#include "x86_64_intrinsic.h"
+
+void karatzuba_add1_port(OUT uint64_t *alah,
+ OUT uint64_t *blbh,
+ IN const uint64_t *a,
+ IN const uint64_t *b,
+ IN const size_t qwords_len)
+{
+ assert(qwords_len % REG_QWORDS == 0);
+
+ REG_T va0, va1, vb0, vb1;
+
+ for(size_t i = 0; i < qwords_len; i += REG_QWORDS) {
+ va0 = LOAD(&a[i]);
+ va1 = LOAD(&a[i + qwords_len]);
+ vb0 = LOAD(&b[i]);
+ vb1 = LOAD(&b[i + qwords_len]);
+
+ STORE(&alah[i], va0 ^ va1);
+ STORE(&blbh[i], vb0 ^ vb1);
+ }
+}
+
+void karatzuba_add2_port(OUT uint64_t *z,
+ IN const uint64_t *x,
+ IN const uint64_t *y,
+ IN const size_t qwords_len)
+{
+ assert(qwords_len % REG_QWORDS == 0);
+
+ REG_T vx, vy;
+
+ for(size_t i = 0; i < qwords_len; i += REG_QWORDS) {
+ vx = LOAD(&x[i]);
+ vy = LOAD(&y[i]);
+
+ STORE(&z[i], vx ^ vy);
+ }
+}
+
+void karatzuba_add3_port(OUT uint64_t *c,
+ IN const uint64_t *mid,
+ IN const size_t qwords_len)
+{
+ assert(qwords_len % REG_QWORDS == 0);
+
+ REG_T vr0, vr1, vr2, vr3, vt;
+
+ uint64_t *c0 = c;
+ uint64_t *c1 = &c[qwords_len];
+ uint64_t *c2 = &c[2 * qwords_len];
+ uint64_t *c3 = &c[3 * qwords_len];
+
+ for(size_t i = 0; i < qwords_len; i += REG_QWORDS) {
+ vr0 = LOAD(&c0[i]);
+ vr1 = LOAD(&c1[i]);
+ vr2 = LOAD(&c2[i]);
+ vr3 = LOAD(&c3[i]);
+ vt = LOAD(&mid[i]);
+
+ STORE(&c1[i], vt ^ vr0 ^ vr1);
+ STORE(&c2[i], vt ^ vr2 ^ vr3);
+ }
+}
+
+// c = a mod (x^r - 1)
+void gf2x_red_port(OUT pad_r_t *c, IN const dbl_pad_r_t *a)
+{
+ const uint64_t *a64 = (const uint64_t *)a;
+ uint64_t * c64 = (uint64_t *)c;
+
+ for(size_t i = 0; i < R_QWORDS; i += REG_QWORDS) {
+ REG_T vt0 = LOAD(&a64[i]);
+ REG_T vt1 = LOAD(&a64[i + R_QWORDS]);
+ REG_T vt2 = LOAD(&a64[i + R_QWORDS - 1]);
+
+ vt1 = SLLI_I64(vt1, LAST_R_QWORD_TRAIL);
+ vt2 = SRLI_I64(vt2, LAST_R_QWORD_LEAD);
+
+ vt0 ^= (vt1 | vt2);
+
+ STORE(&c64[i], vt0);
+ }
+
+ c64[R_QWORDS - 1] &= LAST_R_QWORD_MASK;
+
+ // Clean the secrets from the upper part of c
+ secure_clean((uint8_t *)&c64[R_QWORDS],
+ (R_PADDED_QWORDS - R_QWORDS) * sizeof(uint64_t));
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.c
new file mode 100644
index 0000000000..a76a31ef87
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.c
@@ -0,0 +1,170 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#include <assert.h>
+
+#include "sampling.h"
+#include "sampling_internal.h"
+
+// The SIMD implementation of the is_new function requires the size of wlist
+// to be a multiple of the number of DWORDS in a SIMD register (REG_DWORDS).
+// The function is used for generating both DV and T1 random numbers, so we
+// define two separate macros.
+#define AVX512_REG_DWORDS (16)
+#define WLIST_SIZE_ADJUSTED_D \
+ (AVX512_REG_DWORDS * DIVIDE_AND_CEIL(DV, AVX512_REG_DWORDS))
+#define WLIST_SIZE_ADJUSTED_T \
+ (AVX512_REG_DWORDS * DIVIDE_AND_CEIL(T1, AVX512_REG_DWORDS))
+
+// Returns the number of bits needed to represent val, i.e. floor(log2(val)) + 1 for val > 0
+_INLINE_ uint8_t bit_scan_reverse_vartime(IN uint64_t val)
+{
+ // index is always smaller than 64
+ uint8_t index = 0;
+
+ while(val != 0) {
+ val >>= 1;
+ index++;
+ }
+
+ return index;
+}
+
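+// Returns a uniformly distributed (pseudo) random number in [0, len) by
+// rejection sampling: every draw is masked down to
+// bits = floor(log2(len)) + 1 bits, so it is accepted with probability
+// len / 2^bits >= 1/2.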
+_INLINE_ ret_t get_rand_mod_len(OUT uint32_t * rand_pos,
+ IN const uint32_t len,
+ IN OUT aes_ctr_prf_state_t *prf_state)
+{
+ const uint64_t mask = MASK(bit_scan_reverse_vartime(len));
+
+ do {
+    // Generate a 32-bit (pseudo) random value.
+ // This can be optimized to take only 16 bits.
+ POSIX_GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos)));
+
+ // Mask relevant bits only
+ (*rand_pos) &= mask;
+
+ // Break if a number that is smaller than len is found
+ if((*rand_pos) < len) {
+ break;
+ }
+
+ } while(1 == 1);
+
+ return SUCCESS;
+}
+
+_INLINE_ void make_odd_weight(IN OUT r_t *r)
+{
+ if(((r_bits_vector_weight(r) % 2) == 1)) {
+ // Already odd
+ return;
+ }
+
+ r->raw[0] ^= 1;
+}
+
+// Returns an array of r pseudorandom bits.
+// No restrictions exist for the top or bottom bits.
+// If an odd weight of r is required, set must_be_odd = MUST_BE_ODD.
+// The function uses the provided prf context.
+ret_t sample_uniform_r_bits_with_fixed_prf_context(
+ OUT r_t *r,
+ IN OUT aes_ctr_prf_state_t *prf_state,
+ IN const must_be_odd_t must_be_odd)
+{
+ // Generate random data
+ POSIX_GUARD(aes_ctr_prf(r->raw, prf_state, R_BYTES));
+
+ // Mask upper bits of the MSByte
+ r->raw[R_BYTES - 1] &= MASK(R_BITS + 8 - (R_BYTES * 8));
+
+ if(must_be_odd == MUST_BE_ODD) {
+ make_odd_weight(r);
+ }
+
+ return SUCCESS;
+}
+
+ret_t generate_indices_mod_z(OUT idx_t * out,
+ IN const size_t num_indices,
+ IN const size_t z,
+ IN OUT aes_ctr_prf_state_t *prf_state,
+ IN const sampling_ctx *ctx)
+{
+ size_t ctr = 0;
+
+ // Generate num_indices unique (pseudo) random numbers modulo z
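+  // Note that ctr advances only when ctx->is_new() confirms that out[ctr]
+  // does not repeat an earlier entry; otherwise the slot is simply
+  // overwritten by the next draw.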
+ do {
+ POSIX_GUARD(get_rand_mod_len(&out[ctr], z, prf_state));
+ ctr += ctx->is_new(out, ctr);
+ } while(ctr < num_indices);
+
+ return SUCCESS;
+}
+
+// Returns an array of r pseudorandom bits.
+// No restrictions exist for the top or bottom bits.
+// If an odd weight of r is required, set must_be_odd = MUST_BE_ODD.
+ret_t sample_uniform_r_bits(OUT r_t *r,
+ IN const seed_t * seed,
+ IN const must_be_odd_t must_be_odd)
+{
+ // For the seedexpander
+ DEFER_CLEANUP(aes_ctr_prf_state_t prf_state = {0}, aes_ctr_prf_state_cleanup);
+
+ POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed));
+
+ POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd));
+
+ return SUCCESS;
+}
+
+ret_t generate_sparse_rep(OUT pad_r_t *r,
+ OUT idx_t *wlist,
+ IN OUT aes_ctr_prf_state_t *prf_state)
+{
+ // Initialize the sampling context
+ sampling_ctx ctx;
+ sampling_ctx_init(&ctx);
+
+ idx_t wlist_temp[WLIST_SIZE_ADJUSTED_D] = {0};
+
+ POSIX_GUARD(generate_indices_mod_z(wlist_temp, DV, R_BITS, prf_state, &ctx));
+
+ bike_memcpy(wlist, wlist_temp, DV * sizeof(idx_t));
+ ctx.secure_set_bits(r, 0, wlist, DV);
+
+ return SUCCESS;
+}
+
+ret_t generate_error_vector(OUT pad_e_t *e, IN const seed_t *seed)
+{
+ DEFER_CLEANUP(aes_ctr_prf_state_t prf_state = {0}, aes_ctr_prf_state_cleanup);
+
+ POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed));
+
+ // Initialize the sampling context
+ sampling_ctx ctx;
+ sampling_ctx_init(&ctx);
+
+ idx_t wlist[WLIST_SIZE_ADJUSTED_T] = {0};
+ POSIX_GUARD(generate_indices_mod_z(wlist, T1, N_BITS, &prf_state, &ctx));
+
+ // (e0, e1) hold bits 0..R_BITS-1 and R_BITS..2*R_BITS-1 of the error, resp.
+ ctx.secure_set_bits(&e->val[0], 0, wlist, T1);
+ ctx.secure_set_bits(&e->val[1], R_BITS, wlist, T1);
+
+ // Clean the padding of the elements
+ PE0_RAW(e)[R_BYTES - 1] &= LAST_R_BYTE_MASK;
+ PE1_RAW(e)[R_BYTES - 1] &= LAST_R_BYTE_MASK;
+ bike_memset(&PE0_RAW(e)[R_BYTES], 0, R_PADDED_BYTES - R_BYTES);
+ bike_memset(&PE1_RAW(e)[R_BYTES], 0, R_PADDED_BYTES - R_BYTES);
+
+ return SUCCESS;
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.h
new file mode 100644
index 0000000000..a9d50c9bc2
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.h
@@ -0,0 +1,40 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+#include <stdlib.h>
+#include "aes_ctr_prf.h"
+#include "pq-crypto/s2n_pq_random.h"
+#include "utils/s2n_result.h"
+#include "utilities.h"
+
+typedef enum
+{
+ NO_RESTRICTION = 0,
+ MUST_BE_ODD = 1
+} must_be_odd_t;
+
+_INLINE_ ret_t get_seeds(OUT seeds_t *seeds) {
+ if(s2n_result_is_ok(s2n_get_random_bytes(seeds->seed[0].raw, sizeof(seeds_t)))) {
+ return SUCCESS;
+ } else {
+ BIKE_ERROR(E_FAIL_TO_GET_SEED);
+ }
+}
+
+// Returns an array of r pseudorandom bits. If an odd
+// weight of r is required, set must_be_odd to MUST_BE_ODD.
+ret_t sample_uniform_r_bits(OUT r_t *r,
+ IN const seed_t *seed,
+ IN must_be_odd_t must_be_odd);
+
+ret_t generate_sparse_rep(OUT pad_r_t *r,
+ OUT idx_t *wlist,
+ IN OUT aes_ctr_prf_state_t *prf_state);
+
+ret_t generate_error_vector(OUT pad_e_t *e, IN const seed_t *seed);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx2.c
new file mode 100644
index 0000000000..c23be2e86e
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx2.c
@@ -0,0 +1,123 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#if defined(S2N_BIKE_R3_AVX2)
+
+#include <assert.h>
+
+#include "sampling_internal.h"
+
+#define AVX2_INTERNAL
+#include "x86_64_intrinsic.h"
+
+// For improved performance, we process the data in NUM_YMMS registers in parallel.
+#define NUM_YMMS (4)
+#define YMMS_QWORDS (QWORDS_IN_YMM * NUM_YMMS)
+
+void secure_set_bits_avx2(OUT pad_r_t * r,
+ IN const size_t first_pos,
+ IN const idx_t *wlist,
+ IN const size_t w_size)
+{
+ // The function assumes that the size of r is a multiple
+ // of the cumulative size of used YMM registers.
+ assert((sizeof(*r) / sizeof(uint64_t)) % YMMS_QWORDS == 0);
+
+ // va vectors hold the bits of the output array "r"
+ // va_pos_qw vectors hold the qw position indices of "r"
+ // The algorithm works as follows:
+ // 1. Initialize va_pos_qw with starting positions of qw's of "r"
+ // va_pos_qw = (3, 2, 1, 0);
+ // 2. While the size of "r" is not exceeded:
+ // 3. For each w in wlist:
+ // 4. Compare the pos_qw of w with positions in va_pos_qw
+ // and for the position which is equal set the appropriate
+ // bit in va vector.
+ // 5. Set va_pos_qw to the next qw positions of "r"
+ __m256i va[NUM_YMMS], va_pos_qw[NUM_YMMS], va_mask;
+ __m256i w_pos_qw, w_pos_bit;
+ __m256i one, inc;
+
+ uint64_t *r64 = (uint64_t *)r;
+
+ one = SET1_I64(1);
+ inc = SET1_I64(QWORDS_IN_YMM);
+
+ // 1. Initialize
+ va_pos_qw[0] = SET_I64(3, 2, 1, 0);
+ for(size_t i = 1; i < NUM_YMMS; i++) {
+ va_pos_qw[i] = ADD_I64(va_pos_qw[i - 1], inc);
+ }
+
+ // va_pos_qw vectors hold qw positions 0 .. (NUM_YMMS * QWORDS_IN_YMM - 1)
+ // Therefore, we set the increment vector inc such that by adding it to
+  // va_pos_qw vectors, they hold the next YMMS_QWORDS qw positions.
+ inc = SET1_I64(YMMS_QWORDS);
+
+ for(size_t i = 0; i < (sizeof(*r) / sizeof(uint64_t)); i += YMMS_QWORDS) {
+ for(size_t va_iter = 0; va_iter < NUM_YMMS; va_iter++) {
+ va[va_iter] = SET_ZERO;
+ }
+
+ for(size_t w_iter = 0; w_iter < w_size; w_iter++) {
+ int32_t w = wlist[w_iter] - first_pos;
+ w_pos_qw = SET1_I64(w >> 6);
+ w_pos_bit = SLLI_I64(one, w & MASK(6));
+
+ // 4. Compare the positions in va_pos_qw with w_pos_qw
+ // and set the appropriate bit in va
+ for(size_t va_iter = 0; va_iter < NUM_YMMS; va_iter++) {
+ va_mask = CMPEQ_I64(va_pos_qw[va_iter], w_pos_qw);
+ va[va_iter] |= (va_mask & w_pos_bit);
+ }
+ }
+
+ // 5. Set the va_pos_qw to the next qw positions of r
+ // and store the previously computed data in r
+ for(size_t va_iter = 0; va_iter < NUM_YMMS; va_iter++) {
+ STORE(&r64[i + (va_iter * QWORDS_IN_YMM)], va[va_iter]);
+ va_pos_qw[va_iter] = ADD_I64(va_pos_qw[va_iter], inc);
+ }
+ }
+}
+
+int is_new_avx2(IN const idx_t *wlist, IN const size_t ctr)
+{
+ bike_static_assert((sizeof(idx_t) == sizeof(uint32_t)), idx_t_is_not_uint32_t);
+
+ REG_T idx_ctr = SET1_I32(wlist[ctr]);
+
+ for(size_t i = 0; i < ctr; i += REG_DWORDS) {
+ // Comparisons are done with SIMD instructions with each SIMD register
+ // containing REG_DWORDS elements. We compare registers element-wise:
+ // idx_ctr = {8 repetitions of wlist[ctr]}, with
+ // idx_cur = {8 consecutive elements from wlist}.
+ // In the last iteration we consider wlist elements only up to ctr.
+
+ REG_T idx_cur = LOAD(&wlist[i]);
+ REG_T cmp_res = CMPEQ_I32(idx_ctr, idx_cur);
+ uint32_t check = MOVEMASK(cmp_res);
+
+ // Handle the last iteration by appropriate masking.
+ if(ctr < (i + REG_DWORDS)) {
+      // The AVX2 MOVEMASK instruction produces a 32-bit mask from the most
+      // significant bit of every byte in a vector register. Since we compare
+      // idx_t elements, not bytes, we multiply by sizeof(idx_t).
+ check &= MASK((ctr - i) * sizeof(idx_t));
+ }
+
+ if(check != 0) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+#endif
+
+typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx512.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx512.c
new file mode 100644
index 0000000000..6cab4cffea
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx512.c
@@ -0,0 +1,123 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#if defined(S2N_BIKE_R3_AVX512)
+
+#include <assert.h>
+
+#include "sampling_internal.h"
+
+#define AVX512_INTERNAL
+#include "x86_64_intrinsic.h"
+
+// For improved performance, we process the data in NUM_ZMMS registers in parallel.
+#define NUM_ZMMS (8)
+#define ZMMS_QWORDS (QWORDS_IN_ZMM * NUM_ZMMS)
+
+void secure_set_bits_avx512(OUT pad_r_t * r,
+ IN const size_t first_pos,
+ IN const idx_t *wlist,
+ IN const size_t w_size)
+{
+ // The function assumes that the size of r is a multiple
+ // of the cumulative size of used ZMM registers.
+ assert((sizeof(*r) / sizeof(uint64_t)) % ZMMS_QWORDS == 0);
+
+ // va vectors hold the bits of the output array "r"
+ // va_pos_qw vectors hold the qw position indices of "r"
+ // The algorithm works as follows:
+ // 1. Initialize va_pos_qw with starting positions of qw's of "r"
+ // va_pos_qw = (7, 6, 5, 4, 3, 2, 1, 0);
+ // 2. While the size of "r" is not exceeded:
+ // 3. For each w in wlist:
+  // 4. Compare the pos_qw of w with the positions in va_pos_qw
+  //    and, for the position that matches, set the appropriate
+  //    bit in the va vector.
+ // 5. Set va_pos_qw to the next qw positions of "r"
+ __m512i va[NUM_ZMMS], va_pos_qw[NUM_ZMMS];
+ __m512i w_pos_qw, w_pos_bit, one, inc;
+ __mmask8 va_mask;
+
+ uint64_t *r64 = (uint64_t *)r;
+
+ one = SET1_I64(1);
+ inc = SET1_I64(QWORDS_IN_ZMM);
+
+ // 1. Initialize
+ va_pos_qw[0] = SET_I64(7, 6, 5, 4, 3, 2, 1, 0);
+ for(size_t i = 1; i < NUM_ZMMS; i++) {
+ va_pos_qw[i] = ADD_I64(va_pos_qw[i - 1], inc);
+ }
+
+ // va_pos_qw vectors hold qw positions 0 .. (NUM_ZMMS * QWORDS_IN_ZMM - 1)
+ // Therefore, we set the increment vector inc such that by adding it to
+  // va_pos_qw vectors, they hold the next ZMMS_QWORDS qw positions.
+ inc = SET1_I64(ZMMS_QWORDS);
+
+ for(size_t i = 0; i < (sizeof(*r) / sizeof(uint64_t)); i += ZMMS_QWORDS) {
+ for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) {
+ va[va_iter] = SET_ZERO;
+ }
+
+ for(size_t w_iter = 0; w_iter < w_size; w_iter++) {
+ int32_t w = wlist[w_iter] - first_pos;
+ w_pos_qw = SET1_I64(w >> 6);
+#if (defined(__GNUC__) && ((__GNUC__ == 6) || (__GNUC__ == 5)) && !defined(__clang__)) || (defined(__clang__) && __clang_major__ == 3 && __clang_minor__ == 9)
+      // Workaround for gcc-5, gcc-6, and clang-3.9, which do not allow the
+      // second argument of SLLI to be a non-immediate value.
+ __m512i temp = SET1_I64(w & MASK(6));
+ w_pos_bit = SLLV_I64(one, temp);
+#else
+ w_pos_bit = SLLI_I64(one, w & MASK(6));
+#endif
+
+ // 4. Compare the positions in va_pos_qw with w_pos_qw
+ // and set the appropriate bit in va
+ for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) {
+ va_mask = CMPMEQ_I64(va_pos_qw[va_iter], w_pos_qw);
+ va[va_iter] = MOR_I64(va[va_iter], va_mask, va[va_iter], w_pos_bit);
+ }
+ }
+
+ // 5. Set the va_pos_qw to the next qw positions of r
+ // and store the previously computed data in r
+ for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) {
+ STORE(&r64[i + (va_iter * QWORDS_IN_ZMM)], va[va_iter]);
+ va_pos_qw[va_iter] = ADD_I64(va_pos_qw[va_iter], inc);
+ }
+ }
+}
+
+int is_new_avx512(IN const idx_t *wlist, IN const size_t ctr)
+{
+ bike_static_assert((sizeof(idx_t) == sizeof(uint32_t)), idx_t_is_not_uint32_t);
+
+ REG_T idx_ctr = SET1_I32(wlist[ctr]);
+
+ for(size_t i = 0; i < ctr; i += REG_DWORDS) {
+ // Comparisons are done with SIMD instructions with each SIMD register
+ // containing REG_DWORDS elements. We compare registers element-wise:
+    // idx_ctr = {REG_DWORDS repetitions of wlist[ctr]}, with
+    // idx_cur = {REG_DWORDS consecutive elements from wlist}.
+ // In the last iteration we consider wlist elements only up to ctr.
+
+ REG_T idx_cur = LOAD(&wlist[i]);
+
+ uint16_t mask = (ctr < (i + REG_DWORDS)) ? MASK(ctr - i) : 0xffff;
+ uint16_t check = MCMPMEQ_I32(mask, idx_ctr, idx_cur);
+
+ if(check != 0) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+#endif
+
+typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_internal.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_internal.h
new file mode 100644
index 0000000000..3fd68354f2
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_internal.h
@@ -0,0 +1,66 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+#include "pq-crypto/s2n_pq.h"
+#include "defs.h"
+#include "types.h"
+
+void secure_set_bits_port(OUT pad_r_t *r,
+ IN size_t first_pos,
+ IN const idx_t *wlist,
+ IN size_t w_size);
+
+// Compares wlist[ctr] to wlist[i] for all i < ctr.
+// Returns 0 if wlist[ctr] repeats an earlier entry, and 1 otherwise.
+int is_new_port(IN const idx_t *wlist, IN const size_t ctr);
+
+#if defined(S2N_BIKE_R3_AVX2)
+void secure_set_bits_avx2(OUT pad_r_t *r,
+ IN size_t first_pos,
+ IN const idx_t *wlist,
+ IN size_t w_size);
+
+int is_new_avx2(IN const idx_t *wlist, IN const size_t ctr);
+#endif
+
+#if defined(S2N_BIKE_R3_AVX512)
+void secure_set_bits_avx512(OUT pad_r_t *r,
+ IN size_t first_pos,
+ IN const idx_t *wlist,
+ IN size_t w_size);
+int is_new_avx512(IN const idx_t *wlist, IN const size_t ctr);
+#endif
+
+typedef struct sampling_ctx_st {
+ void (*secure_set_bits)(OUT pad_r_t *r,
+ IN size_t first_pos,
+ IN const idx_t *wlist,
+ IN size_t w_size);
+ int (*is_new)(IN const idx_t *wlist, IN const size_t ctr);
+} sampling_ctx;
+
+_INLINE_ void sampling_ctx_init(sampling_ctx *ctx)
+{
+#if defined(S2N_BIKE_R3_AVX512)
+ if(s2n_bike_r3_is_avx512_enabled()) {
+ ctx->secure_set_bits = secure_set_bits_avx512;
+ ctx->is_new = is_new_avx512;
+ } else
+#endif
+#if defined(S2N_BIKE_R3_AVX2)
+ if(s2n_bike_r3_is_avx2_enabled()) {
+ ctx->secure_set_bits = secure_set_bits_avx2;
+ ctx->is_new = is_new_avx2;
+ } else
+#endif
+ {
+ ctx->secure_set_bits = secure_set_bits_port;
+ ctx->is_new = is_new_port;
+ }
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_portable.c
new file mode 100644
index 0000000000..b670730f0a
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_portable.c
@@ -0,0 +1,60 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#include <assert.h>
+
+#include "sampling_internal.h"
+#include "utilities.h"
+
+#define MAX_WLIST_SIZE (T1 > DV ? T1 : DV)
+
+void secure_set_bits_port(OUT pad_r_t * r,
+ IN const size_t first_pos,
+ IN const idx_t *wlist,
+ IN const size_t w_size)
+{
+ assert(w_size <= MAX_WLIST_SIZE);
+
+ // Ideally we would cast r->val directly, but it is not guaranteed to be
+ // aligned the same way as the entire pad_r_t structure. Thus, we assert that
+ // val is located at the beginning of r and cast r itself.
+ bike_static_assert(offsetof(pad_r_t, val) == 0, val_wrong_pos_in_pad_r_t);
+ uint64_t *a64 = (uint64_t *)r;
+ uint64_t val, mask;
+
+ // The size of wlist can be either DV or T1, so we size the arrays for max(DV, T1)
+ size_t pos_qw[MAX_WLIST_SIZE];
+ size_t pos_bit[MAX_WLIST_SIZE];
+
+ // Identify the QW position of every value, and the bit position inside this QW.
+ for(size_t i = 0; i < w_size; i++) {
+ int32_t w = wlist[i] - first_pos;
+ pos_qw[i] = w >> 6;
+ pos_bit[i] = BIT(w & MASK(6));
+ }
+
+ // Fill each QW in constant time
+ for(size_t i = 0; i < (sizeof(*r) / sizeof(uint64_t)); i++) {
+ val = 0;
+ for(size_t j = 0; j < w_size; j++) {
+ mask = (-1ULL) + (!secure_cmp32(pos_qw[j], i));
+ val |= (pos_bit[j] & mask);
+ }
+ a64[i] = val;
+ }
+}
+
+int is_new_port(IN const idx_t *wlist, IN const size_t ctr)
+{
+ for(size_t i = 0; i < ctr; i++) {
+ if(wlist[i] == wlist[ctr]) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
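The mask expression above relies on (-1ULL) + x being all-ones when x == 0 and
zero when x == 1. A standalone model of the trick, substituting a branchless
64-bit equality for the assembly-backed secure_cmp32 (eq64 is an illustrative
stand-in, not the original helper):

#include <stdint.h>
#include <stdio.h>

/* Branchless equality: 1 if a == b, 0 otherwise. */
static uint64_t eq64(uint64_t a, uint64_t b)
{
  uint64_t x = a ^ b; /* 0 iff equal */
  return (uint64_t)1 ^ ((x | (0 - x)) >> 63);
}

int main(void)
{
  for(uint64_t i = 0; i < 4; i++) {
    /* All-ones when i == 2, all-zeros otherwise, with no branch. */
    uint64_t mask = (-1ULL) + !eq64(i, 2);
    printf("%llu: %016llx\n", (unsigned long long)i,
           (unsigned long long)mask);
  }

  return 0;
}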
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sha.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sha.h
new file mode 100644
index 0000000000..1857d6e638
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sha.h
@@ -0,0 +1,43 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+#include "cleanup.h"
+#include "error.h"
+#include "types.h"
+#include "utilities.h"
+
+#include <openssl/sha.h>
+
+#define SHA384_DGST_BYTES 48ULL
+#define SHA384_DGST_QWORDS (SHA384_DGST_BYTES / 8)
+
+#define SHA512_DGST_BYTES 64ULL
+#define SHA512_DGST_QWORDS (SHA512_DGST_BYTES / 8)
+
+typedef struct sha384_dgst_s {
+ union {
+ uint8_t raw[SHA384_DGST_BYTES];
+ uint64_t qw[SHA384_DGST_QWORDS];
+ } u;
+} sha384_dgst_t;
+bike_static_assert(sizeof(sha384_dgst_t) == SHA384_DGST_BYTES, sha384_dgst_size);
+
+typedef sha384_dgst_t sha_dgst_t;
+CLEANUP_FUNC(sha_dgst, sha_dgst_t)
+
+_INLINE_ ret_t sha(OUT sha_dgst_t * dgst,
+ IN const uint32_t byte_len,
+ IN const uint8_t *msg)
+{
+ if(SHA384(msg, byte_len, dgst->u.raw) != NULL) {
+ return SUCCESS;
+ }
+
+ return FAIL;
+}
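The sha() wrapper above is a thin shim over OpenSSL's one-shot SHA384(), which
returns NULL on failure. A self-contained sketch of the same call outside the
bike_r3 type system (SHA384_DIGEST_LENGTH is OpenSSL's constant for the
48-byte digest):

#include <stdio.h>
#include <string.h>

#include <openssl/sha.h>

int main(void)
{
  const unsigned char msg[] = "test message";
  unsigned char dgst[SHA384_DIGEST_LENGTH];

  if(SHA384(msg, strlen((const char *)msg), dgst) == NULL) {
    return 1; /* corresponds to the FAIL path of the wrapper */
  }

  for(size_t i = 0; i < sizeof(dgst); i++) {
    printf("%02x", dgst[i]);
  }
  printf("\n");

  return 0;
}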
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/types.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/types.h
new file mode 100644
index 0000000000..436a584f3e
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/types.h
@@ -0,0 +1,120 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "bike_defs.h"
+#include "error.h"
+
+typedef struct uint128_s {
+ union {
+ uint8_t bytes[16]; // NOLINT
+ uint32_t dw[4]; // NOLINT
+ uint64_t qw[2]; // NOLINT
+ } u;
+} uint128_t;
+
+// Make sure no compiler optimizations.
+#pragma pack(push, 1)
+
+typedef struct seed_s {
+ uint8_t raw[SEED_BYTES];
+} seed_t;
+
+typedef struct seeds_s {
+ seed_t seed[NUM_OF_SEEDS];
+} seeds_t;
+
+typedef struct r_s {
+ uint8_t raw[R_BYTES];
+} r_t;
+
+typedef struct m_s {
+ uint8_t raw[M_BYTES];
+} m_t;
+
+typedef struct e_s {
+ r_t val[N0];
+} e_t;
+
+#define E0_RAW(e) ((e)->val[0].raw)
+#define E1_RAW(e) ((e)->val[1].raw)
+
+typedef struct ct_s {
+ r_t c0;
+ m_t c1;
+} ct_t;
+
+typedef r_t pk_t;
+
+typedef struct ss_st {
+ uint8_t raw[SS_BYTES];
+} ss_t;
+
+typedef uint32_t idx_t;
+
+typedef struct compressed_idx_d_s {
+ idx_t val[DV];
+} compressed_idx_d_t;
+
+typedef compressed_idx_d_t compressed_idx_d_ar_t[N0];
+
+// The secret key holds both representations, to avoid
+// the compression in Decaps.
+typedef struct sk_s {
+ compressed_idx_d_ar_t wlist;
+ r_t bin[N0];
+ pk_t pk;
+ m_t sigma;
+} sk_t;
+
+typedef ALIGN(sizeof(idx_t)) sk_t aligned_sk_t;
+
+// Pad r to the next block
+typedef struct pad_r_s {
+ r_t val;
+ uint8_t pad[R_PADDED_BYTES - sizeof(r_t)];
+} ALIGN(ALIGN_BYTES) pad_r_t;
+
+// Double padded r, required for multiplication and squaring
+typedef struct dbl_pad_r_s {
+ uint8_t raw[2 * R_PADDED_BYTES];
+} ALIGN(ALIGN_BYTES) dbl_pad_r_t;
+
+typedef struct pad_e_s {
+ pad_r_t val[N0];
+} ALIGN(ALIGN_BYTES) pad_e_t;
+
+#define PE0_RAW(e) ((e)->val[0].val.raw)
+#define PE1_RAW(e) ((e)->val[1].val.raw)
+
+typedef struct func_k_s {
+ m_t m;
+ r_t c0;
+ m_t c1;
+} func_k_t;
+
+// For a faster rotate we keep the syndrome in triplicate (3 consecutive copies)
+typedef struct syndrome_s {
+ uint64_t qw[3 * R_QWORDS];
+} ALIGN(ALIGN_BYTES) syndrome_t;
+
+typedef struct upc_slice_s {
+ union {
+ pad_r_t r;
+ uint64_t qw[sizeof(pad_r_t) / sizeof(uint64_t)];
+ } ALIGN(ALIGN_BYTES) u;
+} ALIGN(ALIGN_BYTES) upc_slice_t;
+
+typedef struct upc_s {
+ upc_slice_t slice[SLICES];
+} upc_t;
+
+#pragma pack(pop)
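Because of the #pragma pack(1) region, the wire-format structs above contain no
hidden padding. A sketch of the compile-time checks this layout is meant to
guarantee; the size constants come from bike_defs.h and these particular
asserts are illustrative, not part of the original header:

#include <assert.h>
#include <stddef.h>

#include "types.h"

static_assert(sizeof(ct_t) == R_BYTES + M_BYTES, "ct_t must be packed");
static_assert(offsetof(pad_r_t, val) == 0, "val must lead pad_r_t");
/* Assumes R_PADDED_BYTES is a multiple of ALIGN_BYTES, so the ALIGN
 * attribute does not grow the struct. */
static_assert(sizeof(pad_r_t) == R_PADDED_BYTES, "pad_r_t covers the padded size");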
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.c
new file mode 100644
index 0000000000..0c6ad3ea0f
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.c
@@ -0,0 +1,24 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#include <inttypes.h>
+
+#include "utilities.h"
+
+#define BITS_IN_QWORD 64ULL
+#define BITS_IN_BYTE 8ULL
+
+uint64_t r_bits_vector_weight(IN const r_t *in)
+{
+ uint64_t acc = 0;
+ for(size_t i = 0; i < (R_BYTES - 1); i++) {
+ acc += __builtin_popcount(in->raw[i]);
+ }
+
+ acc += __builtin_popcount(in->raw[R_BYTES - 1] & LAST_R_BYTE_MASK);
+ return acc;
+}
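The final byte is masked because R_BITS does not fill it completely. A
standalone model with the bit width as an explicit parameter, where the derived
last_mask plays the role of LAST_R_BYTE_MASK (bit_vector_weight is an
illustrative name; n_bits is assumed to be at least 1):

#include <stddef.h>
#include <stdint.h>

static uint64_t bit_vector_weight(const uint8_t *v, size_t n_bits)
{
  size_t   n_bytes   = (n_bits + 7) / 8;
  uint8_t  last_mask = (uint8_t)((1u << (((n_bits - 1) % 8) + 1)) - 1);
  uint64_t acc       = 0;

  for(size_t i = 0; i < (n_bytes - 1); i++) {
    acc += __builtin_popcount(v[i]);
  }

  /* Count only the valid low bits of the last byte. */
  acc += __builtin_popcount(v[n_bytes - 1] & last_mask);
  return acc;
}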
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.h
new file mode 100644
index 0000000000..f544990a1a
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.h
@@ -0,0 +1,139 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+#pragma once
+
+// For memcpy and memset
+#include <string.h>
+
+#include "types.h"
+
+uint64_t r_bits_vector_weight(IN const r_t *in);
+
+// "VALUE_BARRIER returns |a|, but prevents GCC and Clang from reasoning about
+// the returned value. This is used to mitigate compilers undoing constant-time
+// code, until we can express our requirements directly in the language.
+// Note the compiler is aware that |VALUE_BARRIER| has no side effects and
+// always has the same output for a given input. This allows it to eliminate
+// dead code, move computations across loops, and vectorize."
+// See:
+// https://github.com/google/boringssl/commit/92b7c89e6e8ba82924b57153bea68241cc45f658
+#if(defined(__GNUC__) || defined(__clang__))
+# define VALUE_BARRIER(name, type) \
+ _INLINE_ type name##_barrier(type a) \
+ { \
+ __asm__("" : "+r"(a) : /* no inputs */); \
+ return a; \
+ }
+#else
+# define VALUE_BARRIER(name, type) \
+ _INLINE_ type name##_barrier(type a) { return a; }
+#endif
+
+VALUE_BARRIER(u8, uint8_t)
+VALUE_BARRIER(u32, uint32_t)
+VALUE_BARRIER(u64, uint64_t)
+
+// Compare two byte arrays in constant time; returns 1 if equal, 0 otherwise
+_INLINE_ uint32_t secure_cmp(IN const uint8_t *a,
+ IN const uint8_t *b,
+ IN const uint32_t size)
+{
+ volatile uint8_t res = 0;
+
+ for(uint32_t i = 0; i < size; ++i) {
+ res |= (a[i] ^ b[i]);
+ }
+
+ return (0 == res);
+}
+
+// Return 1 if the arguments are equal to each other. Return 0 otherwise.
+_INLINE_ uint32_t secure_cmp32(IN const uint32_t v1, IN const uint32_t v2)
+{
+#if defined(__aarch64__)
+ uint32_t res;
+ __asm__ __volatile__("cmp %w[V1], %w[V2]; \n "
+ "cset %w[RES], EQ; \n"
+ : [RES] "=r"(res)
+ : [V1] "r"(v1), [V2] "r"(v2)
+ : "cc" /*The condition code flag*/);
+ return res;
+#elif defined(__x86_64__) || defined(__i386__)
+ uint32_t res;
+ __asm__ __volatile__("xor %%edx, %%edx; \n"
+ "cmp %1, %2; \n "
+ "sete %%dl; \n"
+ "mov %%edx, %0; \n"
+ : "=r"(res)
+ : "r"(v1), "r"(v2)
+ : "rdx");
+ return res;
+#else
+ // Insecure comparison: The main purpose of secure_cmp32 is to avoid
+ // branches to prevent potential side channel leaks. To do that,
+ // we normally leverage some special CPU instructions such as "sete"
+ // (for __x86_64__) and "cset" (for __aarch64__). When dealing with general
+ // CPU architectures, the interpretation of the line below is left for the
+ // compiler. It could lead to an "insecure" branch. This case needs to be
+ // checked individually on such platforms
+ // (e.g., by checking the compiler-generated assembly).
+ return (v1 == v2 ? 1 : 0);
+#endif
+}
+
+// Return 0 if v1 < v2, (-1) otherwise
+_INLINE_ uint32_t secure_l32_mask(IN const uint32_t v1, IN const uint32_t v2)
+{
+#if defined(__aarch64__)
+ uint32_t res;
+ __asm__ __volatile__("cmp %w[V2], %w[V1]; \n "
+ "cset %w[RES], HI; \n"
+ : [RES] "=r"(res)
+ : [V1] "r"(v1), [V2] "r"(v2)
+ : "cc" /*The condition code flag*/);
+ return (res - 1);
+#elif defined(__x86_64__) || defined(__i386__)
+ uint32_t res;
+ __asm__ __volatile__("xor %%edx, %%edx; \n"
+ "cmp %1, %2; \n "
+ "setl %%dl; \n"
+ "dec %%edx; \n"
+ "mov %%edx, %0; \n"
+
+ : "=r"(res)
+ : "r"(v2), "r"(v1)
+ : "rdx");
+
+ return res;
+#else
+ // If v1 >= v2 then the 64-bit subtraction result is 0^32 || (v1 - v2);
+ // otherwise it is 1^32 || (2^32 - (v2 - v1)). Hence, negating the upper
+ // 32 bits gives 0 if v1 < v2, and (-1) otherwise.
+ return ~((uint32_t)(((uint64_t)v1 - (uint64_t)v2) >> 32));
+#endif
+}
+
+// bike_memcpy avoids the undefined behaviour of memcpy when byte_len=0
+_INLINE_ void *bike_memcpy(void *dst, const void *src, size_t byte_len)
+{
+ if(byte_len == 0) {
+ return dst;
+ }
+
+ return memcpy(dst, src, byte_len);
+}
+
+// bike_memset avoids the undefined behaviour of memset when byte_len=0
+_INLINE_ void *bike_memset(void *dst, const int ch, size_t byte_len)
+{
+ if(byte_len == 0) {
+ return dst;
+ }
+
+ return memset(dst, ch, byte_len);
+}
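A typical consumer of secure_l32_mask is branchless selection. A hedged sketch
of a constant-time minimum built on it (ct_min is an illustrative name, not an
s2n function):

#include <stdint.h>

#include "utilities.h"

static inline uint32_t ct_min(uint32_t a, uint32_t b)
{
  // mask is 0 when a < b and all-ones otherwise, so the selection
  // below yields a in the first case and b in the second.
  uint32_t mask = secure_l32_mask(a, b);
  return (b & mask) | (a & ~mask);
}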
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/x86_64_intrinsic.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/x86_64_intrinsic.h
new file mode 100644
index 0000000000..b5c1e989bd
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/x86_64_intrinsic.h
@@ -0,0 +1,132 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
+ * AWS Cryptographic Algorithms Group.
+ */
+
+// This file contains definitions of macros for SIMD intrinsic functions, used
+// throughout the code package. Where necessary, we add a suffix to a macro,
+// and denote the type of the elements (operands). For example,
+// - I16 denotes 16-bit wide integers,
+// - U64 denotes 64-bit wide unsigned integers.
+
+#pragma once
+
+#if defined(S2N_BIKE_R3_AVX2) || defined(S2N_BIKE_R3_AVX512)
+# include <immintrin.h>
+#endif
+
+// clang 3.9 doesn't recognize this macro
+#if !defined(_MM_CMPINT_EQ)
+# define _MM_CMPINT_EQ (0)
+#endif
+
+// For functions in gf2x_mul.c we use exactly the same code for the
+// PORTABLE, AVX2, and AVX512 implementations. Depending on the implementation,
+// we define macros for the different data types (uint64_t, __m256i, __m512i),
+// and all the required operations (LOAD, STORE, >>, <<) on these types.
+#if defined(AVX2_INTERNAL)
+
+# define REG_T __m256i
+
+# define LOAD(mem) _mm256_loadu_si256((const void *)(mem))
+# define STORE(mem, reg) _mm256_storeu_si256((void *)(mem), (reg))
+
+# define SLLI_I64(a, imm) _mm256_slli_epi64(a, imm)
+# define SRLI_I64(a, imm) _mm256_srli_epi64(a, imm)
+
+#elif defined(AVX512_INTERNAL)
+
+# define REG_T __m512i
+
+# define LOAD(mem) _mm512_loadu_si512((mem))
+# define STORE(mem, reg) _mm512_storeu_si512((mem), (reg))
+
+# define SLLI_I64(a, imm) _mm512_slli_epi64(a, imm)
+# define SRLI_I64(a, imm) _mm512_srli_epi64(a, imm)
+
+#elif defined(PORTABLE_INTERNAL)
+
+# define REG_T uint64_t
+
+# define LOAD(mem) (mem)[0]
+# define STORE(mem, val) (mem)[0] = val
+
+# define SLLI_I64(a, imm) ((a) << (imm))
+# define SRLI_I64(a, imm) ((a) >> (imm))
+
+#endif
+
+// NOLINT is used to avoid the sizeof(T)/sizeof(T) warning when REG_T is defined
+// to be uint64_t
+#define REG_QWORDS (sizeof(REG_T) / sizeof(uint64_t)) // NOLINT
+#define REG_DWORDS (sizeof(REG_T) / sizeof(uint32_t)) // NOLINT
+
+// The rest of the SIMD macros that are
+// required by the AVX2 and AVX512 implementations.
+#if defined(AVX2_INTERNAL)
+
+# define SET_I8(...) _mm256_set_epi8(__VA_ARGS__)
+# define SET_I32(...) _mm256_set_epi32(__VA_ARGS__)
+# define SET_I64(...) _mm256_set_epi64x(__VA_ARGS__)
+# define SET1_I8(a) _mm256_set1_epi8(a)
+# define SET1_I16(a) _mm256_set1_epi16(a)
+# define SET1_I32(a) _mm256_set1_epi32(a)
+# define SET1_I64(a) _mm256_set1_epi64x(a)
+# define SET_ZERO _mm256_setzero_si256()
+
+# define ADD_I8(a, b) _mm256_add_epi8(a, b)
+# define SUB_I8(a, b) _mm256_sub_epi8(a, b)
+# define ADD_I16(a, b) _mm256_add_epi16(a, b)
+# define SUB_I16(a, b) _mm256_sub_epi16(a, b)
+# define ADD_I64(a, b) _mm256_add_epi64(a, b)
+# define SRLI_I16(a, imm) _mm256_srli_epi16(a, imm)
+# define SLLI_I32(a, imm) _mm256_slli_epi32(a, imm)
+# define SLLV_I32(a, b) _mm256_sllv_epi32(a, b)
+
+# define CMPGT_I16(a, b) _mm256_cmpgt_epi16(a, b)
+# define CMPEQ_I16(a, b) _mm256_cmpeq_epi16(a, b)
+# define CMPEQ_I32(a, b) _mm256_cmpeq_epi32(a, b)
+# define CMPEQ_I64(a, b) _mm256_cmpeq_epi64(a, b)
+
+# define SHUF_I8(a, b) _mm256_shuffle_epi8(a, b)
+# define BLENDV_I8(a, b, mask) _mm256_blendv_epi8(a, b, mask)
+# define PERMVAR_I32(a, idx) _mm256_permutevar8x32_epi32(a, idx)
+# define PERM_I64(a, imm) _mm256_permute4x64_epi64(a, imm)
+
+# define MOVEMASK(a) _mm256_movemask_epi8(a)
+
+#elif defined(AVX512_INTERNAL)
+
+# define MSTORE(mem, mask, reg) _mm512_mask_store_epi64((mem), (mask), (reg))
+
+# define SET1_I8(a) _mm512_set1_epi8(a)
+# define SET1_I32(a) _mm512_set1_epi32(a)
+# define SET1_I64(a) _mm512_set1_epi64(a)
+# define SET1MZ_I8(mask, a) _mm512_maskz_set1_epi8(mask, a)
+# define SET1_I16(a) _mm512_set1_epi16(a)
+# define SET_I64(...) _mm512_set_epi64(__VA_ARGS__)
+# define SET_ZERO _mm512_setzero_si512()
+
+# define ADD_I16(a, b) _mm512_add_epi16(a, b)
+# define ADD_I64(a, b) _mm512_add_epi64(a, b)
+# define MSUB_I16(src, k, a, b) _mm512_mask_sub_epi16(src, k, a, b)
+# define SRLI_I16(a, imm) _mm512_srli_epi16(a, imm)
+# define SRLV_I64(a, cnt) _mm512_srlv_epi64(a, cnt)
+# define SLLV_I64(a, cnt) _mm512_sllv_epi64(a, cnt)
+# define MOR_I64(src, mask, a, b) _mm512_mask_or_epi64(src, mask, a, b)
+# define MXOR_I64(src, mask, a, b) _mm512_mask_xor_epi64(src, mask, a, b)
+# define VALIGN(a, b, count) _mm512_alignr_epi64(a, b, count)
+
+# define CMPM_U8(a, b, cmp_op) _mm512_cmp_epu8_mask(a, b, cmp_op)
+# define CMPM_U16(a, b, cmp_op) _mm512_cmp_epu16_mask(a, b, cmp_op)
+# define CMPMEQ_I64(a, b) _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_EQ)
+# define MCMPMEQ_I32(mask, a, b) \
+ _mm512_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ)
+
+# define PERMX_I64(a, imm) _mm512_permutex_epi64(a, imm)
+# define PERMX2VAR_I64(a, idx, b) _mm512_permutex2var_epi64(a, idx, b)
+# define PERMXVAR_I64(idx, a) _mm512_permutexvar_epi64(idx, a)
+
+#endif
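To illustrate the abstraction, a sketch of code written once against this macro
layer, shown here under the portable definitions; with AVX2_INTERNAL or
AVX512_INTERNAL the same source operates on 256- or 512-bit registers (where
`^` on REG_T relies on GCC/Clang vector extensions). The function and its
parameters are illustrative:

#include <stddef.h>
#include <stdint.h>

#define PORTABLE_INTERNAL
#include "x86_64_intrinsic.h"

/* r = a ^ b over n_qwords 64-bit words, i.e. addition in GF(2)[x]. */
static void gf2x_add_example(uint64_t *r, const uint64_t *a,
                             const uint64_t *b, size_t n_qwords)
{
  for(size_t i = 0; i < n_qwords; i += REG_QWORDS) {
    REG_T va = LOAD(&a[i]);
    REG_T vb = LOAD(&b[i]);
    STORE(&r[i], va ^ vb);
  }
}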
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/indcpa.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/indcpa.c
index c37548326d..4c520b693f 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/indcpa.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/indcpa.c
@@ -188,7 +188,7 @@ int PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) {
uint8_t *noiseseed = buf + KYBER_SYMBYTES;
uint8_t nonce = 0;
- GUARD_AS_POSIX(s2n_get_random_bytes(buf, KYBER_SYMBYTES));
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, KYBER_SYMBYTES));
hash_g(buf, buf, KYBER_SYMBYTES);
gen_a(a, publicseed);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/kyber_90s_r2_kem.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/kyber_90s_r2_kem.c
index 9de3c1daef..5b4c088b11 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/kyber_90s_r2_kem.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/kyber_90s_r2_kem.c
@@ -22,14 +22,14 @@
* Returns 0 (success)
**************************************************/
int kyber_512_90s_r2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
size_t i;
PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(pk, sk);
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) {
sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i];
}
hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
- GUARD_AS_POSIX(s2n_get_random_bytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES)); /* Value z for pseudo-random output on reject */
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES)); /* Value z for pseudo-random output on reject */
return 0;
}
@@ -46,11 +46,11 @@ int kyber_512_90s_r2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
* Returns 0 (success)
**************************************************/
int kyber_512_90s_r2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */
uint8_t buf[2 * KYBER_SYMBYTES];
- GUARD_AS_POSIX(s2n_get_random_bytes(buf, KYBER_SYMBYTES));
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, KYBER_SYMBYTES));
hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */
hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */
@@ -78,7 +78,7 @@ int kyber_512_90s_r2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk)
* On failure, ss will contain a pseudo-random value.
**************************************************/
int kyber_512_90s_r2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
size_t i;
uint8_t fail;
uint8_t cmp[KYBER_CIPHERTEXTBYTES];
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/ntt.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/ntt.h
index 720bee975a..66fc5a9484 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/ntt.h
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/ntt.h
@@ -6,8 +6,8 @@
extern const int16_t PQCLEAN_KYBER51290S_CLEAN_zetas[128];
extern const int16_t PQCLEAN_KYBER51290S_CLEAN_zetasinv[128];
-void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t *poly);
-void PQCLEAN_KYBER51290S_CLEAN_invntt(int16_t *poly);
+void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t poly[256]);
+void PQCLEAN_KYBER51290S_CLEAN_invntt(int16_t poly[256]);
void PQCLEAN_KYBER51290S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta);
#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/indcpa.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/indcpa.c
index 233b5d8515..1b76bb9b0c 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/indcpa.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/indcpa.c
@@ -188,7 +188,7 @@ int PQCLEAN_KYBER512_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) {
uint8_t *noiseseed = buf + KYBER_SYMBYTES;
uint8_t nonce = 0;
- GUARD_AS_POSIX(s2n_get_random_bytes(buf, KYBER_SYMBYTES));
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, KYBER_SYMBYTES));
hash_g(buf, buf, KYBER_SYMBYTES);
gen_a(a, publicseed);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/kyber_r2_kem.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/kyber_r2_kem.c
index 9871084bb4..140ec352d4 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/kyber_r2_kem.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/kyber_r2_kem.c
@@ -22,14 +22,14 @@
* Returns 0 (success)
**************************************************/
int kyber_512_r2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
size_t i;
PQCLEAN_KYBER512_CLEAN_indcpa_keypair(pk, sk);
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) {
sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i];
}
hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
- GUARD_AS_POSIX(s2n_get_random_bytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES)); /* Value z for pseudo-random output on reject */
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES)); /* Value z for pseudo-random output on reject */
return 0;
}
@@ -46,11 +46,11 @@ int kyber_512_r2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
* Returns 0 (success)
**************************************************/
int kyber_512_r2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */
uint8_t buf[2 * KYBER_SYMBYTES];
- GUARD_AS_POSIX(s2n_get_random_bytes(buf, KYBER_SYMBYTES));
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, KYBER_SYMBYTES));
hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */
hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */
@@ -78,7 +78,7 @@ int kyber_512_r2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
* On failure, ss will contain a pseudo-random value.
**************************************************/
int kyber_512_r2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
size_t i;
uint8_t fail;
uint8_t cmp[KYBER_CIPHERTEXTBYTES];
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/ntt.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/ntt.h
index 13e976f7d0..7885e7cdc6 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/ntt.h
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/ntt.h
@@ -6,8 +6,8 @@
extern const int16_t PQCLEAN_KYBER512_CLEAN_zetas[128];
extern const int16_t PQCLEAN_KYBER512_CLEAN_zetasinv[128];
-void PQCLEAN_KYBER512_CLEAN_ntt(int16_t *poly);
-void PQCLEAN_KYBER512_CLEAN_invntt(int16_t *poly);
+void PQCLEAN_KYBER512_CLEAN_ntt(int16_t poly[256]);
+void PQCLEAN_KYBER512_CLEAN_invntt(int16_t poly[256]);
void PQCLEAN_KYBER512_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta);
#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SIMD256_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SIMD256_avx2.c
new file mode 100644
index 0000000000..349442f65c
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SIMD256_avx2.c
@@ -0,0 +1,1284 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+// extra headers are removed: smmintrin.h, wmmintrin.h and emmintrin.h
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#include <immintrin.h>
+#include "KeccakP-align_avx2.h"
+#include "KeccakP-1600-times4-SnP_avx2.h"
+#include "KeccakP-SIMD256-config_avx2.h"
+
+#include "KeccakP-brg_endian_avx2.h"
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+#error Expecting a little-endian platform
+#endif
+
+typedef unsigned char UINT8;
+typedef unsigned long long int UINT64;
+typedef __m128i V128;
+typedef __m256i V256;
+
+#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex)
+
+#if defined(KeccakP1600times4_useAVX2)
+ #define ANDnu256(a, b) _mm256_andnot_si256(a, b)
+ // correcting cast-align error
+ // old version: #define CONST256(a) _mm256_load_si256((const V256 *)&(a))
+ #define CONST256(a) _mm256_load_si256((const void *)&(a))
+ #define CONST256_64(a) (V256)_mm256_broadcast_sd((const double*)(&a))
+ #define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
+ // correcting cast-align error
+ // old version: #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
+ #define LOAD256u(a) _mm256_loadu_si256((const void *)&(a))
+ #define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
+ #define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
+ #define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
+ #define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
+static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
+static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
+ #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
+ // correcting cast-align error
+ // old version: #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
+ #define STORE256u(a, b) _mm256_storeu_si256((void *)&(a), b)
+ #define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v)
+ #define XOR256(a, b) _mm256_xor_si256(a, b)
+ #define XOReq256(a, b) a = _mm256_xor_si256(a, b)
+ #define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b))
+ #define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b))
+ #define PERM128( a, b, c ) (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c)
+ #define SHUFFLE64( a, b, c ) (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c)
+
+ #define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \
+ lanesH01 = UNPACKH( lanes0, lanes1 ), \
+ lanesL23 = UNPACKL( lanes2, lanes3 ), \
+ lanesH23 = UNPACKH( lanes2, lanes3 ), \
+ lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \
+ lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \
+ lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \
+ lanes3 = PERM128( lanesH01, lanesH23, 0x31 )
+
+ #define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \
+ lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \
+ lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \
+ lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \
+ lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
+ lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
+ lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
+ lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )
+
+#endif
+
+#define SnP_laneLengthInBytes 8
+
+void KeccakP1600times4_InitializeAll(void *states)
+{
+ memset(states, 0, KeccakP1600times4_statesSizeInBytes);
+}
+
+void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+ unsigned int sizeLeft = length;
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
+ const unsigned char *curData = data;
+ UINT64 *statesAsLanes = (UINT64 *)states;
+
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
+ UINT64 lane = 0;
+ if (bytesInLane > sizeLeft)
+ bytesInLane = sizeLeft;
+ memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
+ sizeLeft -= bytesInLane;
+ lanePosition++;
+ curData += bytesInLane;
+ }
+
+ while(sizeLeft >= SnP_laneLengthInBytes) {
+ // correcting cast-align error
+ // old version: UINT64 lane = *((const UINT64*)curData);
+ UINT64 lane = *((const UINT64*)(const void *)curData);
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
+ sizeLeft -= SnP_laneLengthInBytes;
+ lanePosition++;
+ curData += SnP_laneLengthInBytes;
+ }
+
+ if (sizeLeft > 0) {
+ UINT64 lane = 0;
+ memcpy(&lane, curData, sizeLeft);
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
+ }
+}
+
+void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+{
+ V256 *stateAsLanes = (V256 *)states;
+ unsigned int i;
+ // correcting cast-align errors
+ // old version: const UINT64 *curData0 = (const UINT64 *)data;
+ const UINT64 *curData0 = (const void *)data;
+ // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
+ const UINT64 *curData1 = (const void *)(data+laneOffset*SnP_laneLengthInBytes);
+ // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
+ const UINT64 *curData2 = (const void *)(data+laneOffset*2*SnP_laneLengthInBytes);
+ // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
+ const UINT64 *curData3 = (const void *)(data+laneOffset*3*SnP_laneLengthInBytes);
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
+
+ #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
+
+ #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
+ lanes1 = LOAD256u( curData1[argIndex]),\
+ lanes2 = LOAD256u( curData2[argIndex]),\
+ lanes3 = LOAD256u( curData3[argIndex]),\
+ INTLEAVE(),\
+ XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
+ XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
+ XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
+ XOReq256( stateAsLanes[argIndex+3], lanes3 )
+
+ if ( laneCount >= 16 ) {
+ Xor_In4( 0 );
+ Xor_In4( 4 );
+ Xor_In4( 8 );
+ Xor_In4( 12 );
+ if ( laneCount >= 20 ) {
+ Xor_In4( 16 );
+ for(i=20; i<laneCount; i++)
+ Xor_In( i );
+ }
+ else {
+ for(i=16; i<laneCount; i++)
+ Xor_In( i );
+ }
+ }
+ else {
+ for(i=0; i<laneCount; i++)
+ Xor_In( i );
+ }
+ #undef Xor_In
+ #undef Xor_In4
+}
+
+void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+ unsigned int sizeLeft = length;
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
+ const unsigned char *curData = data;
+ UINT64 *statesAsLanes = (UINT64 *)states;
+
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
+ if (bytesInLane > sizeLeft)
+ bytesInLane = sizeLeft;
+ memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
+ sizeLeft -= bytesInLane;
+ lanePosition++;
+ curData += bytesInLane;
+ }
+
+ while(sizeLeft >= SnP_laneLengthInBytes) {
+ // correcting cast-align error
+ // old version: UINT64 lane = *((const UINT64*)curData);
+ UINT64 lane = *((const UINT64*)(const void*)curData);
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
+ sizeLeft -= SnP_laneLengthInBytes;
+ lanePosition++;
+ curData += SnP_laneLengthInBytes;
+ }
+
+ if (sizeLeft > 0) {
+ memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
+ }
+}
+
+void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+{
+ V256 *stateAsLanes = (V256 *)states;
+ unsigned int i;
+ // correcting cast-align errors
+ // old version: const UINT64 *curData0 = (const UINT64 *)data;
+ const UINT64 *curData0 = (const void *)data;
+ // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
+ const UINT64 *curData1 = (const void *)(data+laneOffset*SnP_laneLengthInBytes);
+ // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
+ const UINT64 *curData2 = (const void *)(data+laneOffset*2*SnP_laneLengthInBytes);
+ // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
+ const UINT64 *curData3 = (const void *)(data+laneOffset*3*SnP_laneLengthInBytes);
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
+
+ #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
+
+ #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
+ lanes1 = LOAD256u( curData1[argIndex]),\
+ lanes2 = LOAD256u( curData2[argIndex]),\
+ lanes3 = LOAD256u( curData3[argIndex]),\
+ INTLEAVE(),\
+ STORE256( stateAsLanes[argIndex+0], lanes0 ),\
+ STORE256( stateAsLanes[argIndex+1], lanes1 ),\
+ STORE256( stateAsLanes[argIndex+2], lanes2 ),\
+ STORE256( stateAsLanes[argIndex+3], lanes3 )
+
+ if ( laneCount >= 16 ) {
+ OverWr4( 0 );
+ OverWr4( 4 );
+ OverWr4( 8 );
+ OverWr4( 12 );
+ if ( laneCount >= 20 ) {
+ OverWr4( 16 );
+ for(i=20; i<laneCount; i++)
+ OverWr( i );
+ }
+ else {
+ for(i=16; i<laneCount; i++)
+ OverWr( i );
+ }
+ }
+ else {
+ for(i=0; i<laneCount; i++)
+ OverWr( i );
+ }
+ #undef OverWr
+ #undef OverWr4
+}
+
+void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
+{
+ unsigned int sizeLeft = byteCount;
+ unsigned int lanePosition = 0;
+ UINT64 *statesAsLanes = (UINT64 *)states;
+
+ while(sizeLeft >= SnP_laneLengthInBytes) {
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
+ sizeLeft -= SnP_laneLengthInBytes;
+ lanePosition++;
+ }
+
+ if (sizeLeft > 0) {
+ memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
+ }
+}
+
+void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
+{
+ unsigned int sizeLeft = length;
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
+ unsigned char *curData = data;
+ const UINT64 *statesAsLanes = (const UINT64 *)states;
+
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
+ if (bytesInLane > sizeLeft)
+ bytesInLane = sizeLeft;
+ // correcting cast-qual error
+ // old version: memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
+ memcpy( curData, ((const unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
+ sizeLeft -= bytesInLane;
+ lanePosition++;
+ curData += bytesInLane;
+ }
+
+ while(sizeLeft >= SnP_laneLengthInBytes) {
+ // correcting cast-align error
+ // old version: *(UINT64*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
+ *(UINT64*)(void*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
+ sizeLeft -= SnP_laneLengthInBytes;
+ lanePosition++;
+ curData += SnP_laneLengthInBytes;
+ }
+
+ if (sizeLeft > 0) {
+ memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
+ }
+}
+
+void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+{
+ // correcting cast-align errors
+ // old version: UINT64 *curData0 = (UINT64 *)data;
+ UINT64 *curData0 = (void *)data;
+ // old version: UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes);
+ UINT64 *curData1 = (void *)(data+laneOffset*1*SnP_laneLengthInBytes);
+ // old version: UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
+ UINT64 *curData2 = (void *)(data+laneOffset*2*SnP_laneLengthInBytes);
+ // old version: UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
+ UINT64 *curData3 = (void *)(data+laneOffset*3*SnP_laneLengthInBytes);
+
+ const V256 *stateAsLanes = (const V256 *)states;
+ const UINT64 *stateAsLanes64 = (const UINT64*)states;
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
+ unsigned int i;
+
+ #define Extr( argIndex ) curData0[argIndex] = stateAsLanes64[4*(argIndex)], \
+ curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \
+ curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \
+ curData3[argIndex] = stateAsLanes64[4*(argIndex)+3]
+
+ #define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \
+ lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \
+ lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \
+ lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \
+ UNINTLEAVE(), \
+ STORE256u( curData0[argIndex], lanes0 ), \
+ STORE256u( curData1[argIndex], lanes1 ), \
+ STORE256u( curData2[argIndex], lanes2 ), \
+ STORE256u( curData3[argIndex], lanes3 )
+
+ if ( laneCount >= 16 ) {
+ Extr4( 0 );
+ Extr4( 4 );
+ Extr4( 8 );
+ Extr4( 12 );
+ if ( laneCount >= 20 ) {
+ Extr4( 16 );
+ for(i=20; i<laneCount; i++)
+ Extr( i );
+ }
+ else {
+ for(i=16; i<laneCount; i++)
+ Extr( i );
+ }
+ }
+ else {
+ for(i=0; i<laneCount; i++)
+ Extr( i );
+ }
+ #undef Extr
+ #undef Extr4
+}
+
+void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{
+ unsigned int sizeLeft = length;
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
+ const unsigned char *curInput = input;
+ unsigned char *curOutput = output;
+ const UINT64 *statesAsLanes = (const UINT64 *)states;
+
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
+ UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
+ if (bytesInLane > sizeLeft)
+ bytesInLane = sizeLeft;
+ sizeLeft -= bytesInLane;
+ do {
+ *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
+ lane >>= 8;
+ } while ( --bytesInLane != 0);
+ lanePosition++;
+ }
+
+ while(sizeLeft >= SnP_laneLengthInBytes) {
+ // correcting cast-align and cast-qual errors
+ // old version: *((UINT64*)curOutput) = *((UINT64*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
+ *((UINT64*)(void*)curOutput) = *((const UINT64*)(const void*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
+ sizeLeft -= SnP_laneLengthInBytes;
+ lanePosition++;
+ curInput += SnP_laneLengthInBytes;
+ curOutput += SnP_laneLengthInBytes;
+ }
+
+ if (sizeLeft != 0) {
+ UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
+ do {
+ *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
+ lane >>= 8;
+ } while ( --sizeLeft != 0);
+ }
+}
+
+void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
+{
+ // correcting cast-align and cast-qual errors
+ // old version: const UINT64 *curInput0 = (UINT64 *)input;
+ const UINT64 *curInput0 = (const void *)input;
+ // old version: const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes);
+ const UINT64 *curInput1 = (const void *)(input+laneOffset*1*SnP_laneLengthInBytes);
+ // old version: const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes);
+ const UINT64 *curInput2 = (const void *)(input+laneOffset*2*SnP_laneLengthInBytes);
+ // old version: const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes);
+ const UINT64 *curInput3 = (const void *)(input+laneOffset*3*SnP_laneLengthInBytes);
+ // correcting cast-align errors
+ // old version: UINT64 *curOutput0 = (UINT64 *)output;
+ UINT64 *curOutput0 = (void *)output;
+ // old version: UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes);
+ UINT64 *curOutput1 = (void *)(output+laneOffset*1*SnP_laneLengthInBytes);
+ // old version: UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes);
+ UINT64 *curOutput2 = (void *)(output+laneOffset*2*SnP_laneLengthInBytes);
+ // old version: UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes);
+ UINT64 *curOutput3 = (void *)(output+laneOffset*3*SnP_laneLengthInBytes);
+
+ const V256 *stateAsLanes = (const V256 *)states;
+ const UINT64 *stateAsLanes64 = (const UINT64*)states;
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
+ unsigned int i;
+
+ #define ExtrXor( argIndex ) \
+ curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\
+ curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\
+ curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\
+ curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3]
+
+ #define ExtrXor4( argIndex ) \
+ lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\
+ lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\
+ lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\
+ lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\
+ UNINTLEAVE(),\
+ lanesL01 = LOAD256u( curInput0[argIndex]),\
+ lanesH01 = LOAD256u( curInput1[argIndex]),\
+ lanesL23 = LOAD256u( curInput2[argIndex]),\
+ lanesH23 = LOAD256u( curInput3[argIndex]),\
+ XOReq256( lanes0, lanesL01 ),\
+ XOReq256( lanes1, lanesH01 ),\
+ XOReq256( lanes2, lanesL23 ),\
+ XOReq256( lanes3, lanesH23 ),\
+ STORE256u( curOutput0[argIndex], lanes0 ),\
+ STORE256u( curOutput1[argIndex], lanes1 ),\
+ STORE256u( curOutput2[argIndex], lanes2 ),\
+ STORE256u( curOutput3[argIndex], lanes3 )
+
+ if ( laneCount >= 16 ) {
+ ExtrXor4( 0 );
+ ExtrXor4( 4 );
+ ExtrXor4( 8 );
+ ExtrXor4( 12 );
+ if ( laneCount >= 20 ) {
+ ExtrXor4( 16 );
+ for(i=20; i<laneCount; i++)
+ ExtrXor( i );
+ }
+ else {
+ for(i=16; i<laneCount; i++)
+ ExtrXor( i );
+ }
+ }
+ else {
+ for(i=0; i<laneCount; i++)
+ ExtrXor( i );
+ }
+ #undef ExtrXor
+ #undef ExtrXor4
+}
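All of the routines above address the interleaved 4-way state through
laneIndex(): lane p of instance k lives at 64-bit word p*4 + k, so the four
Keccak-p[1600] states advance lane-by-lane through memory. A minimal accessor
sketch (get_lane is an illustrative name):

#include <stdint.h>

static uint64_t get_lane(const uint64_t states[25 * 4],
                         unsigned int instance, unsigned int lane_position)
{
  /* Mirrors laneIndex(instanceIndex, lanePosition). */
  return states[(lane_position * 4) + instance];
}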
+
+#define declareABCDE \
+ V256 Aba, Abe, Abi, Abo, Abu; \
+ V256 Aga, Age, Agi, Ago, Agu; \
+ V256 Aka, Ake, Aki, Ako, Aku; \
+ V256 Ama, Ame, Ami, Amo, Amu; \
+ V256 Asa, Ase, Asi, Aso, Asu; \
+ V256 Bba, Bbe, Bbi, Bbo, Bbu; \
+ V256 Bga, Bge, Bgi, Bgo, Bgu; \
+ V256 Bka, Bke, Bki, Bko, Bku; \
+ V256 Bma, Bme, Bmi, Bmo, Bmu; \
+ V256 Bsa, Bse, Bsi, Bso, Bsu; \
+ V256 Ca, Ce, Ci, Co, Cu; \
+ V256 Ca1, Ce1, Ci1, Co1, Cu1; \
+ V256 Da, De, Di, Do, Du; \
+ V256 Eba, Ebe, Ebi, Ebo, Ebu; \
+ V256 Ega, Ege, Egi, Ego, Egu; \
+ V256 Eka, Eke, Eki, Eko, Eku; \
+ V256 Ema, Eme, Emi, Emo, Emu; \
+ V256 Esa, Ese, Esi, Eso, Esu; \
+
+#define prepareTheta \
+ Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
+ Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
+ Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
+ Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
+ Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \
+
+/* --- Theta Rho Pi Chi Iota Prepare-theta */
+/* --- 64-bit lanes mapped to 64-bit words */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+ ROL64in256(Ce1, Ce, 1); \
+ Da = XOR256(Cu, Ce1); \
+ ROL64in256(Ci1, Ci, 1); \
+ De = XOR256(Ca, Ci1); \
+ ROL64in256(Co1, Co, 1); \
+ Di = XOR256(Ce, Co1); \
+ ROL64in256(Cu1, Cu, 1); \
+ Do = XOR256(Ci, Cu1); \
+ ROL64in256(Ca1, Ca, 1); \
+ Du = XOR256(Co, Ca1); \
+\
+ XOReq256(A##ba, Da); \
+ Bba = A##ba; \
+ XOReq256(A##ge, De); \
+ ROL64in256(Bbe, A##ge, 44); \
+ XOReq256(A##ki, Di); \
+ ROL64in256(Bbi, A##ki, 43); \
+ E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
+ XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
+ Ca = E##ba; \
+ XOReq256(A##mo, Do); \
+ ROL64in256(Bbo, A##mo, 21); \
+ E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
+ Ce = E##be; \
+ XOReq256(A##su, Du); \
+ ROL64in256(Bbu, A##su, 14); \
+ E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
+ Ci = E##bi; \
+ E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
+ Co = E##bo; \
+ E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
+ Cu = E##bu; \
+\
+ XOReq256(A##bo, Do); \
+ ROL64in256(Bga, A##bo, 28); \
+ XOReq256(A##gu, Du); \
+ ROL64in256(Bge, A##gu, 20); \
+ XOReq256(A##ka, Da); \
+ ROL64in256(Bgi, A##ka, 3); \
+ E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
+ XOReq256(Ca, E##ga); \
+ XOReq256(A##me, De); \
+ ROL64in256(Bgo, A##me, 45); \
+ E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
+ XOReq256(Ce, E##ge); \
+ XOReq256(A##si, Di); \
+ ROL64in256(Bgu, A##si, 61); \
+ E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
+ XOReq256(Ci, E##gi); \
+ E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
+ XOReq256(Co, E##go); \
+ E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
+ XOReq256(Cu, E##gu); \
+\
+ XOReq256(A##be, De); \
+ ROL64in256(Bka, A##be, 1); \
+ XOReq256(A##gi, Di); \
+ ROL64in256(Bke, A##gi, 6); \
+ XOReq256(A##ko, Do); \
+ ROL64in256(Bki, A##ko, 25); \
+ E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
+ XOReq256(Ca, E##ka); \
+ XOReq256(A##mu, Du); \
+ ROL64in256_8(Bko, A##mu); \
+ E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
+ XOReq256(Ce, E##ke); \
+ XOReq256(A##sa, Da); \
+ ROL64in256(Bku, A##sa, 18); \
+ E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
+ XOReq256(Ci, E##ki); \
+ E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
+ XOReq256(Co, E##ko); \
+ E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
+ XOReq256(Cu, E##ku); \
+\
+ XOReq256(A##bu, Du); \
+ ROL64in256(Bma, A##bu, 27); \
+ XOReq256(A##ga, Da); \
+ ROL64in256(Bme, A##ga, 36); \
+ XOReq256(A##ke, De); \
+ ROL64in256(Bmi, A##ke, 10); \
+ E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
+ XOReq256(Ca, E##ma); \
+ XOReq256(A##mi, Di); \
+ ROL64in256(Bmo, A##mi, 15); \
+ E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
+ XOReq256(Ce, E##me); \
+ XOReq256(A##so, Do); \
+ ROL64in256_56(Bmu, A##so); \
+ E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
+ XOReq256(Ci, E##mi); \
+ E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
+ XOReq256(Co, E##mo); \
+ E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
+ XOReq256(Cu, E##mu); \
+\
+ XOReq256(A##bi, Di); \
+ ROL64in256(Bsa, A##bi, 62); \
+ XOReq256(A##go, Do); \
+ ROL64in256(Bse, A##go, 55); \
+ XOReq256(A##ku, Du); \
+ ROL64in256(Bsi, A##ku, 39); \
+ E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
+ XOReq256(Ca, E##sa); \
+ XOReq256(A##ma, Da); \
+ ROL64in256(Bso, A##ma, 41); \
+ E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
+ XOReq256(Ce, E##se); \
+ XOReq256(A##se, De); \
+ ROL64in256(Bsu, A##se, 2); \
+ E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
+ XOReq256(Ci, E##si); \
+ E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
+ XOReq256(Co, E##so); \
+ E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
+ XOReq256(Cu, E##su); \
+\
+
+/* --- Theta Rho Pi Chi Iota */
+/* --- 64-bit lanes mapped to 64-bit words */
+#define thetaRhoPiChiIota(i, A, E) \
+ ROL64in256(Ce1, Ce, 1); \
+ Da = XOR256(Cu, Ce1); \
+ ROL64in256(Ci1, Ci, 1); \
+ De = XOR256(Ca, Ci1); \
+ ROL64in256(Co1, Co, 1); \
+ Di = XOR256(Ce, Co1); \
+ ROL64in256(Cu1, Cu, 1); \
+ Do = XOR256(Ci, Cu1); \
+ ROL64in256(Ca1, Ca, 1); \
+ Du = XOR256(Co, Ca1); \
+\
+ XOReq256(A##ba, Da); \
+ Bba = A##ba; \
+ XOReq256(A##ge, De); \
+ ROL64in256(Bbe, A##ge, 44); \
+ XOReq256(A##ki, Di); \
+ ROL64in256(Bbi, A##ki, 43); \
+ E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
+ XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
+ XOReq256(A##mo, Do); \
+ ROL64in256(Bbo, A##mo, 21); \
+ E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
+ XOReq256(A##su, Du); \
+ ROL64in256(Bbu, A##su, 14); \
+ E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
+ E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
+ E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
+\
+ XOReq256(A##bo, Do); \
+ ROL64in256(Bga, A##bo, 28); \
+ XOReq256(A##gu, Du); \
+ ROL64in256(Bge, A##gu, 20); \
+ XOReq256(A##ka, Da); \
+ ROL64in256(Bgi, A##ka, 3); \
+ E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
+ XOReq256(A##me, De); \
+ ROL64in256(Bgo, A##me, 45); \
+ E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
+ XOReq256(A##si, Di); \
+ ROL64in256(Bgu, A##si, 61); \
+ E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
+ E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
+ E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
+\
+ XOReq256(A##be, De); \
+ ROL64in256(Bka, A##be, 1); \
+ XOReq256(A##gi, Di); \
+ ROL64in256(Bke, A##gi, 6); \
+ XOReq256(A##ko, Do); \
+ ROL64in256(Bki, A##ko, 25); \
+ E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
+ XOReq256(A##mu, Du); \
+ ROL64in256_8(Bko, A##mu); \
+ E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
+ XOReq256(A##sa, Da); \
+ ROL64in256(Bku, A##sa, 18); \
+ E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
+ E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
+ E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
+\
+ XOReq256(A##bu, Du); \
+ ROL64in256(Bma, A##bu, 27); \
+ XOReq256(A##ga, Da); \
+ ROL64in256(Bme, A##ga, 36); \
+ XOReq256(A##ke, De); \
+ ROL64in256(Bmi, A##ke, 10); \
+ E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
+ XOReq256(A##mi, Di); \
+ ROL64in256(Bmo, A##mi, 15); \
+ E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
+ XOReq256(A##so, Do); \
+ ROL64in256_56(Bmu, A##so); \
+ E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
+ E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
+ E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
+\
+ XOReq256(A##bi, Di); \
+ ROL64in256(Bsa, A##bi, 62); \
+ XOReq256(A##go, Do); \
+ ROL64in256(Bse, A##go, 55); \
+ XOReq256(A##ku, Du); \
+ ROL64in256(Bsi, A##ku, 39); \
+ E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
+ XOReq256(A##ma, Da); \
+ ROL64in256(Bso, A##ma, 41); \
+ E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
+ XOReq256(A##se, De); \
+ ROL64in256(Bsu, A##se, 2); \
+ E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
+ E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
+ E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
+\
+
+static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {
+ 0x0000000000000001ULL,
+ 0x0000000000008082ULL,
+ 0x800000000000808aULL,
+ 0x8000000080008000ULL,
+ 0x000000000000808bULL,
+ 0x0000000080000001ULL,
+ 0x8000000080008081ULL,
+ 0x8000000000008009ULL,
+ 0x000000000000008aULL,
+ 0x0000000000000088ULL,
+ 0x0000000080008009ULL,
+ 0x000000008000000aULL,
+ 0x000000008000808bULL,
+ 0x800000000000008bULL,
+ 0x8000000000008089ULL,
+ 0x8000000000008003ULL,
+ 0x8000000000008002ULL,
+ 0x8000000000000080ULL,
+ 0x000000000000800aULL,
+ 0x800000008000000aULL,
+ 0x8000000080008081ULL,
+ 0x8000000000008080ULL,
+ 0x0000000080000001ULL,
+ 0x8000000080008008ULL};
+
+#define copyFromState(X, state) \
+ X##ba = LOAD256(state[ 0]); \
+ X##be = LOAD256(state[ 1]); \
+ X##bi = LOAD256(state[ 2]); \
+ X##bo = LOAD256(state[ 3]); \
+ X##bu = LOAD256(state[ 4]); \
+ X##ga = LOAD256(state[ 5]); \
+ X##ge = LOAD256(state[ 6]); \
+ X##gi = LOAD256(state[ 7]); \
+ X##go = LOAD256(state[ 8]); \
+ X##gu = LOAD256(state[ 9]); \
+ X##ka = LOAD256(state[10]); \
+ X##ke = LOAD256(state[11]); \
+ X##ki = LOAD256(state[12]); \
+ X##ko = LOAD256(state[13]); \
+ X##ku = LOAD256(state[14]); \
+ X##ma = LOAD256(state[15]); \
+ X##me = LOAD256(state[16]); \
+ X##mi = LOAD256(state[17]); \
+ X##mo = LOAD256(state[18]); \
+ X##mu = LOAD256(state[19]); \
+ X##sa = LOAD256(state[20]); \
+ X##se = LOAD256(state[21]); \
+ X##si = LOAD256(state[22]); \
+ X##so = LOAD256(state[23]); \
+ X##su = LOAD256(state[24]); \
+
+#define copyToState(state, X) \
+ STORE256(state[ 0], X##ba); \
+ STORE256(state[ 1], X##be); \
+ STORE256(state[ 2], X##bi); \
+ STORE256(state[ 3], X##bo); \
+ STORE256(state[ 4], X##bu); \
+ STORE256(state[ 5], X##ga); \
+ STORE256(state[ 6], X##ge); \
+ STORE256(state[ 7], X##gi); \
+ STORE256(state[ 8], X##go); \
+ STORE256(state[ 9], X##gu); \
+ STORE256(state[10], X##ka); \
+ STORE256(state[11], X##ke); \
+ STORE256(state[12], X##ki); \
+ STORE256(state[13], X##ko); \
+ STORE256(state[14], X##ku); \
+ STORE256(state[15], X##ma); \
+ STORE256(state[16], X##me); \
+ STORE256(state[17], X##mi); \
+ STORE256(state[18], X##mo); \
+ STORE256(state[19], X##mu); \
+ STORE256(state[20], X##sa); \
+ STORE256(state[21], X##se); \
+ STORE256(state[22], X##si); \
+ STORE256(state[23], X##so); \
+ STORE256(state[24], X##su); \
+
+#define copyStateVariables(X, Y) \
+ X##ba = Y##ba; \
+ X##be = Y##be; \
+ X##bi = Y##bi; \
+ X##bo = Y##bo; \
+ X##bu = Y##bu; \
+ X##ga = Y##ga; \
+ X##ge = Y##ge; \
+ X##gi = Y##gi; \
+ X##go = Y##go; \
+ X##gu = Y##gu; \
+ X##ka = Y##ka; \
+ X##ke = Y##ke; \
+ X##ki = Y##ki; \
+ X##ko = Y##ko; \
+ X##ku = Y##ku; \
+ X##ma = Y##ma; \
+ X##me = Y##me; \
+ X##mi = Y##mi; \
+ X##mo = Y##mo; \
+ X##mu = Y##mu; \
+ X##sa = Y##sa; \
+ X##se = Y##se; \
+ X##si = Y##si; \
+ X##so = Y##so; \
+ X##su = Y##su; \
+
+#ifdef KeccakP1600times4_fullUnrolling
+#define FullUnrolling
+#else
+#define Unrolling KeccakP1600times4_unrolling
+#endif
+// The macro file is inlined into this source file directly
+/*****#include "KeccakP-1600-unrolling_avx2.macros"*****/
+/*******************************************************/
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#if (defined(FullUnrolling))
+#define rounds24 \
+ prepareTheta \
+ thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
+ thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
+ thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
+ thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
+ thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
+ thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
+ thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
+ thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
+ thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(10, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(11, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+ thetaRhoPiChiIota(23, E, A) \
+
+#define rounds12 \
+ prepareTheta \
+ thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+ thetaRhoPiChiIota(23, E, A) \
+
+#elif (Unrolling == 12)
+#define rounds24 \
+ prepareTheta \
+ for(i=0; i<24; i+=12) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \
+ } \
+
+#define rounds12 \
+ prepareTheta \
+ thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+ thetaRhoPiChiIota(23, E, A) \
+
+#elif (Unrolling == 6)
+#define rounds24 \
+ prepareTheta \
+ for(i=0; i<24; i+=6) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
+ } \
+
+#define rounds12 \
+ prepareTheta \
+ for(i=12; i<24; i+=6) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
+ } \
+
+#elif (Unrolling == 4)
+#define rounds24 \
+ prepareTheta \
+ for(i=0; i<24; i+=4) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+ } \
+
+#define rounds12 \
+ prepareTheta \
+ for(i=12; i<24; i+=4) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+ } \
+
+#elif (Unrolling == 3)
+#define rounds24 \
+ prepareTheta \
+ for(i=0; i<24; i+=3) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+ copyStateVariables(A, E) \
+ } \
+
+#define rounds12 \
+ prepareTheta \
+ for(i=12; i<24; i+=3) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+ copyStateVariables(A, E) \
+ } \
+
+#elif (Unrolling == 2)
+#define rounds24 \
+ prepareTheta \
+ for(i=0; i<24; i+=2) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ } \
+
+#define rounds12 \
+ prepareTheta \
+ for(i=12; i<24; i+=2) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ } \
+
+#elif (Unrolling == 1)
+#define rounds24 \
+ prepareTheta \
+ for(i=0; i<24; i++) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ copyStateVariables(A, E) \
+ } \
+
+#define rounds12 \
+ prepareTheta \
+ for(i=12; i<24; i++) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ copyStateVariables(A, E) \
+ } \
+
+#else
+#error "Unrolling is not correctly specified!"
+#endif
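The unrolling depth is fixed at compile time: the configuration header either defines KeccakP1600times4_fullUnrolling or sets KeccakP1600times4_unrolling to 12, 6, 4, 3, 2, or 1. This tree uses full unrolling (see KeccakP-SIMD256-config_avx2.h below); a hypothetical partially-unrolled configuration, following the same convention, would instead look like:

    /* Sketch of an alternative config header, assuming the XKCP convention
     * used here; this tree defines KeccakP1600times4_fullUnrolling instead. */
    #define KeccakP1600times4_implementation_config "AVX2, 12 rounds unrolled"
    #define KeccakP1600times4_unrolling 12
    #define KeccakP1600times4_useAVX2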
+
+#define roundsN(__nrounds) \
+ prepareTheta \
+ i = 24 - (__nrounds); \
+ if ((i&1) != 0) { \
+ thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+ copyStateVariables(A, E) \
+ ++i; \
+ } \
+ for( /* empty */; i<24; i+=2) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ }
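For example, roundsN(11) sets i = 13; since 13 is odd, one single round runs with a copyStateVariables fixup, and rounds 14 through 23 then execute in the two-round A/E ping-pong pattern, for 11 rounds in total.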
+
+/*******************************************************/
+
+void KeccakP1600times4_PermuteAll_24rounds(void *states)
+{
+ V256 *statesAsLanes = (V256 *)states;
+ declareABCDE
+ #ifndef KeccakP1600times4_fullUnrolling
+ unsigned int i;
+ #endif
+
+ copyFromState(A, statesAsLanes)
+ rounds24
+ copyToState(statesAsLanes, A)
+}
+
+void KeccakP1600times4_PermuteAll_12rounds(void *states)
+{
+ V256 *statesAsLanes = (V256 *)states;
+ declareABCDE
+ #ifndef KeccakP1600times4_fullUnrolling
+ unsigned int i;
+ #endif
+
+ copyFromState(A, statesAsLanes)
+ rounds12
+ copyToState(statesAsLanes, A)
+}
+
+size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
+{
+ if (laneCount == 21) {
+#if 0
+ const unsigned char *dataStart = data;
+ const UINT64 *curData0 = (const UINT64 *)data;
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
+
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
+ V256 *stateAsLanes = (V256 *)states;
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
+ #define Xor_In( argIndex ) \
+ XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
+ #define Xor_In4( argIndex ) \
+ lanes0 = LOAD256u( curData0[argIndex]),\
+ lanes1 = LOAD256u( curData1[argIndex]),\
+ lanes2 = LOAD256u( curData2[argIndex]),\
+ lanes3 = LOAD256u( curData3[argIndex]),\
+ INTLEAVE(),\
+ XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
+ XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
+ XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
+ XOReq256( stateAsLanes[argIndex+3], lanes3 )
+ Xor_In4( 0 );
+ Xor_In4( 4 );
+ Xor_In4( 8 );
+ Xor_In4( 12 );
+ Xor_In4( 16 );
+ Xor_In( 20 );
+ #undef Xor_In
+ #undef Xor_In4
+ KeccakP1600times4_PermuteAll_24rounds(states);
+ curData0 += laneOffsetSerial;
+ curData1 += laneOffsetSerial;
+ curData2 += laneOffsetSerial;
+ curData3 += laneOffsetSerial;
+ dataByteLen -= laneOffsetSerial*8;
+ }
+ return (const unsigned char *)curData0 - dataStart;
+#else
+// unsigned int i;
+ const unsigned char *dataStart = data;
+ // correcting cast-align errors
+ // old version: const UINT64 *curData0 = (const UINT64 *)data;
+ const UINT64 *curData0 = (const void *)data;
+ // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
+ const UINT64 *curData1 = (const void *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
+ // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
+ const UINT64 *curData2 = (const void *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
+ // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
+ const UINT64 *curData3 = (const void *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
+ V256 *statesAsLanes = (V256 *)states;
+ declareABCDE
+
+ copyFromState(A, statesAsLanes)
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
+ #define XOR_In( Xxx, argIndex ) \
+ XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
+ XOR_In( Aba, 0 );
+ XOR_In( Abe, 1 );
+ XOR_In( Abi, 2 );
+ XOR_In( Abo, 3 );
+ XOR_In( Abu, 4 );
+ XOR_In( Aga, 5 );
+ XOR_In( Age, 6 );
+ XOR_In( Agi, 7 );
+ XOR_In( Ago, 8 );
+ XOR_In( Agu, 9 );
+ XOR_In( Aka, 10 );
+ XOR_In( Ake, 11 );
+ XOR_In( Aki, 12 );
+ XOR_In( Ako, 13 );
+ XOR_In( Aku, 14 );
+ XOR_In( Ama, 15 );
+ XOR_In( Ame, 16 );
+ XOR_In( Ami, 17 );
+ XOR_In( Amo, 18 );
+ XOR_In( Amu, 19 );
+ XOR_In( Asa, 20 );
+ #undef XOR_In
+ rounds24
+ curData0 += laneOffsetSerial;
+ curData1 += laneOffsetSerial;
+ curData2 += laneOffsetSerial;
+ curData3 += laneOffsetSerial;
+ dataByteLen -= laneOffsetSerial*8;
+ }
+ copyToState(statesAsLanes, A)
+ return (const unsigned char *)curData0 - dataStart;
+#endif
+ }
+ else {
+// unsigned int i;
+ const unsigned char *dataStart = data;
+
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
+ KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
+ KeccakP1600times4_PermuteAll_24rounds(states);
+ data += laneOffsetSerial*8;
+ dataByteLen -= laneOffsetSerial*8;
+ }
+ return data - dataStart;
+ }
+}
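Note that the laneCount == 21 fast path corresponds to the SHAKE128 rate (168 bytes = 21 eight-byte lanes), the hot path for Kyber's public-matrix expansion; all other rates fall through to the generic AddLanesAll-plus-permute loop below it.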
+
+size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
+{
+ if (laneCount == 21) {
+#if 0
+ const unsigned char *dataStart = data;
+ const UINT64 *curData0 = (const UINT64 *)data;
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
+
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
+ V256 *stateAsLanes = states;
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
+ #define Xor_In( argIndex ) \
+ XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
+ #define Xor_In4( argIndex ) \
+ lanes0 = LOAD256u( curData0[argIndex]),\
+ lanes1 = LOAD256u( curData1[argIndex]),\
+ lanes2 = LOAD256u( curData2[argIndex]),\
+ lanes3 = LOAD256u( curData3[argIndex]),\
+ INTLEAVE(),\
+ XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
+ XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
+ XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
+ XOReq256( stateAsLanes[argIndex+3], lanes3 )
+ Xor_In4( 0 );
+ Xor_In4( 4 );
+ Xor_In4( 8 );
+ Xor_In4( 12 );
+ Xor_In4( 16 );
+ Xor_In( 20 );
+ #undef Xor_In
+ #undef Xor_In4
+ KeccakP1600times4_PermuteAll_12rounds(states);
+ curData0 += laneOffsetSerial;
+ curData1 += laneOffsetSerial;
+ curData2 += laneOffsetSerial;
+ curData3 += laneOffsetSerial;
+ dataByteLen -= laneOffsetSerial*8;
+ }
+ return (const unsigned char *)curData0 - dataStart;
+#else
+// unsigned int i;
+ const unsigned char *dataStart = data;
+ // correcting cast-align errors
+ // old version: const UINT64 *curData0 = (const UINT64 *)data;
+ const UINT64 *curData0 = (const void *)data;
+ // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
+ const UINT64 *curData1 = (const void *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
+ // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
+ const UINT64 *curData2 = (const void *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
+ // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
+ const UINT64 *curData3 = (const void *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
+ V256 *statesAsLanes = states;
+ declareABCDE
+
+ copyFromState(A, statesAsLanes)
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
+ #define XOR_In( Xxx, argIndex ) \
+ XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
+ XOR_In( Aba, 0 );
+ XOR_In( Abe, 1 );
+ XOR_In( Abi, 2 );
+ XOR_In( Abo, 3 );
+ XOR_In( Abu, 4 );
+ XOR_In( Aga, 5 );
+ XOR_In( Age, 6 );
+ XOR_In( Agi, 7 );
+ XOR_In( Ago, 8 );
+ XOR_In( Agu, 9 );
+ XOR_In( Aka, 10 );
+ XOR_In( Ake, 11 );
+ XOR_In( Aki, 12 );
+ XOR_In( Ako, 13 );
+ XOR_In( Aku, 14 );
+ XOR_In( Ama, 15 );
+ XOR_In( Ame, 16 );
+ XOR_In( Ami, 17 );
+ XOR_In( Amo, 18 );
+ XOR_In( Amu, 19 );
+ XOR_In( Asa, 20 );
+ #undef XOR_In
+ rounds12
+ curData0 += laneOffsetSerial;
+ curData1 += laneOffsetSerial;
+ curData2 += laneOffsetSerial;
+ curData3 += laneOffsetSerial;
+ dataByteLen -= laneOffsetSerial*8;
+ }
+ copyToState(statesAsLanes, A)
+ return (const unsigned char *)curData0 - dataStart;
+#endif
+ }
+ else {
+// unsigned int i;
+ const unsigned char *dataStart = data;
+
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
+ KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
+ KeccakP1600times4_PermuteAll_12rounds(states);
+ data += laneOffsetSerial*8;
+ dataByteLen -= laneOffsetSerial*8;
+ }
+ return data - dataStart;
+ }
+}
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SnP_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SnP_avx2.h
new file mode 100644
index 0000000000..2640191779
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SnP_avx2.h
@@ -0,0 +1,63 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#pragma once
+
+/** For the documentation, see PlSnP-documentation.h.
+ */
+
+#include "KeccakP-SIMD256-config_avx2.h"
+#include "kyber512r3_params.h"
+#include "kyber512r3_fips202x4_avx2.h"
+
+#define KeccakP1600times4_implementation "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")"
+#define KeccakP1600times4_statesSizeInBytes 800
+#define KeccakP1600times4_statesAlignment 32
+#define KeccakF1600times4_FastLoop_supported
+#define KeccakP1600times4_12rounds_FastLoop_supported
+
+#include <stddef.h>
+
+#define KeccakP1600times4_StaticInitialize()
+#define KeccakP1600times4_InitializeAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_InitializeAll)
+void KeccakP1600times4_InitializeAll(void *states);
+#define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \
+ ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte)
+#define KeccakP1600times4_AddBytes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_AddBytes)
+void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
+#define KeccakP1600times4_AddLanesAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_AddLanesAll)
+void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
+#define KeccakP1600times4_OverwriteBytes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_OverwriteBytes)
+void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
+#define KeccakP1600times4_OverwriteLanesAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_OverwriteLanesAll)
+void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
+#define KeccakP1600times4_OverwriteWithZeroes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_OverwriteWithZeroes)
+void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount);
+#define KeccakP1600times4_PermuteAll_12rounds S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_PermuteAll_12rounds)
+void KeccakP1600times4_PermuteAll_12rounds(void *states);
+#define KeccakP1600times4_PermuteAll_24rounds S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds)
+void KeccakP1600times4_PermuteAll_24rounds(void *states);
+#define KeccakP1600times4_ExtractBytes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_ExtractBytes)
+void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length);
+#define KeccakP1600times4_ExtractLanesAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_ExtractLanesAll)
+void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
+#define KeccakP1600times4_ExtractAndAddBytes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_ExtractAndAddBytes)
+void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
+#define KeccakP1600times4_ExtractAndAddLanesAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_ExtractAndAddLanesAll)
+void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset);
+#define KeccakF1600times4_FastLoop_Absorb S2N_KYBER_512_R3_NAMESPACE(KeccakF1600times4_FastLoop_Absorb)
+size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen);
+#define KeccakP1600times4_12rounds_FastLoop_Absorb S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_12rounds_FastLoop_Absorb)
+size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen);
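The AddByte macro above encodes the interleaved state layout: the 25 lanes of the four Keccak instances are stored as 25 consecutive 256-bit vectors (25 * 32 = 800 bytes, matching KeccakP1600times4_statesSizeInBytes), where vector i holds lane i of instances 0..3. A small sketch of the index arithmetic (the helper name is illustrative, not part of the API):

    /* Byte `offset` of instance `instanceIndex` lives at this flat index;
     * e.g. byte 13 of instance 2 (lane 1, byte 5) maps to
     * 2*8 + (13/8)*4*8 + 13%8 = 16 + 32 + 5 = 53. */
    static unsigned int interleaved_index(unsigned int instanceIndex,
                                          unsigned int offset) {
        return instanceIndex*8 + (offset/8)*4*8 + offset%8;
    }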
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-SIMD256-config_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-SIMD256-config_avx2.h
new file mode 100644
index 0000000000..1c65fe29b4
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-SIMD256-config_avx2.h
@@ -0,0 +1,3 @@
+#define KeccakP1600times4_implementation_config "AVX2, all rounds unrolled"
+#define KeccakP1600times4_fullUnrolling
+#define KeccakP1600times4_useAVX2
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-align_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-align_avx2.h
new file mode 100644
index 0000000000..be08e84af2
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-align_avx2.h
@@ -0,0 +1,31 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#pragma once
+
+/* On Mac OS X and possibly other platforms, ALIGN(x) is already defined in param.h, and -Werror chokes on the redefinition. */
+#ifdef ALIGN
+#undef ALIGN
+#endif
+
+#if defined(__GNUC__)
+#define ALIGN(x) __attribute__ ((aligned(x)))
+#elif defined(_MSC_VER)
+#define ALIGN(x) __declspec(align(x))
+#elif defined(__ARMCC_VERSION)
+#define ALIGN(x) __align(x)
+#else
+#define ALIGN(x)
+#endif
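Typical use of the macro, assuming a 32-byte alignment requirement for 256-bit vector loads:

    ALIGN(32) static unsigned char buf[64];  /* 32-byte-aligned storage */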
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-brg_endian_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-brg_endian_avx2.h
new file mode 100644
index 0000000000..8e8b73cf2a
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-brg_endian_avx2.h
@@ -0,0 +1,139 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The redistribution and use of this software (with or without changes)
+ is allowed without the payment of fees or royalties provided that:
+
+ 1. source code distributions include the above copyright notice, this
+ list of conditions and the following disclaimer;
+
+ 2. binary distributions include the above copyright notice, this list
+ of conditions and the following disclaimer in their documentation;
+
+ 3. the name of the copyright holder is not used to endorse products
+ built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+ Changes for ARM 9/9/2010
+*/
+
+#pragma once
+
+#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
+
+#if 0
+/* Include files where endian defines and byteswap functions may reside */
+#if defined( __sun )
+# include <sys/isa_defs.h>
+#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+# include <sys/endian.h>
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+ defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+# include <machine/endian.h>
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+# if !defined( __MINGW32__ ) && !defined( _AIX )
+# include <endian.h>
+# if !defined( __BEOS__ )
+# include <byteswap.h>
+# endif
+# endif
+#endif
+#endif
+
+/* Now attempt to set the define for platform byte order using any */
+/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */
+/* seem to encompass most endian symbol definitions */
+
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
+# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif defined( BIG_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
+# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif defined( _BIG_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
+# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif defined( __BIG_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
+# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif defined( __BIG_ENDIAN__ )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/* if the platform byte order could not be determined, then try to */
+/* set this define using common machine defines */
+#if !defined(PLATFORM_BYTE_ORDER)
+
+#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \
+ defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \
+ defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \
+ defined( vax ) || defined( vms ) || defined( VMS ) || \
+ defined( __VMS ) || defined( _M_X64 )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \
+ defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \
+ defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \
+ defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \
+ defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \
+ defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \
+ defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+#elif defined(__arm__)
+# ifdef __BIG_ENDIAN
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# else
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif 1 /* **** EDIT HERE IF NECESSARY **** */
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0 /* **** EDIT HERE IF NECESSARY **** */
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order
+#endif
+
+#endif
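Downstream code can then branch on the detected byte order; an illustrative guard:

    #if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    /* lanes can be loaded with plain 64-bit reads */
    #else
    /* fall back to byte-by-byte little-endian loads */
    #endif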
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_align_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_align_avx2.h
new file mode 100644
index 0000000000..79e6d9ec0c
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_align_avx2.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <stdint.h>
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#include <immintrin.h>
+
+#define ALIGNED_UINT8(N) \
+ union { \
+ uint8_t coeffs[N]; \
+ __m256i vec[(N+31)/32]; \
+ }
+
+#define ALIGNED_INT16(N) \
+ union { \
+ int16_t coeffs[N]; \
+ __m256i vec[(N+15)/16]; \
+ }
+#endif
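The unions give the same storage both a scalar and a vector view; the __m256i member forces 32-byte alignment. An illustrative use (the typedef name is hypothetical):

    typedef ALIGNED_INT16(256) coeff_buf;

    coeff_buf p;
    p.coeffs[0] = 1;                          /* scalar access */
    __m256i v = _mm256_load_si256(&p.vec[0]); /* aligned vector access */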
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_basemul_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_basemul_avx2.S
new file mode 100644
index 0000000000..ed2a65be20
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_basemul_avx2.S
@@ -0,0 +1,105 @@
+#include "kyber512r3_consts_avx2.h"
+
+.macro schoolbook off
+vmovdqa _16XQINV*2(%rcx),%ymm0
+vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0
+vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0
+vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1
+vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1
+
+vpmullw %ymm0,%ymm1,%ymm9 # a0.lo
+vpmullw %ymm0,%ymm2,%ymm10 # b0.lo
+vpmullw %ymm0,%ymm3,%ymm11 # a1.lo
+vpmullw %ymm0,%ymm4,%ymm12 # b1.lo
+
+vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0
+vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0
+
+vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi
+vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi
+vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi
+vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi
+
+vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1
+vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1
+
+vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi
+vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi
+vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi
+vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi
+
+vmovdqa %ymm13,(%rsp)
+
+vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo
+vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo
+vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo
+vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo
+
+vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo
+vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo
+vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo
+vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo
+
+vmovdqa _16XQ*2(%rcx),%ymm8
+vpmulhw %ymm8,%ymm13,%ymm13
+vpmulhw %ymm8,%ymm9,%ymm9
+vpmulhw %ymm8,%ymm5,%ymm5
+vpmulhw %ymm8,%ymm10,%ymm10
+vpmulhw %ymm8,%ymm6,%ymm6
+vpmulhw %ymm8,%ymm11,%ymm11
+vpmulhw %ymm8,%ymm7,%ymm7
+vpmulhw %ymm8,%ymm12,%ymm12
+
+vpsubw (%rsp),%ymm13,%ymm13 # -a0c0
+vpsubw %ymm9,%ymm1,%ymm9 # a0d0
+vpsubw %ymm5,%ymm14,%ymm5 # b0c0
+vpsubw %ymm10,%ymm2,%ymm10 # b0d0
+
+vpsubw %ymm6,%ymm15,%ymm6 # a1c1
+vpsubw %ymm11,%ymm3,%ymm11 # a1d1
+vpsubw %ymm7,%ymm0,%ymm7 # b1c1
+vpsubw %ymm12,%ymm4,%ymm12 # b1d1
+
+vmovdqa (%r9),%ymm0
+vmovdqa 32(%r9),%ymm1
+vpmullw %ymm0,%ymm10,%ymm2
+vpmullw %ymm0,%ymm12,%ymm3
+vpmulhw %ymm1,%ymm10,%ymm10
+vpmulhw %ymm1,%ymm12,%ymm12
+vpmulhw %ymm8,%ymm2,%ymm2
+vpmulhw %ymm8,%ymm3,%ymm3
+vpsubw %ymm2,%ymm10,%ymm10 # rb0d0
+vpsubw %ymm3,%ymm12,%ymm12 # rb1d1
+
+vpaddw %ymm5,%ymm9,%ymm9
+vpaddw %ymm7,%ymm11,%ymm11
+vpsubw %ymm13,%ymm10,%ymm13
+vpsubw %ymm12,%ymm6,%ymm6
+
+vmovdqa %ymm13,(64*\off+ 0)*2(%rdi)
+vmovdqa %ymm9,(64*\off+16)*2(%rdi)
+vmovdqa %ymm6,(64*\off+32)*2(%rdi)
+vmovdqa %ymm11,(64*\off+48)*2(%rdi)
+.endm
+
+.text
+.global cdecl(basemul_avx2_asm)
+cdecl(basemul_avx2_asm):
+mov %rsp,%r8
+and $-32,%rsp
+sub $32,%rsp
+
+lea (_ZETAS_EXP+176)*2(%rcx),%r9
+schoolbook 0
+
+add $32*2,%r9
+schoolbook 1
+
+add $192*2,%r9
+schoolbook 2
+
+add $32*2,%r9
+schoolbook 3
+
+mov %r8,%rsp
+ret
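The schoolbook macro vectorizes Kyber's base multiplication of degree-one polynomials modulo X^2 - zeta, 16 coefficient pairs per iteration, with the Montgomery reductions folded into the vpmullw/vpmulhw/vpsubw sequences. A scalar sketch of the per-pair computation, assuming fqmul() is Montgomery multiplication mod q as in the Kyber reference code:

    /* r = a * b in Z_q[X]/(X^2 - zeta); fqmul is assumed from the
     * reference implementation (Montgomery multiplication mod q). */
    static void basemul_pair(int16_t r[2], const int16_t a[2],
                             const int16_t b[2], int16_t zeta) {
        r[0]  = fqmul(a[1], b[1]);
        r[0]  = fqmul(r[0], zeta);
        r[0] += fqmul(a[0], b[0]);
        r[1]  = fqmul(a[0], b[1]);
        r[1] += fqmul(a[1], b[0]);
    }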
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.c
new file mode 100644
index 0000000000..ef0bb87946
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.c
@@ -0,0 +1,104 @@
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_cbd.h"
+
+/*************************************************
+* Name: load32_littleendian
+*
+* Description: load 4 bytes into a 32-bit integer
+* in little-endian order
+*
+* Arguments: - const uint8_t *x: pointer to input byte array
+*
+* Returns 32-bit unsigned integer loaded from x
+**************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4]) {
+ uint32_t r;
+ r = (uint32_t)x[0];
+ r |= (uint32_t)x[1] << 8;
+ r |= (uint32_t)x[2] << 16;
+ r |= (uint32_t)x[3] << 24;
+ return r;
+}
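For example:

    uint8_t x[4] = {0x01, 0x02, 0x03, 0x04};
    uint32_t r = load32_littleendian(x);   /* r == 0x04030201 */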
+
+/*************************************************
+* Name: load24_littleendian
+*
+* Description: load 3 bytes into a 32-bit integer
+* in little-endian order
+* This function is only needed for Kyber-512
+*
+* Arguments: - const uint8_t *x: pointer to input byte array
+*
+* Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+**************************************************/
+static uint32_t load24_littleendian(const uint8_t x[3]) {
+ uint32_t r;
+ r = (uint32_t)x[0];
+ r |= (uint32_t)x[1] << 8;
+ r |= (uint32_t)x[2] << 16;
+ return r;
+}
+
+
+/*************************************************
+* Name: cbd2
+*
+* Description: Given an array of uniformly random bytes, compute
+* polynomial with coefficients distributed according to
+* a centered binomial distribution with parameter eta=2
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const uint8_t *buf: pointer to input byte array
+**************************************************/
+static void cbd2(poly *r, const uint8_t buf[2 * S2N_KYBER_512_R3_N / 4]) {
+ unsigned int i, j;
+
+ for (i = 0; i < S2N_KYBER_512_R3_N / 8; i++) {
+ uint32_t t = load32_littleendian(buf + 4 * i);
+ uint32_t d = t & 0x55555555;
+ d += (t >> 1) & 0x55555555;
+
+ for (j = 0; j < 8; j++) {
+ int16_t a = (d >> (4 * j + 0)) & 0x3;
+ int16_t b = (d >> (4 * j + 2)) & 0x3;
+ r->coeffs[8 * i + j] = a - b;
+ }
+ }
+}
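The bit trick works per coefficient: d holds the 2-bit sums of adjacent bit pairs of t, and each coefficient is the difference of two such sums, giving a value in [-2, 2]. A worked trace of one coefficient:

    /* Suppose the low nibble of t is 0111b (t0=1, t1=1, t2=1, t3=0).
     * d = (t & 0x55555555) + ((t >> 1) & 0x55555555) packs the pair sums:
     *   bits 1:0 of d = t0 + t1 = 2,  bits 3:2 of d = t2 + t3 = 1.
     * Then a = 2, b = 1, and the coefficient is a - b = 1. */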
+
+/*************************************************
+* Name: cbd3
+*
+* Description: Given an array of uniformly random bytes, compute
+* polynomial with coefficients distributed according to
+* a centered binomial distribution with parameter eta=3
+* This function is only needed for Kyber-512
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const uint8_t *buf: pointer to input byte array
+**************************************************/
+static void cbd3(poly *r, const uint8_t buf[3 * S2N_KYBER_512_R3_N / 4]) {
+ unsigned int i, j;
+
+ for (i = 0; i < S2N_KYBER_512_R3_N / 4; i++) {
+ uint32_t t = load24_littleendian(buf + 3 * i);
+ uint32_t d = t & 0x00249249;
+ d += (t >> 1) & 0x00249249;
+ d += (t >> 2) & 0x00249249;
+
+ for (j = 0; j < 4; j++) {
+ int16_t a = (d >> (6 * j + 0)) & 0x7;
+ int16_t b = (d >> (6 * j + 3)) & 0x7;
+ r->coeffs[4 * i + j] = a - b;
+ }
+ }
+}
+
+void cbd_eta1(poly *r, const uint8_t buf[S2N_KYBER_512_R3_ETA1 * S2N_KYBER_512_R3_N / 4]) {
+ cbd3(r, buf);
+}
+
+void cbd_eta2(poly *r, const uint8_t buf[S2N_KYBER_512_R3_ETA2 * S2N_KYBER_512_R3_N / 4]) {
+ cbd2(r, buf);
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.h
new file mode 100644
index 0000000000..631821956c
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_poly.h"
+
+#define cbd_eta1 S2N_KYBER_512_R3_NAMESPACE(cbd_eta1)
+void cbd_eta1(poly *r, const uint8_t buf[S2N_KYBER_512_R3_ETA1 * S2N_KYBER_512_R3_N / 4]);
+
+#define cbd_eta2 S2N_KYBER_512_R3_NAMESPACE(cbd_eta2)
+void cbd_eta2(poly *r, const uint8_t buf[S2N_KYBER_512_R3_ETA2 * S2N_KYBER_512_R3_N / 4]);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.c
new file mode 100644
index 0000000000..a922bd220f
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.c
@@ -0,0 +1,137 @@
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_cbd_avx2.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+/*************************************************
+* Name: cbd2
+*
+* Description: Given an array of uniformly random bytes, compute
+* polynomial with coefficients distributed according to
+* a centered binomial distribution with parameter eta=2
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const __m256i *buf: pointer to aligned input byte array
+**************************************************/
+static void cbd2(poly * restrict r, const __m256i buf[2*S2N_KYBER_512_R3_N/128])
+{
+ unsigned int i;
+ __m256i f0, f1, f2, f3;
+ const __m256i mask55 = _mm256_set1_epi32(0x55555555);
+ const __m256i mask33 = _mm256_set1_epi32(0x33333333);
+ const __m256i mask03 = _mm256_set1_epi32(0x03030303);
+ const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F);
+
+ for(i = 0; i < S2N_KYBER_512_R3_N/64; i++) {
+ f0 = _mm256_load_si256(&buf[i]);
+
+ f1 = _mm256_srli_epi16(f0, 1);
+ f0 = _mm256_and_si256(mask55, f0);
+ f1 = _mm256_and_si256(mask55, f1);
+ f0 = _mm256_add_epi8(f0, f1);
+
+ f1 = _mm256_srli_epi16(f0, 2);
+ f0 = _mm256_and_si256(mask33, f0);
+ f1 = _mm256_and_si256(mask33, f1);
+ f0 = _mm256_add_epi8(f0, mask33);
+ f0 = _mm256_sub_epi8(f0, f1);
+
+ f1 = _mm256_srli_epi16(f0, 4);
+ f0 = _mm256_and_si256(mask0F, f0);
+ f1 = _mm256_and_si256(mask0F, f1);
+ f0 = _mm256_sub_epi8(f0, mask03);
+ f1 = _mm256_sub_epi8(f1, mask03);
+
+ f2 = _mm256_unpacklo_epi8(f0, f1);
+ f3 = _mm256_unpackhi_epi8(f0, f1);
+
+ f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2));
+ f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2,1));
+ f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3));
+ f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3,1));
+
+ _mm256_store_si256(&r->vec[4*i+0], f0);
+ _mm256_store_si256(&r->vec[4*i+1], f2);
+ _mm256_store_si256(&r->vec[4*i+2], f1);
+ _mm256_store_si256(&r->vec[4*i+3], f3);
+ }
+}
+
+/*************************************************
+* Name: cbd3
+*
+* Description: Given an array of uniformly random bytes, compute
+* polynomial with coefficients distributed according to
+* a centered binomial distribution with parameter eta=3
+* This function is only needed for Kyber-512
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const __m256i *buf: pointer to aligned input byte array
+**************************************************/
+static void cbd3(poly * restrict r, const uint8_t buf[3*S2N_KYBER_512_R3_N/4+8])
+{
+ unsigned int i;
+ __m256i f0, f1, f2, f3;
+ const __m256i mask249 = _mm256_set1_epi32(0x249249);
+ const __m256i mask6DB = _mm256_set1_epi32(0x6DB6DB);
+ const __m256i mask07 = _mm256_set1_epi32(7);
+ const __m256i mask70 = _mm256_set1_epi32(7 << 16);
+ const __m256i mask3 = _mm256_set1_epi16(3);
+ const __m256i shufbidx = _mm256_set_epi8(-1,15,14,13,-1,12,11,10,-1, 9, 8, 7,-1, 6, 5, 4,
+ -1,11,10, 9,-1, 8, 7, 6,-1, 5, 4, 3,-1, 2, 1, 0);
+
+ for(i = 0; i < S2N_KYBER_512_R3_N/32; i++) {
+ // correcting cast-align and cast-qual errors
+ // old version: f0 = _mm256_loadu_si256((__m256i *)&buf[24*i]);
+ f0 = _mm256_loadu_si256((const void *)&buf[24*i]);
+ f0 = _mm256_permute4x64_epi64(f0,0x94);
+ f0 = _mm256_shuffle_epi8(f0,shufbidx);
+
+ f1 = _mm256_srli_epi32(f0,1);
+ f2 = _mm256_srli_epi32(f0,2);
+ f0 = _mm256_and_si256(mask249,f0);
+ f1 = _mm256_and_si256(mask249,f1);
+ f2 = _mm256_and_si256(mask249,f2);
+ f0 = _mm256_add_epi32(f0,f1);
+ f0 = _mm256_add_epi32(f0,f2);
+
+ f1 = _mm256_srli_epi32(f0,3);
+ f0 = _mm256_add_epi32(f0,mask6DB);
+ f0 = _mm256_sub_epi32(f0,f1);
+
+ f1 = _mm256_slli_epi32(f0,10);
+ f2 = _mm256_srli_epi32(f0,12);
+ f3 = _mm256_srli_epi32(f0, 2);
+ f0 = _mm256_and_si256(f0,mask07);
+ f1 = _mm256_and_si256(f1,mask70);
+ f2 = _mm256_and_si256(f2,mask07);
+ f3 = _mm256_and_si256(f3,mask70);
+ f0 = _mm256_add_epi16(f0,f1);
+ f1 = _mm256_add_epi16(f2,f3);
+ f0 = _mm256_sub_epi16(f0,mask3);
+ f1 = _mm256_sub_epi16(f1,mask3);
+
+ f2 = _mm256_unpacklo_epi32(f0,f1);
+ f3 = _mm256_unpackhi_epi32(f0,f1);
+
+ f0 = _mm256_permute2x128_si256(f2,f3,0x20);
+ f1 = _mm256_permute2x128_si256(f2,f3,0x31);
+
+ _mm256_store_si256(&r->vec[2*i+0], f0);
+ _mm256_store_si256(&r->vec[2*i+1], f1);
+ }
+}
+
+/* buf is 32 bytes longer than 3*N/4 because the vector loads in cbd3 read past the end */
+void poly_cbd_eta1_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/128+1])
+{
+ // correcting cast-align and cast-qual errors
+ // old version: cbd3(r, (uint8_t *)buf);
+ cbd3(r, (const void *)buf);
+}
+
+void poly_cbd_eta2_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA2*S2N_KYBER_512_R3_N/128])
+{
+ cbd2(r, buf);
+}
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.h
new file mode 100644
index 0000000000..972c71fbf5
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_poly_avx2.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#include <immintrin.h>
+
+#define poly_cbd_eta1_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_cbd_eta1_avx2)
+void poly_cbd_eta1_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/128+1]);
+
+#define poly_cbd_eta2_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_cbd_eta2_avx2)
+void poly_cbd_eta2_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA2*S2N_KYBER_512_R3_N/128]);
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.c
new file mode 100644
index 0000000000..cdc0b817df
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.c
@@ -0,0 +1,122 @@
+#include "kyber512r3_align_avx2.h"
+#include "kyber512r3_consts_avx2.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#define Q S2N_KYBER_512_R3_Q
+#define MONT -1044 // 2^16 mod q
+#define QINV -3327 // q^-1 mod 2^16
+#define V 20159 // floor(2^26/q + 0.5)
+#define FHI 1441 // mont^2/128
+#define FLO -10079 // qinv*FHI
+#define MONTSQHI 1353 // mont^2
+#define MONTSQLO 20553 // qinv*MONTSQHI
+#define MASK 4095
+#define SHIFT 32
+
+const qdata_t qdata = {{
+#define _16XQ 0
+ Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q,
+
+#define _16XQINV 16
+ QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
+ QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
+
+#define _16XV 32
+ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+
+#define _16XFLO 48
+ FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
+ FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
+
+#define _16XFHI 64
+ FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
+ FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
+
+#define _16XMONTSQLO 80
+ MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
+ MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
+ MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
+ MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
+
+#define _16XMONTSQHI 96
+ MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
+ MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
+ MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
+ MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
+
+#define _16XMASK 112
+ MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
+ MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
+
+#define _REVIDXB 128
+ 3854, 3340, 2826, 2312, 1798, 1284, 770, 256,
+ 3854, 3340, 2826, 2312, 1798, 1284, 770, 256,
+
+#define _REVIDXD 144
+ 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0,
+
+#define _ZETAS_EXP 160
+ 31498, 31498, 31498, 31498, -758, -758, -758, -758,
+ 5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397,
+ 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745,
+ 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745,
+ -359, -359, -359, -359, -359, -359, -359, -359,
+ -359, -359, -359, -359, -359, -359, -359, -359,
+ 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525,
+ -12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402,
+ 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493,
+ 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422,
+ -20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758,
+ -3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690,
+ -171, -171, -171, -171, 622, 622, 622, 622,
+ 1577, 1577, 1577, 1577, 182, 182, 182, 182,
+ -5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057,
+ 5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242,
+ 573, 573, -1325, -1325, 264, 264, 383, 383,
+ -829, -829, 1458, 1458, -1602, -1602, -130, -130,
+ -5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080,
+ -12796, 26616, 16064, -12442, 9134, -650, -25986, 27837,
+ 1223, 652, -552, 1015, -1293, 1491, -282, -1544,
+ 516, -8, -320, -666, -1618, -1162, 126, 1469,
+ -335, -11477, -32227, 20494, -27738, 945, -14883, 6182,
+ 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276,
+ -1103, 555, -1251, 1550, 422, 177, -291, 1574,
+ -246, 1159, -777, -602, -1590, -872, 418, -156,
+ 11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493,
+ -32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619,
+ 430, 843, 871, 105, 587, -235, -460, 1653,
+ 778, -147, 1483, 1119, 644, 349, 329, -75,
+ 787, 787, 787, 787, 787, 787, 787, 787,
+ 787, 787, 787, 787, 787, 787, 787, 787,
+ -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517,
+ -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517,
+ 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191,
+ -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694,
+ 287, 287, 287, 287, 287, 287, 287, 287,
+ 202, 202, 202, 202, 202, 202, 202, 202,
+ 10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358,
+ -11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164,
+ 962, 962, 962, 962, -1202, -1202, -1202, -1202,
+ -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468,
+ -28073, -28073, 24313, 24313, -10532, -10532, 8800, 8800,
+ 18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163,
+ -681, -681, 1017, 1017, 732, 732, 608, 608,
+ -1542, -1542, 411, 411, -205, -205, -1571, -1571,
+ 19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249,
+ 13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915,
+ -853, -90, -271, 830, 107, -1421, -247, -951,
+ -398, 961, -1508, -725, 448, -1065, 677, -1275,
+ -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989,
+ 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422,
+ 817, 603, 1322, -1465, -1215, 1218, -874, -1187,
+ -1185, -1278, -1510, -870, -108, 996, 958, 1522,
+ 20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469,
+ -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132,
+ 1097, 610, -1285, 384, -136, -1335, 220, -1659,
+ -1530, 794, -854, 478, -308, 991, -1460, 1628,
+
+#define _16XSHIFT 624
+ SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT,
+ SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT
+}};
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.h
new file mode 100644
index 0000000000..1983ba44d6
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include "kyber512r3_params.h"
+
+#define _16XQ 0
+#define _16XQINV 16
+#define _16XV 32
+#define _16XFLO 48
+#define _16XFHI 64
+#define _16XMONTSQLO 80
+#define _16XMONTSQHI 96
+#define _16XMASK 112
+#define _REVIDXB 128
+#define _REVIDXD 144
+#define _ZETAS_EXP 160
+#define _16XSHIFT 624
+
+/* The C ABI on macOS (and Win32) exports all symbols with a
+ * leading underscore. Without compensating for this, symbols
+ * defined in C cannot be resolved from assembly, and symbols
+ * defined in assembly cannot be resolved from C.
+ *
+ * The cdecl() macro below works around this mismatch.
+ */
+#ifdef __ASSEMBLER__
+#if defined(__WIN32__) || defined(__APPLE__)
+#define decorate(s) _##s
+#define cdecl2(s) decorate(s)
+#define cdecl(s) cdecl2(S2N_KYBER_512_R3_NAMESPACE(##s))
+#else
+#define cdecl(s) S2N_KYBER_512_R3_NAMESPACE(##s)
+#endif
+#endif
+
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#ifndef __ASSEMBLER__
+#include "kyber512r3_align_avx2.h"
+typedef ALIGNED_INT16(640) qdata_t;
+#define qdata S2N_KYBER_512_R3_NAMESPACE(qdata)
+extern const qdata_t qdata;
+#endif
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.c
index 8289a526b3..c5ce0c91f2 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.c
@@ -7,7 +7,9 @@
#include <stddef.h>
#include <stdint.h>
-#include "fips202.h"
+
+#include "kyber512r3_params.h"
+#include "kyber512r3_fips202.h"
#define NROUNDS 24
#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset))))
@@ -24,7 +26,7 @@
static uint64_t load64(const uint8_t *x) {
uint64_t r = 0;
for (size_t i = 0; i < 8; ++i) {
- r |= (uint64_t) x[i] << 8 * i;
+ r |= (uint64_t)x[i] << 8 * i;
}
return r;
@@ -46,18 +48,19 @@ static void store64(uint8_t *x, uint64_t u) {
/* Keccak round constants */
static const uint64_t KeccakF_RoundConstants[NROUNDS] = {
- 0x0000000000000001ULL, 0x0000000000008082ULL,
- 0x800000000000808aULL, 0x8000000080008000ULL,
- 0x000000000000808bULL, 0x0000000080000001ULL,
- 0x8000000080008081ULL, 0x8000000000008009ULL,
- 0x000000000000008aULL, 0x0000000000000088ULL,
- 0x0000000080008009ULL, 0x000000008000000aULL,
- 0x000000008000808bULL, 0x800000000000008bULL,
- 0x8000000000008089ULL, 0x8000000000008003ULL,
- 0x8000000000008002ULL, 0x8000000000000080ULL,
- 0x000000000000800aULL, 0x800000008000000aULL,
- 0x8000000080008081ULL, 0x8000000000008080ULL,
- 0x0000000080000001ULL, 0x8000000080008008ULL};
+ 0x0000000000000001ULL, 0x0000000000008082ULL,
+ 0x800000000000808aULL, 0x8000000080008000ULL,
+ 0x000000000000808bULL, 0x0000000080000001ULL,
+ 0x8000000080008081ULL, 0x8000000000008009ULL,
+ 0x000000000000008aULL, 0x0000000000000088ULL,
+ 0x0000000080008009ULL, 0x000000008000000aULL,
+ 0x000000008000808bULL, 0x800000000000008bULL,
+ 0x8000000000008089ULL, 0x8000000000008003ULL,
+ 0x8000000000008002ULL, 0x8000000000000080ULL,
+ 0x000000000000800aULL, 0x800000008000000aULL,
+ 0x8000000080008081ULL, 0x8000000000008080ULL,
+ 0x0000000080000001ULL, 0x8000000080008008ULL,
+};
/*************************************************
* Name: KeccakF1600_StatePermute
@@ -74,9 +77,8 @@ static void KeccakF1600_StatePermute(uint64_t *state) {
uint64_t Aka, Ake, Aki, Ako, Aku;
uint64_t Ama, Ame, Ami, Amo, Amu;
uint64_t Asa, Ase, Asi, Aso, Asu;
- uint64_t BCa, BCe, BCi, BCo, BCu;
- // copyFromState(A, state)
+ /* copyFromState(A, state) */
Aba = state[0];
Abe = state[1];
Abi = state[2];
@@ -104,6 +106,7 @@ static void KeccakF1600_StatePermute(uint64_t *state) {
Asu = state[24];
for (round = 0; round < NROUNDS; round += 2) {
+ uint64_t BCa, BCe, BCi, BCo, BCu;
uint64_t Da, De, Di, Do, Du;
uint64_t Eba, Ebe, Ebi, Ebo, Ebu;
uint64_t Ega, Ege, Egi, Ego, Egu;
@@ -111,14 +114,14 @@ static void KeccakF1600_StatePermute(uint64_t *state) {
uint64_t Ema, Eme, Emi, Emo, Emu;
uint64_t Esa, Ese, Esi, Eso, Esu;
- // prepareTheta
+ /* prepareTheta */
BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa;
BCe = Abe ^ Age ^ Ake ^ Ame ^ Ase;
BCi = Abi ^ Agi ^ Aki ^ Ami ^ Asi;
BCo = Abo ^ Ago ^ Ako ^ Amo ^ Aso;
BCu = Abu ^ Agu ^ Aku ^ Amu ^ Asu;
- // thetaRhoPiChiIotaPrepareTheta(round , A, E)
+ /* thetaRhoPiChiIotaPrepareTheta(round , A, E) */
Da = BCu ^ ROL(BCe, 1);
De = BCa ^ ROL(BCi, 1);
Di = BCe ^ ROL(BCo, 1);
@@ -206,14 +209,14 @@ static void KeccakF1600_StatePermute(uint64_t *state) {
Eso = BCo ^ ((~BCu) & BCa);
Esu = BCu ^ ((~BCa) & BCe);
- // prepareTheta
+ /* prepareTheta */
BCa = Eba ^ Ega ^ Eka ^ Ema ^ Esa;
BCe = Ebe ^ Ege ^ Eke ^ Eme ^ Ese;
BCi = Ebi ^ Egi ^ Eki ^ Emi ^ Esi;
BCo = Ebo ^ Ego ^ Eko ^ Emo ^ Eso;
BCu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu;
- // thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
+ /* thetaRhoPiChiIotaPrepareTheta(round+1, E, A) */
Da = BCu ^ ROL(BCe, 1);
De = BCa ^ ROL(BCi, 1);
Di = BCe ^ ROL(BCo, 1);
@@ -302,7 +305,7 @@ static void KeccakF1600_StatePermute(uint64_t *state) {
Asu = BCu ^ ((~BCa) & BCe);
}
- // copyToState(state, A)
+ /* copyToState(state, A) */
state[0] = Aba;
state[1] = Abe;
state[2] = Abi;
@@ -400,6 +403,37 @@ static void keccak_squeezeblocks(uint8_t *h, size_t nblocks, uint64_t *s, uint32
}
/*************************************************
+ * Name: shake128_absorb
+ *
+ * Description: Absorb step of the SHAKE128 XOF.
+ * Non-incremental; starts by zeroing the state.
+ *
+ * Arguments: - shake128ctx *state: pointer to (uninitialized) output Keccak state
+ * - const uint8_t *input: pointer to input to be absorbed
+ * into the state
+ * - size_t inlen: length of input in bytes
+ **************************************************/
+void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen) {
+ keccak_absorb(state->ctx, S2N_KYBER_512_R3_SHAKE128_RATE, input, inlen, 0x1F);
+}
+
+/*************************************************
+ * Name: shake128_squeezeblocks
+ *
+ * Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of
+ * SHAKE128_RATE bytes each. Modifies the state. Can be called
+ * multiple times to keep squeezing, i.e., is incremental.
+ *
+ * Arguments: - uint8_t *output: pointer to output blocks
+ * - size_t nblocks: number of blocks to be squeezed
+ * (written to output)
+ * - shake128ctx *state: pointer to input/output Keccak state
+ **************************************************/
+void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state) {
+ keccak_squeezeblocks(output, nblocks, state->ctx, S2N_KYBER_512_R3_SHAKE128_RATE);
+}
+
+/*************************************************
* Name: shake256_absorb
*
* Description: Absorb step of the SHAKE256 XOF.
@@ -410,8 +444,8 @@ static void keccak_squeezeblocks(uint8_t *h, size_t nblocks, uint64_t *s, uint32
* into s
* - size_t inlen: length of input in bytes
**************************************************/
-static void shake256_absorb(shake256_ctx *state, const uint8_t *input, size_t inlen) {
- keccak_absorb(state->ctx, SHAKE256_RATE, input, inlen, 0x1F);
+void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen) {
+ keccak_absorb(state->ctx, S2N_KYBER_512_R3_SHAKE256_RATE, input, inlen, 0x1F);
}
/*************************************************
@@ -426,8 +460,8 @@ static void shake256_absorb(shake256_ctx *state, const uint8_t *input, size_t in
* (written to output)
* - shake256ctx *state: pointer to input/output Keccak state
**************************************************/
-static void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256_ctx *state) {
- keccak_squeezeblocks(output, nblocks, state->ctx, SHAKE256_RATE);
+void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state) {
+ keccak_squeezeblocks(output, nblocks, state->ctx, S2N_KYBER_512_R3_SHAKE256_RATE);
}
/*************************************************
@@ -441,15 +475,15 @@ static void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256_ctx
* - size_t inlen: length of input in bytes
**************************************************/
void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen) {
- size_t nblocks = outlen / SHAKE256_RATE;
- uint8_t t[SHAKE256_RATE];
- shake256_ctx s;
+ size_t nblocks = outlen / S2N_KYBER_512_R3_SHAKE256_RATE;
+ uint8_t t[S2N_KYBER_512_R3_SHAKE256_RATE];
+ shake256ctx s;
shake256_absorb(&s, input, inlen);
shake256_squeezeblocks(output, nblocks, &s);
- output += nblocks * SHAKE256_RATE;
- outlen -= nblocks * SHAKE256_RATE;
+ output += nblocks * S2N_KYBER_512_R3_SHAKE256_RATE;
+ outlen -= nblocks * S2N_KYBER_512_R3_SHAKE256_RATE;
if (outlen) {
shake256_squeezeblocks(t, 1, &s);
@@ -459,3 +493,50 @@ void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen
}
}
+/*************************************************
+ * Name: sha3_256
+ *
+ * Description: SHA3-256 with non-incremental API
+ *
+ * Arguments: - uint8_t *output: pointer to output
+ * - const uint8_t *input: pointer to input
+ * - size_t inlen: length of input in bytes
+ **************************************************/
+void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen) {
+ uint64_t s[25];
+ uint8_t t[S2N_KYBER_512_R3_SHA3_256_RATE];
+
+ /* Absorb input */
+ keccak_absorb(s, S2N_KYBER_512_R3_SHA3_256_RATE, input, inlen, 0x06);
+
+ /* Squeeze output */
+ keccak_squeezeblocks(t, 1, s, S2N_KYBER_512_R3_SHA3_256_RATE);
+
+ for (size_t i = 0; i < 32; i++) {
+ output[i] = t[i];
+ }
+}
+
+/*************************************************
+ * Name: sha3_512
+ *
+ * Description: SHA3-512 with non-incremental API
+ *
+ * Arguments: - uint8_t *output: pointer to output
+ * - const uint8_t *input: pointer to input
+ * - size_t inlen: length of input in bytes
+ **************************************************/
+void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen) {
+ uint64_t s[25];
+ uint8_t t[S2N_KYBER_512_R3_SHA3_512_RATE];
+
+ /* Absorb input */
+ keccak_absorb(s, S2N_KYBER_512_R3_SHA3_512_RATE, input, inlen, 0x06);
+
+ /* Squeeze output */
+ keccak_squeezeblocks(t, 1, s, S2N_KYBER_512_R3_SHA3_512_RATE);
+
+ for (size_t i = 0; i < 64; i++) {
+ output[i] = t[i];
+ }
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.h
new file mode 100644
index 0000000000..1f4f395f72
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include "kyber512r3_params.h"
+
+#define S2N_KYBER_512_R3_SHAKE128_RATE 168
+#define S2N_KYBER_512_R3_SHAKE256_RATE 136
+#define S2N_KYBER_512_R3_SHA3_256_RATE 136
+#define S2N_KYBER_512_R3_SHA3_384_RATE 104
+#define S2N_KYBER_512_R3_SHA3_512_RATE 72
+
+#define S2N_KYBER_512_R3_PQC_SHAKECTX_SIZE 25
+
+/* Context for non-incremental API */
+#define shake128ctx S2N_KYBER_512_R3_NAMESPACE(shake128ctx)
+typedef struct {
+ uint64_t ctx[S2N_KYBER_512_R3_PQC_SHAKECTX_SIZE];
+} shake128ctx;
+
+/* Context for non-incremental API */
+#define shake256ctx S2N_KYBER_512_R3_NAMESPACE(shake256ctx)
+typedef struct {
+ uint64_t ctx[S2N_KYBER_512_R3_PQC_SHAKECTX_SIZE];
+} shake256ctx;
+
+/* Initialize the state and absorb the provided input.
+ *
+ * This function does not support being called multiple times
+ * with the same state.
+ */
+#define shake128_absorb S2N_KYBER_512_R3_NAMESPACE(shake128_absorb)
+void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen);
+/* Squeeze output out of the sponge.
+ *
+ * Supports being called multiple times
+ */
+#define shake128_squeezeblocks S2N_KYBER_512_R3_NAMESPACE(shake128_squeezeblocks)
+void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state);
+
+/* Copy the state. */
+#define shake128_ctx_clone S2N_KYBER_512_R3_NAMESPACE(shake128_ctx_clone)
+void shake128_ctx_clone(shake128ctx *dest, const shake128ctx *src);
+
+/* Initialize the state and absorb the provided input.
+ *
+ * This function does not support being called multiple times
+ * with the same state.
+ */
+#define shake256_absorb S2N_KYBER_512_R3_NAMESPACE(shake256_absorb)
+void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen);
+/* Squeeze output out of the sponge.
+ *
+ * Supports being called multiple times
+ */
+#define shake256_squeezeblocks S2N_KYBER_512_R3_NAMESPACE(shake256_squeezeblocks)
+void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state);
+
+/* One-stop SHAKE256 call */
+#define shake256 S2N_KYBER_512_R3_NAMESPACE(shake256)
+void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen);
+
+#define sha3_256 S2N_KYBER_512_R3_NAMESPACE(sha3_256)
+void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen);
+
+/* One-stop SHA3-512 call */
+#define sha3_512 S2N_KYBER_512_R3_NAMESPACE(sha3_512)
+void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen);
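Illustrative use of the incremental SHAKE128 API declared above (buffer sizes are examples only):

    uint8_t seed[34] = {0};
    uint8_t out[2 * S2N_KYBER_512_R3_SHAKE128_RATE];
    shake128ctx st;

    shake128_absorb(&st, seed, sizeof seed);
    shake128_squeezeblocks(out, 2, &st);   /* two full rate-sized blocks */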
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.c
new file mode 100644
index 0000000000..5f07fb44a3
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.c
@@ -0,0 +1,210 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "kyber512r3_fips202.h"
+#include "kyber512r3_fips202x4_avx2.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#include <immintrin.h>
+
+#define KeccakF1600_StatePermute4x S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds)
+extern void KeccakF1600_StatePermute4x(__m256i *s);
+
+/* Implementation adapted from the CRYSTALS-Kyber repository.
+ * For more details, see: https://github.com/XKCP/XKCP */
+
+static void keccakx4_absorb_once(__m256i s[25],
+ unsigned int r,
+ const uint8_t *in0,
+ const uint8_t *in1,
+ const uint8_t *in2,
+ const uint8_t *in3,
+ size_t inlen,
+ uint8_t p)
+{
+ size_t i;
+ uint64_t pos = 0;
+ __m256i t, idx;
+
+ for(i = 0; i < 25; ++i)
+ s[i] = _mm256_setzero_si256();
+
+ idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0);
+ while(inlen >= r) {
+ for(i = 0; i < r/8; ++i) {
+ t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
+ s[i] = _mm256_xor_si256(s[i], t);
+ pos += 8;
+ }
+ inlen -= r;
+
+ KeccakF1600_StatePermute4x(s);
+ }
+
+ for(i = 0; i < inlen/8; ++i) {
+ t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
+ s[i] = _mm256_xor_si256(s[i], t);
+ pos += 8;
+ }
+ inlen -= 8*i;
+
+ if(inlen) {
+ t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
+ idx = _mm256_set1_epi64x((1ULL << (8*inlen)) - 1);
+ t = _mm256_and_si256(t, idx);
+ s[i] = _mm256_xor_si256(s[i], t);
+ }
+
+ t = _mm256_set1_epi64x((uint64_t)p << 8*inlen);
+ s[i] = _mm256_xor_si256(s[i], t);
+ t = _mm256_set1_epi64x(1ULL << 63);
+ s[r/8 - 1] = _mm256_xor_si256(s[r/8 - 1], t);
+}
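The loop above packs the four input pointers into idx and passes the running
byte offset pos as the gather "base", so each _mm256_i64gather_epi64 fetches
the next 8 bytes of all four streams in one instruction. A scalar model of a
single gather-XOR step, assuming in[0..3] are the four input pointers and
s64[i][lane] aliases lane `lane` of state word s[i] (illustrative only, not
part of the source):

    for (int lane = 0; lane < 4; lane++) {
        uint64_t word;
        memcpy(&word, in[lane] + pos, 8); /* next 8 bytes of stream `lane` */
        s64[i][lane] ^= word;             /* XOR into that lane of word i */
    }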
+
+static void keccakx4_squeezeblocks(uint8_t *out0,
+ uint8_t *out1,
+ uint8_t *out2,
+ uint8_t *out3,
+ size_t nblocks,
+ unsigned int r,
+ __m256i s[25])
+{
+ unsigned int i;
+ __m128d t;
+
+ while(nblocks > 0) {
+ KeccakF1600_StatePermute4x(s);
+ for(i=0; i < r/8; ++i) {
+ t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i]));
+ // correcting cast-align errors
+ // old version: _mm_storel_pd((__attribute__((__may_alias__)) double *)&out0[8*i], t);
+ _mm_storel_pd((__attribute__((__may_alias__)) void *)&out0[8*i], t);
+ // old version: _mm_storeh_pd((__attribute__((__may_alias__)) double *)&out1[8*i], t);
+ _mm_storeh_pd((__attribute__((__may_alias__)) void *)&out1[8*i], t);
+ t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i],1));
+ // old version: _mm_storel_pd((__attribute__((__may_alias__)) double *)&out2[8*i], t);
+ _mm_storel_pd((__attribute__((__may_alias__)) void *)&out2[8*i], t);
+ // old version: _mm_storeh_pd((__attribute__((__may_alias__)) double *)&out3[8*i], t);
+ _mm_storeh_pd((__attribute__((__may_alias__)) void *)&out3[8*i], t);
+ }
+
+ out0 += r;
+ out1 += r;
+ out2 += r;
+ out3 += r;
+ --nblocks;
+ }
+}
+
+void shake128x4_absorb_once(keccakx4_state *state,
+ const uint8_t *in0,
+ const uint8_t *in1,
+ const uint8_t *in2,
+ const uint8_t *in3,
+ size_t inlen)
+{
+ keccakx4_absorb_once(state->s, S2N_KYBER_512_R3_SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
+}
+
+void shake128x4_squeezeblocks(uint8_t *out0,
+ uint8_t *out1,
+ uint8_t *out2,
+ uint8_t *out3,
+ size_t nblocks,
+ keccakx4_state *state)
+{
+ keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, S2N_KYBER_512_R3_SHAKE128_RATE, state->s);
+}
+
+void shake256x4_absorb_once(keccakx4_state *state,
+ const uint8_t *in0,
+ const uint8_t *in1,
+ const uint8_t *in2,
+ const uint8_t *in3,
+ size_t inlen)
+{
+ keccakx4_absorb_once(state->s, S2N_KYBER_512_R3_SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F);
+}
+
+void shake256x4_squeezeblocks(uint8_t *out0,
+ uint8_t *out1,
+ uint8_t *out2,
+ uint8_t *out3,
+ size_t nblocks,
+ keccakx4_state *state)
+{
+ keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, S2N_KYBER_512_R3_SHAKE256_RATE, state->s);
+}
+
+void shake128x4(uint8_t *out0,
+ uint8_t *out1,
+ uint8_t *out2,
+ uint8_t *out3,
+ size_t outlen,
+ const uint8_t *in0,
+ const uint8_t *in1,
+ const uint8_t *in2,
+ const uint8_t *in3,
+ size_t inlen)
+{
+ unsigned int i;
+ size_t nblocks = outlen/S2N_KYBER_512_R3_SHAKE128_RATE;
+ uint8_t t[4][S2N_KYBER_512_R3_SHAKE128_RATE];
+ keccakx4_state state;
+
+ shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen);
+ shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state);
+
+ out0 += nblocks*S2N_KYBER_512_R3_SHAKE128_RATE;
+ out1 += nblocks*S2N_KYBER_512_R3_SHAKE128_RATE;
+ out2 += nblocks*S2N_KYBER_512_R3_SHAKE128_RATE;
+ out3 += nblocks*S2N_KYBER_512_R3_SHAKE128_RATE;
+ outlen -= nblocks*S2N_KYBER_512_R3_SHAKE128_RATE;
+
+ if(outlen) {
+ shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state);
+ for(i = 0; i < outlen; ++i) {
+ out0[i] = t[0][i];
+ out1[i] = t[1][i];
+ out2[i] = t[2][i];
+ out3[i] = t[3][i];
+ }
+ }
+}
+
+void shake256x4(uint8_t *out0,
+ uint8_t *out1,
+ uint8_t *out2,
+ uint8_t *out3,
+ size_t outlen,
+ const uint8_t *in0,
+ const uint8_t *in1,
+ const uint8_t *in2,
+ const uint8_t *in3,
+ size_t inlen)
+{
+ unsigned int i;
+ size_t nblocks = outlen/S2N_KYBER_512_R3_SHAKE256_RATE;
+ uint8_t t[4][S2N_KYBER_512_R3_SHAKE256_RATE];
+ keccakx4_state state;
+
+ shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen);
+ shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state);
+
+ out0 += nblocks*S2N_KYBER_512_R3_SHAKE256_RATE;
+ out1 += nblocks*S2N_KYBER_512_R3_SHAKE256_RATE;
+ out2 += nblocks*S2N_KYBER_512_R3_SHAKE256_RATE;
+ out3 += nblocks*S2N_KYBER_512_R3_SHAKE256_RATE;
+ outlen -= nblocks*S2N_KYBER_512_R3_SHAKE256_RATE;
+
+ if(outlen) {
+ shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state);
+ for(i = 0; i < outlen; ++i) {
+ out0[i] = t[0][i];
+ out1[i] = t[1][i];
+ out2[i] = t[2][i];
+ out3[i] = t[3][i];
+ }
+ }
+}
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.h
new file mode 100644
index 0000000000..8c4896724c
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.h
@@ -0,0 +1,70 @@
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include "kyber512r3_params.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#include <immintrin.h>
+
+#define keccakx4_state S2N_KYBER_512_R3_NAMESPACE(keccakx4_state)
+typedef struct {
+ __m256i s[25];
+} keccakx4_state;
+
+#define shake128x4_absorb_once S2N_KYBER_512_R3_NAMESPACE(shake128x4_absorb_once)
+void shake128x4_absorb_once(keccakx4_state *state,
+ const uint8_t *in0,
+ const uint8_t *in1,
+ const uint8_t *in2,
+ const uint8_t *in3,
+ size_t inlen);
+
+#define shake128x4_squeezeblocks S2N_KYBER_512_R3_NAMESPACE(shake128x4_squeezeblocks)
+void shake128x4_squeezeblocks(uint8_t *out0,
+ uint8_t *out1,
+ uint8_t *out2,
+ uint8_t *out3,
+ size_t nblocks,
+ keccakx4_state *state);
+
+#define shake256x4_absorb_once S2N_KYBER_512_R3_NAMESPACE(shake256x4_absorb_once)
+void shake256x4_absorb_once(keccakx4_state *state,
+ const uint8_t *in0,
+ const uint8_t *in1,
+ const uint8_t *in2,
+ const uint8_t *in3,
+ size_t inlen);
+
+#define shake256x4_squeezeblocks S2N_KYBER_512_R3_NAMESPACE(shake256x4_squeezeblocks)
+void shake256x4_squeezeblocks(uint8_t *out0,
+ uint8_t *out1,
+ uint8_t *out2,
+ uint8_t *out3,
+ size_t nblocks,
+ keccakx4_state *state);
+
+#define shake128x4 S2N_KYBER_512_R3_NAMESPACE(shake128x4)
+void shake128x4(uint8_t *out0,
+ uint8_t *out1,
+ uint8_t *out2,
+ uint8_t *out3,
+ size_t outlen,
+ const uint8_t *in0,
+ const uint8_t *in1,
+ const uint8_t *in2,
+ const uint8_t *in3,
+ size_t inlen);
+
+#define shake256x4 S2N_KYBER_512_R3_NAMESPACE(shake256x4)
+void shake256x4(uint8_t *out0,
+ uint8_t *out1,
+ uint8_t *out2,
+ uint8_t *out3,
+ size_t outlen,
+ const uint8_t *in0,
+ const uint8_t *in1,
+ const uint8_t *in2,
+ const uint8_t *in3,
+ size_t inlen);
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fq_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fq_avx2.S
new file mode 100644
index 0000000000..3492489a67
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fq_avx2.S
@@ -0,0 +1,122 @@
+#include "kyber512r3_consts_avx2.h"
+
+// The small macros (.inc files) are combined with .S files directly
+/*****.include "fq.inc"*****/
+/***************************/
+.macro red16 r,rs=0,x=12
+vpmulhw %ymm1,%ymm\r,%ymm\x
+.if \rs
+vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
+.else
+vpsraw $10,%ymm\x,%ymm\x
+.endif
+vpmullw %ymm0,%ymm\x,%ymm\x
+vpsubw %ymm\x,%ymm\r,%ymm\r
+.endm
+
+.macro csubq r,x=12
+vpsubw %ymm0,%ymm\r,%ymm\r
+vpsraw $15,%ymm\r,%ymm\x
+vpand %ymm0,%ymm\x,%ymm\x
+vpaddw %ymm\x,%ymm\r,%ymm\r
+.endm
+
+.macro caddq r,x=12
+vpsraw $15,%ymm\r,%ymm\x
+vpand %ymm0,%ymm\x,%ymm\x
+vpaddw %ymm\x,%ymm\r,%ymm\r
+.endm
+
+.macro fqmulprecomp al,ah,b,x=12
+vpmullw %ymm\al,%ymm\b,%ymm\x
+vpmulhw %ymm\ah,%ymm\b,%ymm\b
+vpmulhw %ymm0,%ymm\x,%ymm\x
+vpsubw %ymm\x,%ymm\b,%ymm\b
+.endm
+/***************************/
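These macros are 16-lane vector forms of the usual Kyber scalar reductions:
red16 is a Barrett-style reduction mod q, csubq/caddq conditionally subtract
or add q, and fqmulprecomp is a Montgomery multiplication with precomputed
low/high twiddle halves. As a sketch, csubq in scalar C (q = 3329):

    /* Branch-free conditional subtraction of q; scalar model of csubq. */
    static int16_t csubq_model(int16_t r)
    {
        r -= 3329;             /* vpsubw */
        r += (r >> 15) & 3329; /* vpsraw/vpand/vpaddw: add q back iff negative */
        return r;
    }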
+
+.text
+reduce128_avx:
+#load
+vmovdqa (%rdi),%ymm2
+vmovdqa 32(%rdi),%ymm3
+vmovdqa 64(%rdi),%ymm4
+vmovdqa 96(%rdi),%ymm5
+vmovdqa 128(%rdi),%ymm6
+vmovdqa 160(%rdi),%ymm7
+vmovdqa 192(%rdi),%ymm8
+vmovdqa 224(%rdi),%ymm9
+
+red16 2
+red16 3
+red16 4
+red16 5
+red16 6
+red16 7
+red16 8
+red16 9
+
+#store
+vmovdqa %ymm2,(%rdi)
+vmovdqa %ymm3,32(%rdi)
+vmovdqa %ymm4,64(%rdi)
+vmovdqa %ymm5,96(%rdi)
+vmovdqa %ymm6,128(%rdi)
+vmovdqa %ymm7,160(%rdi)
+vmovdqa %ymm8,192(%rdi)
+vmovdqa %ymm9,224(%rdi)
+
+ret
+
+.global cdecl(reduce_avx2_asm)
+cdecl(reduce_avx2_asm):
+#consts
+vmovdqa _16XQ*2(%rsi),%ymm0
+vmovdqa _16XV*2(%rsi),%ymm1
+call reduce128_avx
+add $256,%rdi
+call reduce128_avx
+ret
+
+tomont128_avx:
+#load
+vmovdqa (%rdi),%ymm3
+vmovdqa 32(%rdi),%ymm4
+vmovdqa 64(%rdi),%ymm5
+vmovdqa 96(%rdi),%ymm6
+vmovdqa 128(%rdi),%ymm7
+vmovdqa 160(%rdi),%ymm8
+vmovdqa 192(%rdi),%ymm9
+vmovdqa 224(%rdi),%ymm10
+
+fqmulprecomp 1,2,3,11
+fqmulprecomp 1,2,4,12
+fqmulprecomp 1,2,5,13
+fqmulprecomp 1,2,6,14
+fqmulprecomp 1,2,7,15
+fqmulprecomp 1,2,8,11
+fqmulprecomp 1,2,9,12
+fqmulprecomp 1,2,10,13
+
+#store
+vmovdqa %ymm3,(%rdi)
+vmovdqa %ymm4,32(%rdi)
+vmovdqa %ymm5,64(%rdi)
+vmovdqa %ymm6,96(%rdi)
+vmovdqa %ymm7,128(%rdi)
+vmovdqa %ymm8,160(%rdi)
+vmovdqa %ymm9,192(%rdi)
+vmovdqa %ymm10,224(%rdi)
+
+ret
+
+.global cdecl(tomont_avx2_asm)
+cdecl(tomont_avx2_asm):
+#consts
+vmovdqa _16XQ*2(%rsi),%ymm0
+vmovdqa _16XMONTSQLO*2(%rsi),%ymm1
+vmovdqa _16XMONTSQHI*2(%rsi),%ymm2
+call tomont128_avx
+add $256,%rdi
+call tomont128_avx
+ret
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.c
new file mode 100644
index 0000000000..ace1783448
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.c
@@ -0,0 +1,323 @@
+#include <stddef.h>
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_indcpa.h"
+#include "kyber512r3_poly.h"
+#include "kyber512r3_polyvec.h"
+#include "kyber512r3_fips202.h"
+#include "kyber512r3_symmetric.h"
+#include "pq-crypto/s2n_pq_random.h"
+#include "utils/s2n_safety.h"
+
+/*************************************************
+* Name: pack_pk
+*
+* Description: Serialize the public key as concatenation of the
+* serialized vector of polynomials pk
+* and the public seed used to generate the matrix A.
+*
+* Arguments: uint8_t *r: pointer to the output serialized public key
+* polyvec *pk: pointer to the input public-key polyvec
+* const uint8_t *seed: pointer to the input public seed
+**************************************************/
+static void pack_pk(uint8_t r[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], polyvec *pk, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES]) {
+ polyvec_tobytes(r, pk);
+ for (size_t i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) {
+ r[i + S2N_KYBER_512_R3_POLYVECBYTES] = seed[i];
+ }
+}
+
+/*************************************************
+* Name: unpack_pk
+*
+* Description: De-serialize public key from a byte array;
+* approximate inverse of pack_pk
+*
+* Arguments: - polyvec *pk: pointer to output public-key
+* polynomial vector
+* - uint8_t *seed: pointer to output seed to generate
+* matrix A
+* - const uint8_t *packedpk: pointer to input serialized public key
+**************************************************/
+static void unpack_pk(polyvec *pk, uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], const uint8_t packedpk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES]) {
+ polyvec_frombytes(pk, packedpk);
+ for (size_t i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) {
+ seed[i] = packedpk[i + S2N_KYBER_512_R3_POLYVECBYTES];
+ }
+}
+
+/*************************************************
+* Name: pack_sk
+*
+* Description: Serialize the secret key
+*
+* Arguments: - uint8_t *r: pointer to output serialized secret key
+* - polyvec *sk: pointer to input vector of polynomials (secret key)
+**************************************************/
+static void pack_sk(uint8_t r[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES], polyvec *sk) {
+ polyvec_tobytes(r, sk);
+}
+
+/*************************************************
+* Name: unpack_sk
+*
+* Description: De-serialize the secret key;
+* inverse of pack_sk
+*
+* Arguments: - polyvec *sk: pointer to output vector of
+* polynomials (secret key)
+* - const uint8_t *packedsk: pointer to input serialized secret key
+**************************************************/
+static void unpack_sk(polyvec *sk, const uint8_t packedsk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) {
+ polyvec_frombytes(sk, packedsk);
+}
+
+/*************************************************
+* Name: pack_ciphertext
+*
+* Description: Serialize the ciphertext as concatenation of the
+* compressed and serialized vector of polynomials b
+* and the compressed and serialized polynomial v
+*
+* Arguments: uint8_t *r: pointer to the output serialized ciphertext
+* poly *pk: pointer to the input vector of polynomials b
+* poly *v: pointer to the input polynomial v
+**************************************************/
+static void pack_ciphertext(uint8_t r[S2N_KYBER_512_R3_INDCPA_BYTES], polyvec *b, poly *v) {
+ polyvec_compress(r, b);
+ poly_compress(r + S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES, v);
+}
+
+/*************************************************
+* Name: unpack_ciphertext
+*
+* Description: De-serialize and decompress ciphertext from a byte array;
+* approximate inverse of pack_ciphertext
+*
+* Arguments: - polyvec *b: pointer to the output vector of polynomials b
+* - poly *v: pointer to the output polynomial v
+* - const uint8_t *c: pointer to the input serialized ciphertext
+**************************************************/
+static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES]) {
+ polyvec_decompress(b, c);
+ poly_decompress(v, c + S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES);
+}
+
+/*************************************************
+* Name: rej_uniform
+*
+* Description: Run rejection sampling on uniform random bytes to generate
+* uniform random integers mod q
+*
+* Arguments: - int16_t *r: pointer to output buffer
+* - unsigned int len: requested number of 16-bit integers
+* (uniform mod q)
+* - const uint8_t *buf: pointer to input buffer
+* (assumed to be uniform random bytes)
+* - unsigned int buflen: length of input buffer in bytes
+*
+* Returns number of sampled 16-bit integers (at most len)
+**************************************************/
+static unsigned int rej_uniform(int16_t *r, unsigned int len, const uint8_t *buf, unsigned int buflen) {
+ unsigned int ctr, pos;
+
+ ctr = pos = 0;
+ while (ctr < len && pos + 3 <= buflen) {
+ uint16_t val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
+ uint16_t val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+ pos += 3;
+
+ if (val0 < S2N_KYBER_512_R3_Q) {
+ r[ctr++] = val0;
+ }
+ if (ctr < len && val1 < S2N_KYBER_512_R3_Q) {
+ r[ctr++] = val1;
+ }
+ }
+
+ return ctr;
+}
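Concretely, every 3 input bytes yield two 12-bit candidates. For the bytes
0x12 0x34 0x56: val0 = (0x12 | (0x34 << 8)) & 0xFFF = 0x412 = 1042 and
val1 = ((0x34 >> 4) | (0x56 << 4)) & 0xFFF = 0x563 = 1379; both are below
q = 3329, so both are accepted. A candidate passes with probability
q/4096, about 0.81, which is why gen_matrix below must be able to squeeze
more XOF output and retry.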
+
+/*************************************************
+* Name: gen_matrix
+*
+* Description: Deterministically generate matrix A (or the transpose of A)
+* from a seed. Entries of the matrix are polynomials that look
+* uniformly random. Performs rejection sampling on output of
+*              uniformly random. Performs rejection sampling on the output
+*              of an XOF
+* Arguments:   - polyvec *a:          pointer to output matrix A
+* - const uint8_t *seed: pointer to input seed
+* - int transposed: boolean deciding whether A or A^T
+* is generated
+**************************************************/
+#define XOF_BLOCKBYTES 168
+#define GEN_MATRIX_NBLOCKS ((12*S2N_KYBER_512_R3_N/8*(1 << 12)/S2N_KYBER_512_R3_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
+static void gen_matrix(polyvec *a, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], int transposed) {
+ unsigned int ctr, buflen, off;
+ uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
+ xof_state state;
+
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ for (unsigned int j = 0; j < S2N_KYBER_512_R3_K; j++) {
+ if (transposed) {
+ kyber_shake128_absorb(&state, seed, i, j);
+ } else {
+ kyber_shake128_absorb(&state, seed, j, i);
+ }
+
+ shake128_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state);
+ buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
+ ctr = rej_uniform(a[i].vec[j].coeffs, S2N_KYBER_512_R3_N, buf, buflen);
+
+ while (ctr < S2N_KYBER_512_R3_N) {
+ off = buflen % 3;
+ for (unsigned int k = 0; k < off; k++) {
+ buf[k] = buf[buflen - off + k];
+ }
+ shake128_squeezeblocks(buf + off, 1, &state);
+ buflen = off + XOF_BLOCKBYTES;
+ ctr += rej_uniform(a[i].vec[j].coeffs + ctr, S2N_KYBER_512_R3_N - ctr, buf, buflen);
+ }
+ }
+ }
+}
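With the Kyber512 parameters (N = 256, q = 3329, XOF_BLOCKBYTES = 168) the
macro evaluates, in integer arithmetic, to

    GEN_MATRIX_NBLOCKS = (12*256/8 * 4096/3329 + 168)/168 = (472 + 168)/168 = 3,

so 504 bytes (336 candidate 12-bit values) are squeezed up front. At an
acceptance rate of about 0.81 that is expected to fill all 256 coefficients
of a matrix entry; the inner while loop tops up one block at a time on the
rare shortfall, recycling the 0-2 leftover bytes via `off`.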
+
+/*************************************************
+* Name: indcpa_keypair
+*
+* Description: Generates public and private key for the CPA-secure
+* public-key encryption scheme underlying Kyber
+*
+* Arguments: - uint8_t *pk: pointer to output public key
+* (of length S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES bytes)
+* - uint8_t *sk: pointer to output private key
+* (of length S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES bytes)
+*
+* Returns: 0 on success
+* !0 on failure
+**************************************************/
+int indcpa_keypair(uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) {
+ uint8_t buf[2 * S2N_KYBER_512_R3_SYMBYTES];
+ const uint8_t *publicseed = buf;
+ const uint8_t *noiseseed = buf + S2N_KYBER_512_R3_SYMBYTES;
+ uint8_t nonce = 0;
+ polyvec a[S2N_KYBER_512_R3_K], e, pkpv, skpv;
+
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, S2N_KYBER_512_R3_SYMBYTES));
+ sha3_512(buf, buf, S2N_KYBER_512_R3_SYMBYTES);
+
+ gen_matrix(a, publicseed, 0);
+
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++);
+ }
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++);
+ }
+
+ polyvec_ntt(&skpv);
+ polyvec_ntt(&e);
+
+    /* matrix-vector multiplication */
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
+ poly_tomont(&pkpv.vec[i]);
+ }
+
+ polyvec_add(&pkpv, &pkpv, &e);
+ polyvec_reduce(&pkpv);
+
+ pack_sk(sk, &skpv);
+ pack_pk(pk, &pkpv, publicseed);
+
+ return 0;
+}
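In spec terms this computes t = A*s + e, carried out entirely in the NTT
domain (polyvec_pointwise_acc_montgomery is the NTT-domain row-times-vector
product); the packed outputs are pk = Encode(t) || publicseed and
sk = Encode(s), both still in NTT form.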
+
+/*************************************************
+* Name: indcpa_enc
+*
+* Description: Encryption function of the CPA-secure
+* public-key encryption scheme underlying Kyber.
+*
+* Arguments: - uint8_t *c: pointer to output ciphertext
+* (of length S2N_KYBER_512_R3_INDCPA_BYTES bytes)
+* - const uint8_t *m: pointer to input message
+* (of length S2N_KYBER_512_R3_INDCPA_MSGBYTES bytes)
+* - const uint8_t *pk: pointer to input public key
+* (of length S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES)
+* - const uint8_t *coins: pointer to input random coins
+* used as seed (of length S2N_KYBER_512_R3_SYMBYTES)
+* to deterministically generate all
+* randomness
+**************************************************/
+void indcpa_enc(uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], const uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES],
+ const uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], const uint8_t coins[S2N_KYBER_512_R3_SYMBYTES]) {
+ uint8_t seed[S2N_KYBER_512_R3_SYMBYTES];
+ uint8_t nonce = 0;
+ polyvec sp, pkpv, ep, at[S2N_KYBER_512_R3_K], bp;
+ poly v, k, epp;
+
+ unpack_pk(&pkpv, seed, pk);
+ poly_frommsg(&k, m);
+ gen_matrix(at, seed, 1);
+
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ poly_getnoise_eta1(sp.vec + i, coins, nonce++);
+ }
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ poly_getnoise_eta2(ep.vec + i, coins, nonce++);
+ }
+ poly_getnoise_eta2(&epp, coins, nonce++);
+
+ polyvec_ntt(&sp);
+
+ /* matrix-vector multiplication */
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp);
+ }
+
+ polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp);
+
+ polyvec_invntt_tomont(&bp);
+ poly_invntt_tomont(&v);
+
+ polyvec_add(&bp, &bp, &ep);
+ poly_add(&v, &v, &epp);
+ poly_add(&v, &v, &k);
+ polyvec_reduce(&bp);
+ poly_reduce(&v);
+
+ pack_ciphertext(c, &bp, &v);
+}
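This matches the spec-level CPA encryption: u = InvNTT(A^T * r) + e1 and
v = InvNTT(t^T * r) + e2 + Decompress_1(m), with r sampled from coins and all
products taken in the NTT domain; the ciphertext is Compress(u) || Compress(v),
and bp above plays the role of u.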
+
+/*************************************************
+* Name: indcpa_dec
+*
+* Description: Decryption function of the CPA-secure
+* public-key encryption scheme underlying Kyber.
+*
+* Arguments: - uint8_t *m: pointer to output decrypted message
+* (of length S2N_KYBER_512_R3_INDCPA_MSGBYTES)
+* - const uint8_t *c: pointer to input ciphertext
+* (of length S2N_KYBER_512_R3_INDCPA_BYTES)
+* - const uint8_t *sk: pointer to input secret key
+* (of length S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES)
+**************************************************/
+void indcpa_dec(uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES],
+ const uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) {
+ polyvec bp, skpv;
+ poly v, mp;
+
+ unpack_ciphertext(&bp, &v, c);
+ unpack_sk(&skpv, sk);
+
+ polyvec_ntt(&bp);
+ polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp);
+ poly_invntt_tomont(&mp);
+
+ poly_sub(&mp, &v, &mp);
+ poly_reduce(&mp);
+
+ poly_tomsg(m, &mp);
+}
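Spec-level, decryption computes m = Compress_1(v - InvNTT(s^T * NTT(u))): the
secret key cancels the t^T*r contribution up to the accumulated noise, and
poly_tomsg rounds each coefficient back to a single message bit.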
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.h
new file mode 100644
index 0000000000..f8b9e401a0
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <stdint.h>
+#include "kyber512r3_params.h"
+
+#define indcpa_keypair S2N_KYBER_512_R3_NAMESPACE(indcpa_keypair)
+int indcpa_keypair(uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]);
+
+#define indcpa_enc S2N_KYBER_512_R3_NAMESPACE(indcpa_enc)
+void indcpa_enc(uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], const uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES],
+ const uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], const uint8_t coins[S2N_KYBER_512_R3_SYMBYTES]);
+
+#define indcpa_dec S2N_KYBER_512_R3_NAMESPACE(indcpa_dec)
+void indcpa_dec(uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES],
+ const uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.c
new file mode 100644
index 0000000000..91e7513881
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.c
@@ -0,0 +1,363 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "kyber512r3_align_avx2.h"
+#include "kyber512r3_params.h"
+#include "kyber512r3_indcpa_avx2.h"
+#include "kyber512r3_polyvec_avx2.h"
+#include "kyber512r3_poly_avx2.h"
+#include "kyber512r3_rejsample_avx2.h"
+#include "kyber512r3_fips202.h"
+#include "kyber512r3_fips202x4_avx2.h"
+#include "pq-crypto/s2n_pq_random.h"
+#include "utils/s2n_safety.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#include <immintrin.h>
+
+/*************************************************
+* Name: pack_pk
+*
+* Description: Serialize the public key as concatenation of the
+* serialized vector of polynomials pk and the
+* public seed used to generate the matrix A.
+* The polynomial coefficients in pk are assumed to
+*              lie in the interval [0,q], i.e. pk must be reduced
+* by polyvec_reduce_avx2().
+*
+* Arguments: uint8_t *r: pointer to the output serialized public key
+* polyvec *pk: pointer to the input public-key polyvec
+* const uint8_t *seed: pointer to the input public seed
+**************************************************/
+static void pack_pk(uint8_t r[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES],
+ polyvec *pk,
+ const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES])
+{
+ polyvec_tobytes_avx2(r, pk);
+ memcpy(r+S2N_KYBER_512_R3_POLYVECBYTES, seed, S2N_KYBER_512_R3_SYMBYTES);
+}
+
+/*************************************************
+* Name: unpack_pk
+*
+* Description: De-serialize public key from a byte array;
+* approximate inverse of pack_pk
+*
+* Arguments: - polyvec *pk: pointer to output public-key polynomial vector
+* - uint8_t *seed: pointer to output seed to generate matrix A
+* - const uint8_t *packedpk: pointer to input serialized public key
+**************************************************/
+static void unpack_pk(polyvec *pk,
+ uint8_t seed[S2N_KYBER_512_R3_SYMBYTES],
+ const uint8_t packedpk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES])
+{
+ polyvec_frombytes_avx2(pk, packedpk);
+ memcpy(seed, packedpk+S2N_KYBER_512_R3_POLYVECBYTES, S2N_KYBER_512_R3_SYMBYTES);
+}
+
+/*************************************************
+* Name: pack_sk
+*
+* Description: Serialize the secret key.
+* The polynomial coefficients in sk are assumed to
+*              lie in the interval [0,q], i.e. sk must be reduced
+* by polyvec_reduce_avx2().
+*
+* Arguments: - uint8_t *r: pointer to output serialized secret key
+* - polyvec *sk: pointer to input vector of polynomials (secret key)
+**************************************************/
+static void pack_sk(uint8_t r[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES], polyvec *sk)
+{
+ polyvec_tobytes_avx2(r, sk);
+}
+
+/*************************************************
+* Name: unpack_sk
+*
+* Description: De-serialize the secret key; inverse of pack_sk
+*
+* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key)
+* - const uint8_t *packedsk: pointer to input serialized secret key
+**************************************************/
+static void unpack_sk(polyvec *sk, const uint8_t packedsk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES])
+{
+ polyvec_frombytes_avx2(sk, packedsk);
+}
+
+/*************************************************
+* Name: pack_ciphertext
+*
+* Description: Serialize the ciphertext as concatenation of the
+* compressed and serialized vector of polynomials b
+* and the compressed and serialized polynomial v.
+* The polynomial coefficients in b and v are assumed to
+*              lie in the interval [0,q], i.e. b and v must be reduced
+* by polyvec_reduce_avx2() and poly_reduce_avx2(), respectively.
+*
+* Arguments: uint8_t *r: pointer to the output serialized ciphertext
+* poly *pk: pointer to the input vector of polynomials b
+* poly *v: pointer to the input polynomial v
+**************************************************/
+static void pack_ciphertext(uint8_t r[S2N_KYBER_512_R3_INDCPA_BYTES], polyvec *b, poly *v)
+{
+ polyvec_compress_avx2(r, b);
+ poly_compress_avx2(r+S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES, v);
+}
+
+/*************************************************
+* Name: unpack_ciphertext
+*
+* Description: De-serialize and decompress ciphertext from a byte array;
+* approximate inverse of pack_ciphertext
+*
+* Arguments: - polyvec *b: pointer to the output vector of polynomials b
+* - poly *v: pointer to the output polynomial v
+* - const uint8_t *c: pointer to the input serialized ciphertext
+**************************************************/
+static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES])
+{
+ polyvec_decompress_avx2(b, c);
+ poly_decompress_avx2(v, c+S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES);
+}
+
+/*************************************************
+* Name: rej_uniform
+*
+* Description: Run rejection sampling on uniform random bytes to generate
+* uniform random integers mod q
+*
+* Arguments: - int16_t *r: pointer to output array
+* - unsigned int len: requested number of 16-bit integers (uniform mod q)
+* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes)
+* - unsigned int buflen: length of input buffer in bytes
+*
+* Returns number of sampled 16-bit integers (at most len)
+**************************************************/
+static unsigned int rej_uniform(int16_t *r,
+ unsigned int len,
+ const uint8_t *buf,
+ unsigned int buflen)
+{
+ unsigned int ctr, pos;
+ uint16_t val0, val1;
+
+ ctr = pos = 0;
+ while(ctr < len && pos <= buflen - 3) { // buflen is always at least 3
+ val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
+ val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4)) & 0xFFF;
+ pos += 3;
+
+ if(val0 < S2N_KYBER_512_R3_Q)
+ r[ctr++] = val0;
+ if(ctr < len && val1 < S2N_KYBER_512_R3_Q)
+ r[ctr++] = val1;
+ }
+
+ return ctr;
+}
+
+#define gen_a(A,B) gen_matrix_avx2(A,B,0)
+#define gen_at(A,B) gen_matrix_avx2(A,B,1)
+
+/*************************************************
+* Name: gen_matrix_avx2
+*
+* Description: Deterministically generate matrix A (or the transpose of A)
+* from a seed. Entries of the matrix are polynomials that look
+*              uniformly random. Performs rejection sampling on the output
+*              of an XOF
+*
+* Arguments:   - polyvec *a:          pointer to output matrix A
+* - const uint8_t *seed: pointer to input seed
+* - int transposed: boolean deciding whether A or A^T is generated
+**************************************************/
+void gen_matrix_avx2(polyvec *a, const uint8_t seed[32], int transposed)
+{
+ unsigned int ctr0, ctr1, ctr2, ctr3;
+ ALIGNED_UINT8(S2N_KYBER_512_R3_REJ_UNIFORM_AVX_NBLOCKS*S2N_KYBER_512_R3_SHAKE128_RATE) buf[4];
+ __m256i f;
+ keccakx4_state state;
+
+ // correcting cast-align and cast-qual errors
+ // old version: f = _mm256_loadu_si256((__m256i *)seed);
+ f = _mm256_loadu_si256((const void *)seed);
+ _mm256_store_si256(buf[0].vec, f);
+ _mm256_store_si256(buf[1].vec, f);
+ _mm256_store_si256(buf[2].vec, f);
+ _mm256_store_si256(buf[3].vec, f);
+
+ if(transposed) {
+ buf[0].coeffs[32] = 0;
+ buf[0].coeffs[33] = 0;
+ buf[1].coeffs[32] = 0;
+ buf[1].coeffs[33] = 1;
+ buf[2].coeffs[32] = 1;
+ buf[2].coeffs[33] = 0;
+ buf[3].coeffs[32] = 1;
+ buf[3].coeffs[33] = 1;
+ }
+ else {
+ buf[0].coeffs[32] = 0;
+ buf[0].coeffs[33] = 0;
+ buf[1].coeffs[32] = 1;
+ buf[1].coeffs[33] = 0;
+ buf[2].coeffs[32] = 0;
+ buf[2].coeffs[33] = 1;
+ buf[3].coeffs[32] = 1;
+ buf[3].coeffs[33] = 1;
+ }
+
+ shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
+ shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, S2N_KYBER_512_R3_REJ_UNIFORM_AVX_NBLOCKS, &state);
+
+ ctr0 = rej_uniform_avx2(a[0].vec[0].coeffs, buf[0].coeffs);
+ ctr1 = rej_uniform_avx2(a[0].vec[1].coeffs, buf[1].coeffs);
+ ctr2 = rej_uniform_avx2(a[1].vec[0].coeffs, buf[2].coeffs);
+ ctr3 = rej_uniform_avx2(a[1].vec[1].coeffs, buf[3].coeffs);
+
+ while(ctr0 < S2N_KYBER_512_R3_N || ctr1 < S2N_KYBER_512_R3_N || ctr2 < S2N_KYBER_512_R3_N || ctr3 < S2N_KYBER_512_R3_N) {
+ shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);
+
+ ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, S2N_KYBER_512_R3_N - ctr0, buf[0].coeffs, S2N_KYBER_512_R3_SHAKE128_RATE);
+ ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, S2N_KYBER_512_R3_N - ctr1, buf[1].coeffs, S2N_KYBER_512_R3_SHAKE128_RATE);
+ ctr2 += rej_uniform(a[1].vec[0].coeffs + ctr2, S2N_KYBER_512_R3_N - ctr2, buf[2].coeffs, S2N_KYBER_512_R3_SHAKE128_RATE);
+ ctr3 += rej_uniform(a[1].vec[1].coeffs + ctr3, S2N_KYBER_512_R3_N - ctr3, buf[3].coeffs, S2N_KYBER_512_R3_SHAKE128_RATE);
+ }
+
+ poly_nttunpack_avx2(&a[0].vec[0]);
+ poly_nttunpack_avx2(&a[0].vec[1]);
+ poly_nttunpack_avx2(&a[1].vec[0]);
+ poly_nttunpack_avx2(&a[1].vec[1]);
+}
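For K = 2 the four parallel XOF lanes cover the whole 2x2 matrix in a single
absorb/squeeze pass. From the ctr0..ctr3 assignments above the mapping is

    lane 0 -> a[0].vec[0]    lane 1 -> a[0].vec[1]
    lane 2 -> a[1].vec[0]    lane 3 -> a[1].vec[1]

with bytes 32 and 33 of each lane's buffer carrying the two domain-separation
indices appended to the 32-byte seed (their order is swapped in the transposed
case), for 34 absorbed bytes per lane.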
+
+/*************************************************
+* Name: indcpa_keypair_avx2
+*
+* Description: Generates public and private key for the CPA-secure
+* public-key encryption scheme underlying Kyber
+*
+* Arguments: - uint8_t *pk: pointer to output public key
+* (of length S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES bytes)
+* - uint8_t *sk: pointer to output private key
+*                            (of length S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES bytes)
+**************************************************/
+int indcpa_keypair_avx2(uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES])
+{
+ unsigned int i;
+ uint8_t buf[2*S2N_KYBER_512_R3_SYMBYTES];
+ const uint8_t *publicseed = buf;
+ const uint8_t *noiseseed = buf + S2N_KYBER_512_R3_SYMBYTES;
+ polyvec a[S2N_KYBER_512_R3_K], e, pkpv, skpv;
+
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, S2N_KYBER_512_R3_SYMBYTES));
+ sha3_512(buf, buf, S2N_KYBER_512_R3_SYMBYTES);
+
+ gen_a(a, publicseed);
+
+ poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, e.vec+0, e.vec+1, noiseseed, 0, 1, 2, 3);
+
+ polyvec_ntt_avx2(&skpv);
+ polyvec_reduce_avx2(&skpv);
+ polyvec_ntt_avx2(&e);
+
+ // matrix-vector multiplication
+ for(i=0;i<S2N_KYBER_512_R3_K;i++) {
+ polyvec_basemul_acc_montgomery_avx2(&pkpv.vec[i], &a[i], &skpv);
+ poly_tomont_avx2(&pkpv.vec[i]);
+ }
+
+ polyvec_add_avx2(&pkpv, &pkpv, &e);
+ polyvec_reduce_avx2(&pkpv);
+
+ pack_sk(sk, &skpv);
+ pack_pk(pk, &pkpv, publicseed);
+
+ return 0;
+}
+
+/*************************************************
+* Name: indcpa_enc_avx2
+*
+* Description: Encryption function of the CPA-secure
+* public-key encryption scheme underlying Kyber.
+*
+* Arguments: - uint8_t *c: pointer to output ciphertext
+* (of length S2N_KYBER_512_R3_INDCPA_BYTES bytes)
+* - const uint8_t *m: pointer to input message
+* (of length S2N_KYBER_512_R3_INDCPA_MSGBYTES bytes)
+* - const uint8_t *pk: pointer to input public key
+* (of length S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES)
+* - const uint8_t *coins: pointer to input random coins used as seed
+* (of length S2N_KYBER_512_R3_SYMBYTES) to deterministically
+* generate all randomness
+**************************************************/
+void indcpa_enc_avx2(uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES],
+ const uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES],
+ const uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[S2N_KYBER_512_R3_SYMBYTES])
+{
+ unsigned int i;
+ uint8_t seed[S2N_KYBER_512_R3_SYMBYTES];
+ polyvec sp, pkpv, ep, at[S2N_KYBER_512_R3_K], b;
+ poly v, k, epp;
+
+ unpack_pk(&pkpv, seed, pk);
+ poly_frommsg_avx2(&k, m);
+ gen_at(at, seed);
+
+ poly_getnoise_eta1122_4x(sp.vec+0, sp.vec+1, ep.vec+0, ep.vec+1, coins, 0, 1, 2, 3);
+ poly_getnoise_eta2_avx2(&epp, coins, 4);
+
+ polyvec_ntt_avx2(&sp);
+
+ // matrix-vector multiplication
+ for(i=0;i<S2N_KYBER_512_R3_K;i++)
+ polyvec_basemul_acc_montgomery_avx2(&b.vec[i], &at[i], &sp);
+ polyvec_basemul_acc_montgomery_avx2(&v, &pkpv, &sp);
+
+ polyvec_invntt_tomont_avx2(&b);
+ poly_invntt_tomont_avx2(&v);
+
+ polyvec_add_avx2(&b, &b, &ep);
+ poly_add_avx2(&v, &v, &epp);
+ poly_add_avx2(&v, &v, &k);
+ polyvec_reduce_avx2(&b);
+ poly_reduce_avx2(&v);
+
+ pack_ciphertext(c, &b, &v);
+}
+
+/*************************************************
+* Name: indcpa_dec_avx2
+*
+* Description: Decryption function of the CPA-secure
+* public-key encryption scheme underlying Kyber.
+*
+* Arguments: - uint8_t *m: pointer to output decrypted message
+* (of length S2N_KYBER_512_R3_INDCPA_MSGBYTES)
+* - const uint8_t *c: pointer to input ciphertext
+* (of length S2N_KYBER_512_R3_INDCPA_BYTES)
+* - const uint8_t *sk: pointer to input secret key
+* (of length S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES)
+**************************************************/
+void indcpa_dec_avx2(uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES],
+ const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES],
+ const uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES])
+{
+ polyvec b, skpv;
+ poly v, mp;
+
+ unpack_ciphertext(&b, &v, c);
+ unpack_sk(&skpv, sk);
+
+ polyvec_ntt_avx2(&b);
+ polyvec_basemul_acc_montgomery_avx2(&mp, &skpv, &b);
+ poly_invntt_tomont_avx2(&mp);
+
+ poly_sub_avx2(&mp, &v, &mp);
+ poly_reduce_avx2(&mp);
+
+ poly_tomsg_avx2(m, &mp);
+}
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.h
new file mode 100644
index 0000000000..127e5bc4f6
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_polyvec_avx2.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#define gen_matrix_avx2 S2N_KYBER_512_R3_NAMESPACE(gen_matrix_avx2)
+void gen_matrix_avx2(polyvec *a, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], int transposed);
+
+#define indcpa_keypair_avx2 S2N_KYBER_512_R3_NAMESPACE(indcpa_keypair_avx2)
+int indcpa_keypair_avx2(uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]);
+
+#define indcpa_enc_avx2 S2N_KYBER_512_R3_NAMESPACE(indcpa_enc_avx2)
+void indcpa_enc_avx2(uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES],
+ const uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES],
+ const uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[S2N_KYBER_512_R3_SYMBYTES]);
+
+#define indcpa_dec_avx2 S2N_KYBER_512_R3_NAMESPACE(indcpa_dec_avx2)
+void indcpa_dec_avx2(uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES],
+ const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES],
+ const uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]);
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_invntt_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_invntt_avx2.S
new file mode 100644
index 0000000000..8f131668ff
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_invntt_avx2.S
@@ -0,0 +1,255 @@
+#include "kyber512r3_consts_avx2.h"
+
+// The small macros (.inc files) are combined with .S files directly
+/*****.include "shuffle.inc"*****/
+/********************************/
+.macro shuffle8 r0,r1,r2,r3
+vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
+vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
+.endm
+
+.macro shuffle4 r0,r1,r2,r3
+vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
+vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
+.endm
+
+.macro shuffle2 r0,r1,r2,r3
+#vpsllq $32,%ymm\r1,%ymm\r2
+vmovsldup %ymm\r1,%ymm\r2
+vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
+vpsrlq $32,%ymm\r0,%ymm\r0
+#vmovshdup %ymm\r0,%ymm\r0
+vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
+.endm
+
+.macro shuffle1 r0,r1,r2,r3
+vpslld $16,%ymm\r1,%ymm\r2
+vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
+vpsrld $16,%ymm\r0,%ymm\r0
+vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
+.endm
+/********************************/
+
+/*****.include "fq.inc"*****/
+/***************************/
+.macro red16 r,rs=0,x=12
+vpmulhw %ymm1,%ymm\r,%ymm\x
+.if \rs
+vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
+.else
+vpsraw $10,%ymm\x,%ymm\x
+.endif
+vpmullw %ymm0,%ymm\x,%ymm\x
+vpsubw %ymm\x,%ymm\r,%ymm\r
+.endm
+
+.macro csubq r,x=12
+vpsubw %ymm0,%ymm\r,%ymm\r
+vpsraw $15,%ymm\r,%ymm\x
+vpand %ymm0,%ymm\x,%ymm\x
+vpaddw %ymm\x,%ymm\r,%ymm\r
+.endm
+
+.macro caddq r,x=12
+vpsraw $15,%ymm\r,%ymm\x
+vpand %ymm0,%ymm\x,%ymm\x
+vpaddw %ymm\x,%ymm\r,%ymm\r
+.endm
+
+.macro fqmulprecomp al,ah,b,x=12
+vpmullw %ymm\al,%ymm\b,%ymm\x
+vpmulhw %ymm\ah,%ymm\b,%ymm\b
+vpmulhw %ymm0,%ymm\x,%ymm\x
+vpsubw %ymm\x,%ymm\b,%ymm\b
+.endm
+/***************************/
+
+.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
+vpsubw %ymm\rl0,%ymm\rh0,%ymm12
+vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0
+vpsubw %ymm\rl1,%ymm\rh1,%ymm13
+
+vpmullw %ymm\zl0,%ymm12,%ymm\rh0
+vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1
+vpsubw %ymm\rl2,%ymm\rh2,%ymm14
+
+vpmullw %ymm\zl0,%ymm13,%ymm\rh1
+vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2
+vpsubw %ymm\rl3,%ymm\rh3,%ymm15
+
+vpmullw %ymm\zl1,%ymm14,%ymm\rh2
+vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3
+vpmullw %ymm\zl1,%ymm15,%ymm\rh3
+
+vpmulhw %ymm\zh0,%ymm12,%ymm12
+vpmulhw %ymm\zh0,%ymm13,%ymm13
+
+vpmulhw %ymm\zh1,%ymm14,%ymm14
+vpmulhw %ymm\zh1,%ymm15,%ymm15
+
+vpmulhw %ymm0,%ymm\rh0,%ymm\rh0
+
+vpmulhw %ymm0,%ymm\rh1,%ymm\rh1
+
+vpmulhw %ymm0,%ymm\rh2,%ymm\rh2
+vpmulhw %ymm0,%ymm\rh3,%ymm\rh3
+
+#
+
+#
+
+vpsubw %ymm\rh0,%ymm12,%ymm\rh0
+
+vpsubw %ymm\rh1,%ymm13,%ymm\rh1
+
+vpsubw %ymm\rh2,%ymm14,%ymm\rh2
+vpsubw %ymm\rh3,%ymm15,%ymm\rh3
+.endm
+
+.macro intt_levels0t5 off
+/* level 0 */
+vmovdqa _16XFLO*2(%rsi),%ymm2
+vmovdqa _16XFHI*2(%rsi),%ymm3
+
+vmovdqa (128*\off+ 0)*2(%rdi),%ymm4
+vmovdqa (128*\off+ 32)*2(%rdi),%ymm6
+vmovdqa (128*\off+ 16)*2(%rdi),%ymm5
+vmovdqa (128*\off+ 48)*2(%rdi),%ymm7
+
+fqmulprecomp 2,3,4
+fqmulprecomp 2,3,6
+fqmulprecomp 2,3,5
+fqmulprecomp 2,3,7
+
+vmovdqa (128*\off+ 64)*2(%rdi),%ymm8
+vmovdqa (128*\off+ 96)*2(%rdi),%ymm10
+vmovdqa (128*\off+ 80)*2(%rdi),%ymm9
+vmovdqa (128*\off+112)*2(%rdi),%ymm11
+
+fqmulprecomp 2,3,8
+fqmulprecomp 2,3,10
+fqmulprecomp 2,3,9
+fqmulprecomp 2,3,11
+
+vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
+vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
+vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
+vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
+vmovdqa _REVIDXB*2(%rsi),%ymm12
+vpshufb %ymm12,%ymm15,%ymm15
+vpshufb %ymm12,%ymm1,%ymm1
+vpshufb %ymm12,%ymm2,%ymm2
+vpshufb %ymm12,%ymm3,%ymm3
+
+butterfly 4,5,8,9,6,7,10,11,15,1,2,3
+
+/* level 1 */
+vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
+vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
+vmovdqa _REVIDXB*2(%rsi),%ymm1
+vpshufb %ymm1,%ymm2,%ymm2
+vpshufb %ymm1,%ymm3,%ymm3
+
+butterfly 4,5,6,7,8,9,10,11,2,2,3,3
+
+shuffle1 4,5,3,5
+shuffle1 6,7,4,7
+shuffle1 8,9,6,9
+shuffle1 10,11,8,11
+
+/* level 2 */
+vmovdqa _REVIDXD*2(%rsi),%ymm12
+vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
+vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10
+
+butterfly 3,4,6,8,5,7,9,11,2,2,10,10
+
+vmovdqa _16XV*2(%rsi),%ymm1
+red16 3
+
+shuffle2 3,4,10,4
+shuffle2 6,8,3,8
+shuffle2 5,7,6,7
+shuffle2 9,11,5,11
+
+/* level 3 */
+vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
+vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9
+
+butterfly 10,3,6,5,4,8,7,11,2,2,9,9
+
+shuffle4 10,3,9,3
+shuffle4 6,5,10,5
+shuffle4 4,8,6,8
+shuffle4 7,11,4,11
+
+/* level 4 */
+vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
+vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7
+
+butterfly 9,10,6,4,3,5,8,11,2,2,7,7
+
+red16 9
+
+shuffle8 9,10,7,10
+shuffle8 6,4,9,4
+shuffle8 3,5,6,5
+shuffle8 8,11,3,11
+
+/* level 5 */
+vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
+vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8
+
+butterfly 7,9,6,3,10,4,5,11,2,2,8,8
+
+vmovdqa %ymm7,(128*\off+ 0)*2(%rdi)
+vmovdqa %ymm9,(128*\off+ 16)*2(%rdi)
+vmovdqa %ymm6,(128*\off+ 32)*2(%rdi)
+vmovdqa %ymm3,(128*\off+ 48)*2(%rdi)
+vmovdqa %ymm10,(128*\off+ 64)*2(%rdi)
+vmovdqa %ymm4,(128*\off+ 80)*2(%rdi)
+vmovdqa %ymm5,(128*\off+ 96)*2(%rdi)
+vmovdqa %ymm11,(128*\off+112)*2(%rdi)
+.endm
+
+.macro intt_level6 off
+/* level 6 */
+vmovdqa (64*\off+ 0)*2(%rdi),%ymm4
+vmovdqa (64*\off+128)*2(%rdi),%ymm8
+vmovdqa (64*\off+ 16)*2(%rdi),%ymm5
+vmovdqa (64*\off+144)*2(%rdi),%ymm9
+vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2
+
+vmovdqa (64*\off+ 32)*2(%rdi),%ymm6
+vmovdqa (64*\off+160)*2(%rdi),%ymm10
+vmovdqa (64*\off+ 48)*2(%rdi),%ymm7
+vmovdqa (64*\off+176)*2(%rdi),%ymm11
+vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3
+
+butterfly 4,5,6,7,8,9,10,11
+
+.if \off == 0
+red16 4
+.endif
+
+vmovdqa %ymm4,(64*\off+ 0)*2(%rdi)
+vmovdqa %ymm5,(64*\off+ 16)*2(%rdi)
+vmovdqa %ymm6,(64*\off+ 32)*2(%rdi)
+vmovdqa %ymm7,(64*\off+ 48)*2(%rdi)
+vmovdqa %ymm8,(64*\off+128)*2(%rdi)
+vmovdqa %ymm9,(64*\off+144)*2(%rdi)
+vmovdqa %ymm10,(64*\off+160)*2(%rdi)
+vmovdqa %ymm11,(64*\off+176)*2(%rdi)
+.endm
+
+.text
+.global cdecl(invntt_avx2_asm)
+cdecl(invntt_avx2_asm):
+vmovdqa _16XQ*2(%rsi),%ymm0
+
+intt_levels0t5 0
+intt_levels0t5 1
+
+intt_level6 0
+intt_level6 1
+ret
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_kem.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_kem.c
new file mode 100644
index 0000000000..9d6c49b9c4
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_kem.c
@@ -0,0 +1,158 @@
+#include <stddef.h>
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_symmetric.h"
+#include "kyber512r3_indcpa.h"
+#include "kyber512r3_indcpa_avx2.h"
+#include "tls/s2n_kem.h"
+#include "utils/s2n_safety.h"
+#include "pq-crypto/s2n_pq_random.h"
+#include "pq-crypto/s2n_pq.h"
+
+/*************************************************
+* Name: crypto_kem_keypair
+*
+* Description: Generates public and private key
+*              for the CCA-secure Kyber key encapsulation mechanism
+*
+* Arguments: - unsigned char *pk: pointer to output public key
+* (an already allocated array of S2N_KYBER_512_R3_PUBLIC_KEY_BYTES bytes)
+* - unsigned char *sk: pointer to output private key
+* (an already allocated array of S2N_KYBER_512_R3_SECRET_KEY_BYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int s2n_kyber_512_r3_crypto_kem_keypair(unsigned char *pk, unsigned char *sk)
+{
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+ if (s2n_kyber512r3_is_avx2_bmi2_enabled()) {
+ POSIX_GUARD(indcpa_keypair_avx2(pk, sk));
+    } else
+#endif
+ {
+ POSIX_GUARD(indcpa_keypair(pk, sk));
+ }
+
+ for(size_t i = 0; i < S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES; i++) {
+ sk[i + S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES] = pk[i];
+ }
+ sha3_256(sk+S2N_KYBER_512_R3_SECRET_KEY_BYTES-2*S2N_KYBER_512_R3_SYMBYTES, pk, S2N_KYBER_512_R3_PUBLIC_KEY_BYTES);
+ /* Value z for pseudo-random output on reject */
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(sk+S2N_KYBER_512_R3_SECRET_KEY_BYTES-S2N_KYBER_512_R3_SYMBYTES, S2N_KYBER_512_R3_SYMBYTES));
+ return S2N_SUCCESS;
+}
+
+/*************************************************
+* Name: crypto_kem_enc
+*
+* Description: Generates cipher text and shared
+*              secret for a given public key
+*
+* Arguments: - unsigned char *ct: pointer to output cipher text
+* (an already allocated array of S2N_KYBER_512_R3_CIPHERTEXT_BYTES bytes)
+* - unsigned char *ss: pointer to output shared secret
+* (an already allocated array of S2N_KYBER_512_R3_SHARED_SECRET_BYTES bytes)
+* - const unsigned char *pk: pointer to input public key
+* (an already allocated array of S2N_KYBER_512_R3_PUBLIC_KEY_BYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int s2n_kyber_512_r3_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk)
+{
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ uint8_t buf[2*S2N_KYBER_512_R3_SYMBYTES];
+ /* Will contain key, coins */
+ uint8_t kr[2*S2N_KYBER_512_R3_SYMBYTES];
+
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, S2N_KYBER_512_R3_SYMBYTES));
+ /* Don't release system RNG output */
+ sha3_256(buf, buf, S2N_KYBER_512_R3_SYMBYTES);
+
+ /* Multitarget countermeasure for coins + contributory KEM */
+ sha3_256(buf+S2N_KYBER_512_R3_SYMBYTES, pk, S2N_KYBER_512_R3_PUBLIC_KEY_BYTES);
+ sha3_512(kr, buf, 2*S2N_KYBER_512_R3_SYMBYTES);
+
+ /* coins are in kr+S2N_KYBER_512_R3_SYMBYTES */
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+ if (s2n_kyber512r3_is_avx2_bmi2_enabled()) {
+ indcpa_enc_avx2(ct, buf, pk, kr+S2N_KYBER_512_R3_SYMBYTES);
+    } else
+#endif
+ {
+ indcpa_enc(ct, buf, pk, kr+S2N_KYBER_512_R3_SYMBYTES);
+ }
+
+ /* overwrite coins in kr with H(c) */
+ sha3_256(kr+S2N_KYBER_512_R3_SYMBYTES, ct, S2N_KYBER_512_R3_CIPHERTEXT_BYTES);
+ /* hash concatenation of pre-k and H(c) to k */
+ shake256(ss, S2N_KYBER_512_R3_SSBYTES, kr, 2*S2N_KYBER_512_R3_SYMBYTES);
+ return S2N_SUCCESS;
+}
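This is the Fujisaki-Okamoto encapsulation flow: m = SHA3-256(random),
(K_bar || r) = SHA3-512(m || H(pk)), c = INDCPA.Enc(pk, m; r), and finally
ss = SHAKE256(K_bar || H(c)). Hashing the fresh randomness keeps raw RNG
output out of the ciphertext, and folding H(pk) into the key derivation is
the multitarget countermeasure mentioned in the comment above.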
+
+/*************************************************
+* Name: crypto_kem_dec
+*
+* Description: Generates shared secret for a given
+* cipher text and private key
+*
+* Arguments: - unsigned char *ss: pointer to output shared secret
+* (an already allocated array of S2N_KYBER_512_R3_SHARED_SECRET_BYTES bytes)
+* - const unsigned char *ct: pointer to input cipher text
+* (an already allocated array of S2N_KYBER_512_R3_CIPHERTEXT_BYTES bytes)
+* - const unsigned char *sk: pointer to input private key
+* (an already allocated array of S2N_KYBER_512_R3_SECRET_KEY_BYTES bytes)
+*
+* Returns 0.
+*
+* On failure, ss will contain a pseudo-random value.
+**************************************************/
+int s2n_kyber_512_r3_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk)
+{
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ uint8_t buf[2*S2N_KYBER_512_R3_SYMBYTES];
+ /* Will contain key, coins */
+ uint8_t kr[2*S2N_KYBER_512_R3_SYMBYTES];
+ uint8_t cmp[S2N_KYBER_512_R3_CIPHERTEXT_BYTES];
+ const uint8_t *pk = sk+S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES;
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+ if (s2n_kyber512r3_is_avx2_bmi2_enabled()) {
+ indcpa_dec_avx2(buf, ct, sk);
+    } else
+#endif
+ {
+ indcpa_dec(buf, ct, sk);
+ }
+
+ /* Multitarget countermeasure for coins + contributory KEM */
+ for(size_t i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) {
+ buf[S2N_KYBER_512_R3_SYMBYTES + i] = sk[S2N_KYBER_512_R3_SECRET_KEY_BYTES - 2 * S2N_KYBER_512_R3_SYMBYTES + i];
+ }
+ sha3_512(kr, buf, 2*S2N_KYBER_512_R3_SYMBYTES);
+
+ /* coins are in kr+S2N_KYBER_512_R3_SYMBYTES */
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+ if (s2n_kyber512r3_is_avx2_bmi2_enabled()) {
+ indcpa_enc_avx2(cmp, buf, pk, kr+S2N_KYBER_512_R3_SYMBYTES);
+    } else
+#endif
+ {
+ indcpa_enc(cmp, buf, pk, kr+S2N_KYBER_512_R3_SYMBYTES);
+ }
+
+ /* If ct and cmp are equal (dont_copy = 1), decryption has succeeded and we do NOT overwrite pre-k below.
+ * If ct and cmp are not equal (dont_copy = 0), decryption fails and we do overwrite pre-k. */
+ int dont_copy = s2n_constant_time_equals(ct, cmp, S2N_KYBER_512_R3_CIPHERTEXT_BYTES);
+
+ /* overwrite coins in kr with H(c) */
+ sha3_256(kr+S2N_KYBER_512_R3_SYMBYTES, ct, S2N_KYBER_512_R3_CIPHERTEXT_BYTES);
+
+ /* Overwrite pre-k with z on re-encryption failure */
+ POSIX_GUARD(s2n_constant_time_copy_or_dont(kr, sk+S2N_KYBER_512_R3_SECRET_KEY_BYTES-S2N_KYBER_512_R3_SYMBYTES,
+ S2N_KYBER_512_R3_SYMBYTES, dont_copy));
+
+ /* hash concatenation of pre-k and H(c) to k */
+ shake256(ss, S2N_KYBER_512_R3_SSBYTES, kr, 2*S2N_KYBER_512_R3_SYMBYTES);
+ return S2N_SUCCESS;
+}
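The matching FO decapsulation with implicit rejection: m' = INDCPA.Dec(sk, c),
(K_bar' || r') = SHA3-512(m' || h) where h = H(pk) is stored in sk, then
c' = INDCPA.Enc(pk, m'; r') is recomputed and compared against c in constant
time. On mismatch K_bar' is replaced by the stored secret z, so the returned
ss = SHAKE256(K_bar' || H(c)) is pseudo-random instead of an error and exposes
no decryption-failure oracle.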
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.c
new file mode 100644
index 0000000000..6c82105c19
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.c
@@ -0,0 +1,122 @@
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_ntt.h"
+#include "kyber512r3_reduce.h"
+
+const int16_t zetas[128] = {
+ 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962,
+ 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017,
+ 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047,
+ 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830,
+ 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226,
+ 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574,
+ 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349,
+ 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193,
+ 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459,
+ 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628
+};
+
+const int16_t zetas_inv[128] = {
+ 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535,
+ 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465,
+ 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685,
+ 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235,
+ 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652,
+ 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853,
+ 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552,
+ 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871,
+ 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171,
+ 3127, 3042, 1907, 1836, 1517, 359, 758, 1441
+};
+
+/*************************************************
+* Name: fqmul
+*
+* Description: Multiplication followed by Montgomery reduction
+*
+* Arguments: - int16_t a: first factor
+* - int16_t b: second factor
+*
+* Returns 16-bit integer congruent to a*b*R^{-1} mod q
+**************************************************/
+static int16_t fqmul(int16_t a, int16_t b) {
+ return montgomery_reduce((int32_t)a * b);
+}
+
+/*************************************************
+* Name: ntt
+*
+* Description: In-place number-theoretic transform (NTT) in Rq
+* input is in standard order, output is in bitreversed order
+*
+* Arguments: - int16_t r[256]: pointer to input/output vector of elements
+* of Zq
+**************************************************/
+void ntt(int16_t r[256]) {
+ unsigned int len, start, j, k;
+ int16_t t, zeta;
+
+ k = 1;
+ for (len = 128; len >= 2; len >>= 1) {
+ for (start = 0; start < 256; start = j + len) {
+ zeta = zetas[k++];
+ for (j = start; j < start + len; ++j) {
+ t = fqmul(zeta, r[j + len]);
+ r[j + len] = r[j] - t;
+ r[j] = r[j] + t;
+ }
+ }
+ }
+}
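Each inner iteration is a Cooley-Tukey butterfly,

    (r[j], r[j+len]) <- (r[j] + zeta*r[j+len], r[j] - zeta*r[j+len]) mod q.

The zetas[] table stores the twiddle factors premultiplied by the Montgomery
factor 2^16 mod q, so the reduction inside fqmul cancels that factor and the
coefficients remain in the normal domain.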
+
+/*************************************************
+* Name:        invntt
+*
+* Description: In-place inverse number-theoretic transform in Rq and
+* multiplication by Montgomery factor 2^16.
+* Input is in bitreversed order, output is in standard order
+*
+* Arguments: - int16_t r[256]: pointer to input/output vector of elements
+* of Zq
+**************************************************/
+void invntt(int16_t r[256]) {
+ unsigned int start, len, j, k;
+ int16_t t, zeta;
+
+ k = 0;
+ for (len = 2; len <= 128; len <<= 1) {
+ for (start = 0; start < 256; start = j + len) {
+ zeta = zetas_inv[k++];
+ for (j = start; j < start + len; ++j) {
+ t = r[j];
+ r[j] = barrett_reduce(t + r[j + len]);
+ r[j + len] = t - r[j + len];
+ r[j + len] = fqmul(zeta, r[j + len]);
+ }
+ }
+ }
+
+ for (j = 0; j < 256; ++j) {
+ r[j] = fqmul(r[j], zetas_inv[127]);
+ }
+}
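The final loop multiplies every coefficient by zetas_inv[127] = 1441. That
constant is mont^2/128 mod q, where mont = 2^16 mod 3329 = 2285 (check:
1441*128 = 184448 ≡ 1353 ≡ 2285^2 (mod 3329)), so a single fqmul per
coefficient both applies the 1/128 normalization of the seven inverse levels
and installs the Montgomery factor 2^16 promised in the description above.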
+
+/*************************************************
+* Name: basemul
+*
+* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta)
+* used for multiplication of elements in Rq in NTT domain
+*
+* Arguments: - int16_t r[2]: pointer to the output polynomial
+* - const int16_t a[2]: pointer to the first factor
+* - const int16_t b[2]: pointer to the second factor
+* - int16_t zeta: integer defining the reduction polynomial
+**************************************************/
+void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) {
+ r[0] = fqmul(a[1], b[1]);
+ r[0] = fqmul(r[0], zeta);
+ r[0] += fqmul(a[0], b[0]);
+
+ r[1] = fqmul(a[0], b[1]);
+ r[1] += fqmul(a[1], b[0]);
+}
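This is schoolbook multiplication in Zq[X]/(X^2 - zeta):

    (a[0] + a[1]*X) * (b[0] + b[1]*X)
        = (a[0]*b[0] + zeta*a[1]*b[1]) + (a[0]*b[1] + a[1]*b[0])*X,

exactly the five fqmul calls above; a full NTT-domain product in Rq decomposes
into 128 such independent two-coefficient multiplications, each with its own
zeta.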
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.h
new file mode 100644
index 0000000000..98d6235764
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <stdint.h>
+#include "kyber512r3_params.h"
+
+#define zetas S2N_KYBER_512_R3_NAMESPACE(zetas)
+extern const int16_t zetas[128];
+
+#define zetas_inv S2N_KYBER_512_R3_NAMESPACE(zetas_inv)
+extern const int16_t zetas_inv[128];
+
+#define ntt S2N_KYBER_512_R3_NAMESPACE(ntt)
+void ntt(int16_t poly[256]);
+
+#define invntt S2N_KYBER_512_R3_NAMESPACE(invntt)
+void invntt(int16_t poly[256]);
+
+#define basemul S2N_KYBER_512_R3_NAMESPACE(basemul)
+void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.S
new file mode 100644
index 0000000000..dc80086cb1
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.S
@@ -0,0 +1,218 @@
+#include "kyber512r3_consts_avx2.h"
+
+// The small macros (.inc files) are combined with .S files directly
+/*****.include "shuffle.inc"*****/
+/********************************/
+.macro shuffle8 r0,r1,r2,r3
+vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
+vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
+.endm
+
+.macro shuffle4 r0,r1,r2,r3
+vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
+vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
+.endm
+
+.macro shuffle2 r0,r1,r2,r3
+#vpsllq $32,%ymm\r1,%ymm\r2
+vmovsldup %ymm\r1,%ymm\r2
+vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
+vpsrlq $32,%ymm\r0,%ymm\r0
+#vmovshdup %ymm\r0,%ymm\r0
+vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
+.endm
+
+.macro shuffle1 r0,r1,r2,r3
+vpslld $16,%ymm\r1,%ymm\r2
+vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
+vpsrld $16,%ymm\r0,%ymm\r0
+vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
+.endm
+/********************************/
+
+.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2
+vpmullw %ymm\zl0,%ymm\rh0,%ymm12
+vpmullw %ymm\zl0,%ymm\rh1,%ymm13
+
+vpmullw %ymm\zl1,%ymm\rh2,%ymm14
+vpmullw %ymm\zl1,%ymm\rh3,%ymm15
+
+vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0
+vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1
+
+vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2
+vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3
+.endm
+
+.macro reduce
+vpmulhw %ymm0,%ymm12,%ymm12
+vpmulhw %ymm0,%ymm13,%ymm13
+
+vpmulhw %ymm0,%ymm14,%ymm14
+vpmulhw %ymm0,%ymm15,%ymm15
+.endm
+
+.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3
+vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln
+vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0
+vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0
+
+vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1
+vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1
+vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2
+
+vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2
+vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3
+
+vpsubw %ymm12,%ymm\rln,%ymm\rln
+vpaddw %ymm12,%ymm\rh0,%ymm\rh0
+vpsubw %ymm13,%ymm\rl0,%ymm\rl0
+
+vpaddw %ymm13,%ymm\rh1,%ymm\rh1
+vpsubw %ymm14,%ymm\rl1,%ymm\rl1
+vpaddw %ymm14,%ymm\rh2,%ymm\rh2
+
+vpsubw %ymm15,%ymm\rl2,%ymm\rl2
+vpaddw %ymm15,%ymm\rh3,%ymm\rh3
+.endm
+
+.macro level0 off
+vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15
+vmovdqa (64*\off+128)*2(%rdi),%ymm8
+vmovdqa (64*\off+144)*2(%rdi),%ymm9
+vmovdqa (64*\off+160)*2(%rdi),%ymm10
+vmovdqa (64*\off+176)*2(%rdi),%ymm11
+vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2
+
+mul 8,9,10,11
+
+vmovdqa (64*\off+ 0)*2(%rdi),%ymm4
+vmovdqa (64*\off+ 16)*2(%rdi),%ymm5
+vmovdqa (64*\off+ 32)*2(%rdi),%ymm6
+vmovdqa (64*\off+ 48)*2(%rdi),%ymm7
+
+reduce
+update 3,4,5,6,7,8,9,10,11
+
+vmovdqa %ymm3,(64*\off+ 0)*2(%rdi)
+vmovdqa %ymm4,(64*\off+ 16)*2(%rdi)
+vmovdqa %ymm5,(64*\off+ 32)*2(%rdi)
+vmovdqa %ymm6,(64*\off+ 48)*2(%rdi)
+vmovdqa %ymm8,(64*\off+128)*2(%rdi)
+vmovdqa %ymm9,(64*\off+144)*2(%rdi)
+vmovdqa %ymm10,(64*\off+160)*2(%rdi)
+vmovdqa %ymm11,(64*\off+176)*2(%rdi)
+.endm
+
+.macro levels1t6 off
+/* level 1 */
+vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15
+vmovdqa (128*\off+ 64)*2(%rdi),%ymm8
+vmovdqa (128*\off+ 80)*2(%rdi),%ymm9
+vmovdqa (128*\off+ 96)*2(%rdi),%ymm10
+vmovdqa (128*\off+112)*2(%rdi),%ymm11
+vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2
+
+mul 8,9,10,11
+
+vmovdqa (128*\off+ 0)*2(%rdi),%ymm4
+vmovdqa (128*\off+ 16)*2(%rdi),%ymm5
+vmovdqa (128*\off+ 32)*2(%rdi),%ymm6
+vmovdqa (128*\off+ 48)*2(%rdi),%ymm7
+
+reduce
+update 3,4,5,6,7,8,9,10,11
+
+/* level 2 */
+shuffle8 5,10,7,10
+shuffle8 6,11,5,11
+
+vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15
+vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2
+
+mul 7,10,5,11
+
+shuffle8 3,8,6,8
+shuffle8 4,9,3,9
+
+reduce
+update 4,6,8,3,9,7,10,5,11
+
+/* level 3 */
+shuffle4 8,5,9,5
+shuffle4 3,11,8,11
+
+vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15
+vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2
+
+mul 9,5,8,11
+
+shuffle4 4,7,3,7
+shuffle4 6,10,4,10
+
+reduce
+update 6,3,7,4,10,9,5,8,11
+
+/* level 4 */
+shuffle2 7,8,10,8
+shuffle2 4,11,7,11
+
+vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15
+vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2
+
+mul 10,8,7,11
+
+shuffle2 6,9,4,9
+shuffle2 3,5,6,5
+
+reduce
+update 3,4,9,6,5,10,8,7,11
+
+/* level 5 */
+shuffle1 9,7,5,7
+shuffle1 6,11,9,11
+
+vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15
+vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2
+
+mul 5,7,9,11
+
+shuffle1 3,10,6,10
+shuffle1 4,8,3,8
+
+reduce
+update 4,6,10,3,8,5,7,9,11
+
+/* level 6 */
+vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14
+vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15
+vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8
+vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2
+
+mul 10,3,9,11,14,15,8,2
+
+reduce
+update 8,4,6,5,7,10,3,9,11
+
+vmovdqa %ymm8,(128*\off+ 0)*2(%rdi)
+vmovdqa %ymm4,(128*\off+ 16)*2(%rdi)
+vmovdqa %ymm10,(128*\off+ 32)*2(%rdi)
+vmovdqa %ymm3,(128*\off+ 48)*2(%rdi)
+vmovdqa %ymm6,(128*\off+ 64)*2(%rdi)
+vmovdqa %ymm5,(128*\off+ 80)*2(%rdi)
+vmovdqa %ymm9,(128*\off+ 96)*2(%rdi)
+vmovdqa %ymm11,(128*\off+112)*2(%rdi)
+.endm
+
+.text
+.global cdecl(ntt_avx2_asm)
+cdecl(ntt_avx2_asm):
+vmovdqa _16XQ*2(%rsi),%ymm0
+
+level0 0
+level0 1
+
+levels1t6 0
+levels1t6 1
+
+ret
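
The mul/reduce/update macro triple above is a vectorized Montgomery
butterfly: the zl/zh operands hold the precomputed low half (zeta * q^-1
mod 2^16) and high half of each twiddle, so per 16-bit lane the three
macros amount to t = montgomery_reduce(zeta * h), then l + t and l - t.
A standalone scalar model of one lane (not part of the diff; constants
copied from kyber512r3_reduce.c later in this change, toy inputs only):

#include <stdio.h>
#include <stdint.h>

#define Q 3329
#define QINV 62209 /* q^-1 mod 2^16 */

static int16_t montgomery_reduce(int32_t a) {
    int16_t u = (int16_t)((uint32_t)a * QINV);     /* low product, vpmullw */
    return (int16_t)((a - (int32_t)u * Q) >> 16);  /* high part, vpmulhw  */
}

/* One Cooley-Tukey butterfly: (l, h) -> (l + zeta*h, l - zeta*h) mod q,
 * with zeta*h taken through Montgomery reduction as in the macros. */
static void butterfly(int16_t *l, int16_t *h, int16_t zeta) {
    int16_t t = montgomery_reduce((int32_t)zeta * *h);
    *h = (int16_t)(*l - t);
    *l = (int16_t)(*l + t);
}

int main(void) {
    int16_t l = 100, h = 200;  /* toy values, well inside int16 range */
    butterfly(&l, &h, 17);
    printf("l=%d h=%d\n", l, h);
    return 0;
}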
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.h
new file mode 100644
index 0000000000..3616132358
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <stdint.h>
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#include <immintrin.h>
+
+#define ntt_avx2_asm S2N_KYBER_512_R3_NAMESPACE(ntt_avx2_asm)
+void ntt_avx2_asm(__m256i *r, const __m256i *qdata);
+
+#define invntt_avx2_asm S2N_KYBER_512_R3_NAMESPACE(invntt_avx2_asm)
+void invntt_avx2_asm(__m256i *r, const __m256i *qdata);
+
+#define nttunpack_avx2_asm S2N_KYBER_512_R3_NAMESPACE(nttunpack_avx2_asm)
+void nttunpack_avx2_asm(__m256i *r, const __m256i *qdata);
+
+#define basemul_avx2_asm S2N_KYBER_512_R3_NAMESPACE(basemul_avx2_asm)
+void basemul_avx2_asm(__m256i *r,
+ const __m256i *a,
+ const __m256i *b,
+ const __m256i *qdata);
+
+#define ntttobytes_avx2_asm S2N_KYBER_512_R3_NAMESPACE(ntttobytes_avx2_asm)
+void ntttobytes_avx2_asm(uint8_t *r, const __m256i *a, const __m256i *qdata);
+
+#define nttfrombytes_avx2_asm S2N_KYBER_512_R3_NAMESPACE(nttfrombytes_avx2_asm)
+void nttfrombytes_avx2_asm(__m256i *r, const uint8_t *a, const __m256i *qdata);
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_params.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_params.h
new file mode 100644
index 0000000000..d2d32d08f1
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_params.h
@@ -0,0 +1,31 @@
+#pragma once
+
+/* All kyber512r3 functions and global variables in the pq-crypto/kyber_r3 directory
+ * should be defined using the namespace macro to avoid symbol collisions. For example,
+ * in foo.h, declare a function as follows:
+ *
+ * #define foo_function S2N_KYBER_512_R3_NAMESPACE(foo_function)
+ * int foo_function(int foo_argument); */
+#define S2N_KYBER_512_R3_NAMESPACE(s) s2n_kyber_512_r3_##s
+
+#define S2N_KYBER_512_R3_K 2
+
+#define S2N_KYBER_512_R3_N 256
+#define S2N_KYBER_512_R3_Q 3329
+
+#define S2N_KYBER_512_R3_SYMBYTES 32 /* size in bytes of hashes, and seeds */
+#define S2N_KYBER_512_R3_SSBYTES 32 /* size in bytes of shared key */
+
+#define S2N_KYBER_512_R3_POLYBYTES 384
+#define S2N_KYBER_512_R3_POLYVECBYTES (S2N_KYBER_512_R3_K * S2N_KYBER_512_R3_POLYBYTES)
+
+#define S2N_KYBER_512_R3_ETA1 3
+#define S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES 128
+#define S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES 640
+
+#define S2N_KYBER_512_R3_ETA2 2
+
+#define S2N_KYBER_512_R3_INDCPA_MSGBYTES S2N_KYBER_512_R3_SYMBYTES
+#define S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES (S2N_KYBER_512_R3_POLYVECBYTES + S2N_KYBER_512_R3_SYMBYTES)
+#define S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES (S2N_KYBER_512_R3_POLYVECBYTES)
+#define S2N_KYBER_512_R3_INDCPA_BYTES (S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES + S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES)
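
For k = 2 these macros work out to an 800-byte IND-CPA public key
(768-byte polynomial vector plus 32-byte seed), a 768-byte IND-CPA secret
key, and a 768-byte IND-CPA ciphertext body (640 + 128). A standalone
sanity check (not part of the diff), with the constants copied from above:

#include <stdio.h>

#define K 2
#define POLYBYTES 384
#define SYMBYTES 32
#define POLYVECBYTES (K * POLYBYTES)
#define POLYCOMPRESSEDBYTES 128
#define POLYVECCOMPRESSEDBYTES 640

int main(void) {
    /* expected output: pk=800 sk=768 ct=768 */
    printf("pk=%d sk=%d ct=%d\n",
           POLYVECBYTES + SYMBYTES,        /* INDCPA_PUBLICKEYBYTES */
           POLYVECBYTES,                   /* INDCPA_SECRETKEYBYTES */
           POLYVECCOMPRESSEDBYTES + POLYCOMPRESSEDBYTES); /* INDCPA_BYTES */
    return 0;
}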
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.c
new file mode 100644
index 0000000000..76ae60a583
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.c
@@ -0,0 +1,300 @@
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_poly.h"
+#include "kyber512r3_ntt.h"
+#include "kyber512r3_reduce.h"
+#include "kyber512r3_cbd.h"
+#include "kyber512r3_symmetric.h"
+
+/*************************************************
+* Name: poly_compress
+*
+* Description: Compression and subsequent serialization of a polynomial
+*
+* Arguments: - uint8_t *r: pointer to output byte array
+* (of length S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES)
+* - poly *a: pointer to input polynomial
+**************************************************/
+void poly_compress(uint8_t r[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES], poly *a) {
+ unsigned int i, j;
+ uint8_t t[8];
+
+ poly_csubq(a);
+
+ for (i = 0; i < S2N_KYBER_512_R3_N / 8; i++) {
+ for (j = 0; j < 8; j++) {
+ t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 4) + S2N_KYBER_512_R3_Q / 2) / S2N_KYBER_512_R3_Q) & 15;
+ }
+
+ r[0] = t[0] | (t[1] << 4);
+ r[1] = t[2] | (t[3] << 4);
+ r[2] = t[4] | (t[5] << 4);
+ r[3] = t[6] | (t[7] << 4);
+ r += 4;
+ }
+}
+
+/*************************************************
+* Name: poly_decompress
+*
+* Description: De-serialization and subsequent decompression of a polynomial;
+* approximate inverse of poly_compress
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const uint8_t *a: pointer to input byte array
+* (of length S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES bytes)
+**************************************************/
+void poly_decompress(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES]) {
+ unsigned int i;
+
+ for (i = 0; i < S2N_KYBER_512_R3_N / 2; i++) {
+ r->coeffs[2 * i + 0] = (((uint16_t)(a[0] & 15) * S2N_KYBER_512_R3_Q) + 8) >> 4;
+ r->coeffs[2 * i + 1] = (((uint16_t)(a[0] >> 4) * S2N_KYBER_512_R3_Q) + 8) >> 4;
+ a += 1;
+ }
+}
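
The two functions above implement 4-bit coefficient compression: each x in
[0,q) maps to t = round(16*x/q) mod 16 and back to x' = round(q*t/16), so a
round trip perturbs a coefficient by roughly q/32 at worst. A standalone
check over all residues (not part of the diff):

#include <stdio.h>

#define Q 3329

int main(void) {
    int max_err = 0;
    for (int x = 0; x < Q; x++) {
        unsigned t = ((((unsigned)x << 4) + Q / 2) / Q) & 15; /* compress   */
        int xp = (int)((t * Q + 8) >> 4);                     /* decompress */
        int err = x > xp ? x - xp : xp - x;
        if (Q - err < err) err = Q - err;  /* distance mod q */
        if (err > max_err) max_err = err;
    }
    printf("max round-trip error: %d (about q/32)\n", max_err);
    return 0;
}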
+
+/*************************************************
+* Name: poly_tobytes
+*
+* Description: Serialization of a polynomial
+*
+* Arguments: - uint8_t *r: pointer to output byte array
+* (needs space for S2N_KYBER_512_R3_POLYBYTES bytes)
+* - poly *a: pointer to input polynomial
+**************************************************/
+void poly_tobytes(uint8_t r[S2N_KYBER_512_R3_POLYBYTES], poly *a) {
+ unsigned int i;
+
+ poly_csubq(a);
+
+ for (i = 0; i < S2N_KYBER_512_R3_N / 2; i++) {
+ uint16_t t0 = a->coeffs[2 * i];
+ uint16_t t1 = a->coeffs[2 * i + 1];
+ r[3 * i + 0] = (t0 >> 0);
+ r[3 * i + 1] = (t0 >> 8) | (t1 << 4);
+ r[3 * i + 2] = (t1 >> 4);
+ }
+}
+
+/*************************************************
+* Name: poly_frombytes
+*
+* Description: De-serialization of a polynomial;
+* inverse of poly_tobytes
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const uint8_t *a: pointer to input byte array
+* (of S2N_KYBER_512_R3_POLYBYTES bytes)
+**************************************************/
+void poly_frombytes(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYBYTES]) {
+ unsigned int i;
+ for (i = 0; i < S2N_KYBER_512_R3_N / 2; i++) {
+ r->coeffs[2 * i] = ((a[3 * i + 0] >> 0) | ((uint16_t)a[3 * i + 1] << 8)) & 0xFFF;
+ r->coeffs[2 * i + 1] = ((a[3 * i + 1] >> 4) | ((uint16_t)a[3 * i + 2] << 4)) & 0xFFF;
+ }
+}
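
poly_tobytes/poly_frombytes pack two 12-bit coefficients into three bytes
with no loss. A standalone round trip for one coefficient pair (not part of
the diff; sample values are arbitrary):

#include <assert.h>
#include <stdint.h>

int main(void) {
    uint16_t t0 = 1234, t1 = 3000;           /* any values below 2^12 */
    uint8_t r[3];
    r[0] = (uint8_t)(t0 >> 0);               /* low 8 bits of t0        */
    r[1] = (uint8_t)((t0 >> 8) | (t1 << 4)); /* t0 high nibble, t1 low  */
    r[2] = (uint8_t)(t1 >> 4);               /* high 8 bits of t1       */

    uint16_t u0 = (uint16_t)((r[0] | ((uint16_t)r[1] << 8)) & 0xFFF);
    uint16_t u1 = (uint16_t)(((r[1] >> 4) | ((uint16_t)r[2] << 4)) & 0xFFF);
    assert(u0 == t0 && u1 == t1);  /* lossless for 12-bit inputs */
    return 0;
}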
+
+/*************************************************
+* Name: poly_frommsg
+*
+* Description: Convert 32-byte message to polynomial
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const uint8_t *msg: pointer to input message
+**************************************************/
+void poly_frommsg(poly *r, const uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES]) {
+ unsigned int i, j;
+ int16_t mask;
+
+ for (i = 0; i < S2N_KYBER_512_R3_N / 8; i++) {
+ for (j = 0; j < 8; j++) {
+ mask = -(int16_t)((msg[i] >> j) & 1);
+ r->coeffs[8 * i + j] = mask & ((S2N_KYBER_512_R3_Q + 1) / 2);
+ }
+ }
+}
+
+/*************************************************
+* Name: poly_tomsg
+*
+* Description: Convert polynomial to 32-byte message
+*
+* Arguments: - uint8_t *msg: pointer to output message
+* - poly *a: pointer to input polynomial
+**************************************************/
+void poly_tomsg(uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES], poly *a) {
+ unsigned int i, j;
+ uint16_t t;
+
+ poly_csubq(a);
+
+ for (i = 0; i < S2N_KYBER_512_R3_N / 8; i++) {
+ msg[i] = 0;
+ for (j = 0; j < 8; j++) {
+ t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + S2N_KYBER_512_R3_Q / 2) / S2N_KYBER_512_R3_Q) & 1;
+ msg[i] |= t << j;
+ }
+ }
+}
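
Together, poly_frommsg/poly_tomsg encode each message bit b as the
coefficient b*(q+1)/2 and decode by checking whether a (noisy) coefficient
is closer to q/2 than to 0, via round(2*x/q) mod 2. Additive noise of
magnitude below roughly q/4 is absorbed; a standalone sketch (not part of
the diff, inputs assumed already reduced to [0,q) as poly_csubq ensures):

#include <assert.h>

#define Q 3329

static int decode(unsigned x) {   /* poly_tomsg's per-coefficient rule */
    return (((x << 1) + Q / 2) / Q) & 1;
}

int main(void) {
    unsigned one = (Q + 1) / 2;       /* encoding of bit 1 */
    assert(decode(0) == 0 && decode(one) == 1);
    assert(decode(one + 800) == 1);   /* noise below q/4 ~ 832 is absorbed */
    assert(decode(800) == 0);
    return 0;
}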
+
+/*************************************************
+* Name: poly_getnoise_eta1
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+* with output polynomial close to centered binomial distribution
+* with parameter S2N_KYBER_512_R3_ETA1
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const uint8_t *seed: pointer to input seed
+* (of length S2N_KYBER_512_R3_SYMBYTES bytes)
+* - uint8_t nonce: one-byte input nonce
+**************************************************/
+void poly_getnoise_eta1(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce) {
+ uint8_t buf[S2N_KYBER_512_R3_ETA1 * S2N_KYBER_512_R3_N / 4];
+ shake256_prf(buf, sizeof(buf), seed, nonce);
+ cbd_eta1(r, buf);
+}
+
+/*************************************************
+* Name: poly_getnoise_eta2
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+* with output polynomial close to centered binomial distribution
+* with parameter S2N_KYBER_512_R3_ETA2
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const uint8_t *seed: pointer to input seed
+* (of length S2N_KYBER_512_R3_SYMBYTES bytes)
+* - uint8_t nonce: one-byte input nonce
+**************************************************/
+void poly_getnoise_eta2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce) {
+ uint8_t buf[S2N_KYBER_512_R3_ETA2 * S2N_KYBER_512_R3_N / 4];
+ shake256_prf(buf, sizeof(buf), seed, nonce);
+ cbd_eta2(r, buf);
+}
+
+/*************************************************
+* Name: poly_ntt
+*
+* Description: Computes negacyclic number-theoretic transform (NTT) of
+* a polynomial in place;
+* inputs assumed to be in normal order, output in bitreversed order
+*
+* Arguments: - poly *r: pointer to in/output polynomial
+**************************************************/
+void poly_ntt(poly *r) {
+ ntt(r->coeffs);
+ poly_reduce(r);
+}
+
+/*************************************************
+* Name: poly_invntt_tomont
+*
+* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+* of a polynomial in place;
+* inputs assumed to be in bitreversed order, output in normal order
+*
+* Arguments: - poly *r: pointer to in/output polynomial
+**************************************************/
+void poly_invntt_tomont(poly *r) {
+ invntt(r->coeffs);
+}
+
+/*************************************************
+* Name: poly_basemul_montgomery
+*
+* Description: Multiplication of two polynomials in NTT domain
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const poly *a: pointer to first input polynomial
+* - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_basemul_montgomery(poly *r, const poly *a, const poly *b) {
+ unsigned int i;
+ for (i = 0; i < S2N_KYBER_512_R3_N / 4; i++) {
+ basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], zetas[64 + i]);
+ basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2],
+ -zetas[64 + i]);
+ }
+}
+
+/*************************************************
+* Name: poly_tomont
+*
+* Description: Inplace conversion of all coefficients of a polynomial
+* from normal domain to Montgomery domain
+*
+* Arguments: - poly *r: pointer to input/output polynomial
+**************************************************/
+void poly_tomont(poly *r) {
+ unsigned int i;
+ const int16_t f = (1ULL << 32) % S2N_KYBER_512_R3_Q;
+ for (i = 0; i < S2N_KYBER_512_R3_N; i++) {
+ r->coeffs[i] = montgomery_reduce((int32_t)r->coeffs[i] * f);
+ }
+}
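
The factor f above is 2^32 mod q = 1353; multiplying by f and then
Montgomery-reducing (which divides by 2^16 mod q) leaves x * 2^16 mod q,
i.e. x in Montgomery form. A standalone exhaustive check (not part of the
diff; montgomery_reduce duplicated from kyber512r3_reduce.c):

#include <assert.h>
#include <stdint.h>

#define Q 3329
#define QINV 62209 /* q^-1 mod 2^16 */

static int16_t montgomery_reduce(int32_t a) {
    int16_t u = (int16_t)((uint32_t)a * QINV);
    return (int16_t)((a - (int32_t)u * Q) >> 16);
}

int main(void) {
    const int16_t f = (int16_t)((1ULL << 32) % Q);  /* 1353 */
    for (int32_t x = 0; x < Q; x++) {
        int32_t got = montgomery_reduce(x * f);            /* in (-q, q) */
        int32_t want = (int32_t)(((int64_t)x << 16) % Q);  /* x * 2^16   */
        assert((got % Q + Q) % Q == want);
    }
    return 0;
}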
+
+/*************************************************
+* Name: poly_reduce
+*
+* Description: Applies Barrett reduction to all coefficients of a polynomial
+* for details of the Barrett reduction see comments in reduce.c
+*
+* Arguments: - poly *r: pointer to input/output polynomial
+**************************************************/
+void poly_reduce(poly *r) {
+ unsigned int i;
+ for (i = 0; i < S2N_KYBER_512_R3_N; i++) {
+ r->coeffs[i] = barrett_reduce(r->coeffs[i]);
+ }
+}
+
+/*************************************************
+* Name: poly_csubq
+*
+* Description: Applies conditional subtraction of q to each coefficient
+* of a polynomial. For details of conditional subtraction
+* of q see comments in reduce.c
+*
+* Arguments: - poly *r: pointer to input/output polynomial
+**************************************************/
+void poly_csubq(poly *r) {
+ unsigned int i;
+ for (i = 0; i < S2N_KYBER_512_R3_N; i++) {
+ r->coeffs[i] = csubq(r->coeffs[i]);
+ }
+}
+
+/*************************************************
+* Name: poly_add
+*
+* Description: Add two polynomials
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const poly *a: pointer to first input polynomial
+* - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_add(poly *r, const poly *a, const poly *b) {
+ unsigned int i;
+ for (i = 0; i < S2N_KYBER_512_R3_N; i++) {
+ r->coeffs[i] = a->coeffs[i] + b->coeffs[i];
+ }
+}
+
+/*************************************************
+* Name: poly_sub
+*
+* Description: Subtract two polynomials
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const poly *a: pointer to first input polynomial
+* - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_sub(poly *r, const poly *a, const poly *b) {
+ unsigned int i;
+ for (i = 0; i < S2N_KYBER_512_R3_N; i++) {
+ r->coeffs[i] = a->coeffs[i] - b->coeffs[i];
+ }
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.h
new file mode 100644
index 0000000000..da43766e51
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <stdint.h>
+#include "kyber512r3_params.h"
+
+/*
+ * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
+ * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
+ */
+#define poly S2N_KYBER_512_R3_NAMESPACE(poly)
+typedef struct {
+ int16_t coeffs[S2N_KYBER_512_R3_N];
+} poly;
+
+#define poly_compress S2N_KYBER_512_R3_NAMESPACE(poly_compress)
+void poly_compress(uint8_t r[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES], poly *a);
+
+#define poly_decompress S2N_KYBER_512_R3_NAMESPACE(poly_decompress)
+void poly_decompress(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES]);
+
+#define poly_tobytes S2N_KYBER_512_R3_NAMESPACE(poly_tobytes)
+void poly_tobytes(uint8_t r[S2N_KYBER_512_R3_POLYBYTES], poly *a);
+
+#define poly_frombytes S2N_KYBER_512_R3_NAMESPACE(poly_frombytes)
+void poly_frombytes(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYBYTES]);
+
+#define poly_frommsg S2N_KYBER_512_R3_NAMESPACE(poly_frommsg)
+void poly_frommsg(poly *r, const uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES]);
+
+#define poly_tomsg S2N_KYBER_512_R3_NAMESPACE(poly_tomsg)
+void poly_tomsg(uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES], poly *a);
+
+#define poly_getnoise_eta1 S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta1)
+void poly_getnoise_eta1(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce);
+
+#define poly_getnoise_eta2 S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta2)
+void poly_getnoise_eta2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce);
+
+#define poly_ntt S2N_KYBER_512_R3_NAMESPACE(poly_ntt)
+void poly_ntt(poly *r);
+
+#define poly_invntt_tomont S2N_KYBER_512_R3_NAMESPACE(poly_invntt_tomont)
+void poly_invntt_tomont(poly *r);
+
+#define poly_basemul_montgomery S2N_KYBER_512_R3_NAMESPACE(poly_basemul_montgomery)
+void poly_basemul_montgomery(poly *r, const poly *a, const poly *b);
+
+#define poly_tomont S2N_KYBER_512_R3_NAMESPACE(poly_tomont)
+void poly_tomont(poly *r);
+
+#define poly_reduce S2N_KYBER_512_R3_NAMESPACE(poly_reduce)
+void poly_reduce(poly *r);
+
+#define poly_csubq S2N_KYBER_512_R3_NAMESPACE(poly_csubq)
+void poly_csubq(poly *r);
+
+#define poly_add S2N_KYBER_512_R3_NAMESPACE(poly_add)
+void poly_add(poly *r, const poly *a, const poly *b);
+
+#define poly_sub S2N_KYBER_512_R3_NAMESPACE(poly_sub)
+void poly_sub(poly *r, const poly *a, const poly *b);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.c
new file mode 100644
index 0000000000..aa961ff403
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.c
@@ -0,0 +1,453 @@
+#include <stdint.h>
+#include <string.h>
+#include "kyber512r3_align_avx2.h"
+#include "kyber512r3_consts_avx2.h"
+#include "kyber512r3_poly_avx2.h"
+#include "kyber512r3_ntt_avx2.h"
+#include "kyber512r3_reduce_avx2.h"
+#include "kyber512r3_cbd_avx2.h"
+#include "kyber512r3_fips202.h"
+#include "kyber512r3_fips202x4_avx2.h"
+#include "kyber512r3_symmetric.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#include <immintrin.h>
+
+/*************************************************
+* Name: poly_compress_avx2
+*
+* Description: Compression and subsequent serialization of a polynomial.
+* The coefficients of the input polynomial are assumed to
+* lie in the interval [0,q], i.e. the polynomial must be reduced
+* by poly_reduce_avx2().
+*
+* Arguments: - uint8_t *r: pointer to output byte array
+* (of length S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES)
+* - const poly *a: pointer to input polynomial
+**************************************************/
+void poly_compress_avx2(uint8_t r[128], const poly * restrict a)
+{
+ unsigned int i;
+ __m256i f0, f1, f2, f3;
+ const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
+ const __m256i shift1 = _mm256_set1_epi16(1 << 9);
+ const __m256i mask = _mm256_set1_epi16(15);
+ const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1);
+ const __m256i permdidx = _mm256_set_epi32(7,3,6,2,5,1,4,0);
+
+ for(i=0;i<S2N_KYBER_512_R3_N/64;i++) {
+ f0 = _mm256_load_si256(&a->vec[4*i+0]);
+ f1 = _mm256_load_si256(&a->vec[4*i+1]);
+ f2 = _mm256_load_si256(&a->vec[4*i+2]);
+ f3 = _mm256_load_si256(&a->vec[4*i+3]);
+ f0 = _mm256_mulhi_epi16(f0,v);
+ f1 = _mm256_mulhi_epi16(f1,v);
+ f2 = _mm256_mulhi_epi16(f2,v);
+ f3 = _mm256_mulhi_epi16(f3,v);
+ f0 = _mm256_mulhrs_epi16(f0,shift1);
+ f1 = _mm256_mulhrs_epi16(f1,shift1);
+ f2 = _mm256_mulhrs_epi16(f2,shift1);
+ f3 = _mm256_mulhrs_epi16(f3,shift1);
+ f0 = _mm256_and_si256(f0,mask);
+ f1 = _mm256_and_si256(f1,mask);
+ f2 = _mm256_and_si256(f2,mask);
+ f3 = _mm256_and_si256(f3,mask);
+ f0 = _mm256_packus_epi16(f0,f1);
+ f2 = _mm256_packus_epi16(f2,f3);
+ f0 = _mm256_maddubs_epi16(f0,shift2);
+ f2 = _mm256_maddubs_epi16(f2,shift2);
+ f0 = _mm256_packus_epi16(f0,f2);
+ f0 = _mm256_permutevar8x32_epi32(f0,permdidx);
+ // correcting cast-align error
+ // old version: _mm256_storeu_si256((__m256i *)&r[32*i],f0);
+ _mm256_storeu_si256((void *)&r[32*i],f0);
+ }
+}
+
+void poly_decompress_avx2(poly * restrict r, const uint8_t a[128])
+{
+ unsigned int i;
+ __m128i t;
+ __m256i f;
+ const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
+ const __m256i shufbidx = _mm256_set_epi8(7,7,7,7,6,6,6,6,5,5,5,5,4,4,4,4,
+ 3,3,3,3,2,2,2,2,1,1,1,1,0,0,0,0);
+ const __m256i mask = _mm256_set1_epi32(0x00F0000F);
+ const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048);
+
+ for(i=0;i<S2N_KYBER_512_R3_N/16;i++) {
+ // correcting cast-align and cast-qual errors
+ // old version: t = _mm_loadl_epi64((__m128i *)&a[8*i]);
+ t = _mm_loadl_epi64((const void *)&a[8*i]);
+ f = _mm256_broadcastsi128_si256(t);
+ f = _mm256_shuffle_epi8(f,shufbidx);
+ f = _mm256_and_si256(f,mask);
+ f = _mm256_mullo_epi16(f,shift);
+ f = _mm256_mulhrs_epi16(f,q);
+ _mm256_store_si256(&r->vec[i],f);
+ }
+}
+
+/*************************************************
+* Name: poly_tobytes_avx2
+*
+* Description: Serialization of a polynomial in NTT representation.
+* The coefficients of the input polynomial are assumed to
+* lie in the interval [0,q], i.e. the polynomial must be reduced
+* by poly_reduce_avx2(). The coefficients are ordered as output by
+* poly_ntt_avx2(); the serialized output coefficients are in bitreversed
+* order.
+*
+* Arguments: - uint8_t *r: pointer to output byte array
+* (needs space for S2N_KYBER_512_R3_POLYBYTES bytes)
+* - poly *a: pointer to input polynomial
+**************************************************/
+void poly_tobytes_avx2(uint8_t r[S2N_KYBER_512_R3_POLYBYTES], const poly *a)
+{
+ ntttobytes_avx2_asm(r, a->vec, qdata.vec);
+}
+
+/*************************************************
+* Name: poly_frombytes_avx2
+*
+* Description: De-serialization of a polynomial;
+* inverse of poly_tobytes_avx2
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const uint8_t *a: pointer to input byte array
+* (of S2N_KYBER_512_R3_POLYBYTES bytes)
+**************************************************/
+void poly_frombytes_avx2(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYBYTES])
+{
+ nttfrombytes_avx2_asm(r->vec, a, qdata.vec);
+}
+
+/*************************************************
+* Name: poly_frommsg_avx2
+*
+* Description: Convert 32-byte message to polynomial
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const uint8_t *msg: pointer to input message
+**************************************************/
+void poly_frommsg_avx2(poly * restrict r, const uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES])
+{
+ __m256i f, g0, g1, g2, g3, h0, h1, h2, h3;
+ const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0,1,2,3));
+ const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0));
+ const __m256i hqs = _mm256_set1_epi16((S2N_KYBER_512_R3_Q+1)/2);
+
+#define FROMMSG64(i) \
+ g3 = _mm256_shuffle_epi32(f,0x55*i); \
+ g3 = _mm256_sllv_epi32(g3,shift); \
+ g3 = _mm256_shuffle_epi8(g3,idx); \
+ g0 = _mm256_slli_epi16(g3,12); \
+ g1 = _mm256_slli_epi16(g3,8); \
+ g2 = _mm256_slli_epi16(g3,4); \
+ g0 = _mm256_srai_epi16(g0,15); \
+ g1 = _mm256_srai_epi16(g1,15); \
+ g2 = _mm256_srai_epi16(g2,15); \
+ g3 = _mm256_srai_epi16(g3,15); \
+ g0 = _mm256_and_si256(g0,hqs); /* 19 18 17 16 3 2 1 0 */ \
+ g1 = _mm256_and_si256(g1,hqs); /* 23 22 21 20 7 6 5 4 */ \
+ g2 = _mm256_and_si256(g2,hqs); /* 27 26 25 24 11 10 9 8 */ \
+ g3 = _mm256_and_si256(g3,hqs); /* 31 30 29 28 15 14 13 12 */ \
+ h0 = _mm256_unpacklo_epi64(g0,g1); \
+ h2 = _mm256_unpackhi_epi64(g0,g1); \
+ h1 = _mm256_unpacklo_epi64(g2,g3); \
+ h3 = _mm256_unpackhi_epi64(g2,g3); \
+ g0 = _mm256_permute2x128_si256(h0,h1,0x20); \
+ g2 = _mm256_permute2x128_si256(h0,h1,0x31); \
+ g1 = _mm256_permute2x128_si256(h2,h3,0x20); \
+ g3 = _mm256_permute2x128_si256(h2,h3,0x31); \
+ _mm256_store_si256(&r->vec[0+2*i+0],g0); \
+ _mm256_store_si256(&r->vec[0+2*i+1],g1); \
+ _mm256_store_si256(&r->vec[8+2*i+0],g2); \
+ _mm256_store_si256(&r->vec[8+2*i+1],g3)
+
+ // correcting cast-align and cast-qual errors
+ // old version: f = _mm256_loadu_si256((__m256i *)msg);
+ f = _mm256_loadu_si256((const void *)msg);
+ FROMMSG64(0);
+ FROMMSG64(1);
+ FROMMSG64(2);
+ FROMMSG64(3);
+}
+
+/*************************************************
+* Name: poly_tomsg_avx2
+*
+* Description: Convert polynomial to 32-byte message.
+* The coefficients of the input polynomial are assumed to
+* lie in the interval [0,q], i.e. the polynomial must be reduced
+* by poly_reduce_avx2().
+*
+* Arguments: - uint8_t *msg: pointer to output message
+* - poly *a: pointer to input polynomial
+**************************************************/
+void poly_tomsg_avx2(uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES], const poly * restrict a)
+{
+ unsigned int i;
+ uint32_t small;
+ __m256i f0, f1, g0, g1;
+ const __m256i hq = _mm256_set1_epi16((S2N_KYBER_512_R3_Q - 1)/2);
+ const __m256i hhq = _mm256_set1_epi16((S2N_KYBER_512_R3_Q - 1)/4);
+
+ for(i=0;i<S2N_KYBER_512_R3_N/32;i++) {
+ f0 = _mm256_load_si256(&a->vec[2*i+0]);
+ f1 = _mm256_load_si256(&a->vec[2*i+1]);
+ f0 = _mm256_sub_epi16(hq, f0);
+ f1 = _mm256_sub_epi16(hq, f1);
+ g0 = _mm256_srai_epi16(f0, 15);
+ g1 = _mm256_srai_epi16(f1, 15);
+ f0 = _mm256_xor_si256(f0, g0);
+ f1 = _mm256_xor_si256(f1, g1);
+ f0 = _mm256_sub_epi16(f0, hhq);
+ f1 = _mm256_sub_epi16(f1, hhq);
+ f0 = _mm256_packs_epi16(f0, f1);
+ f0 = _mm256_permute4x64_epi64(f0, 0xD8);
+ small = _mm256_movemask_epi8(f0);
+ memcpy(&msg[4*i], &small, 4);
+ }
+}
+
+/*************************************************
+* Name: poly_getnoise_eta1_avx2
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+* with output polynomial close to centered binomial distribution
+* with parameter S2N_KYBER_512_R3_ETA1
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const uint8_t *seed: pointer to input seed
+* (of length S2N_KYBER_512_R3_SYMBYTES bytes)
+* - uint8_t nonce: one-byte input nonce
+**************************************************/
+void poly_getnoise_eta1_avx2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce)
+{
+ ALIGNED_UINT8(S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/4+32) buf; // +32 bytes as required by poly_cbd_eta1_avx2
+ shake256_prf(buf.coeffs, S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/4, seed, nonce);
+ poly_cbd_eta1_avx2(r, buf.vec);
+}
+
+/*************************************************
+* Name: poly_getnoise_eta2_avx2
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+* with output polynomial close to centered binomial distribution
+* with parameter S2N_KYBER_512_R3_ETA2
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const uint8_t *seed: pointer to input seed
+* (of length S2N_KYBER_512_R3_SYMBYTES bytes)
+* - uint8_t nonce: one-byte input nonce
+**************************************************/
+void poly_getnoise_eta2_avx2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce)
+{
+ ALIGNED_UINT8(S2N_KYBER_512_R3_ETA2*S2N_KYBER_512_R3_N/4) buf;
+ shake256_prf(buf.coeffs, S2N_KYBER_512_R3_ETA2*S2N_KYBER_512_R3_N/4, seed, nonce);
+ poly_cbd_eta2_avx2(r, buf.vec);
+}
+
+#define NOISE_NBLOCKS ((S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/4+S2N_KYBER_512_R3_SHAKE256_RATE-1)/S2N_KYBER_512_R3_SHAKE256_RATE)
+void poly_getnoise_eta1_4x(poly *r0,
+ poly *r1,
+ poly *r2,
+ poly *r3,
+ const uint8_t seed[32],
+ uint8_t nonce0,
+ uint8_t nonce1,
+ uint8_t nonce2,
+ uint8_t nonce3)
+{
+ ALIGNED_UINT8(NOISE_NBLOCKS*S2N_KYBER_512_R3_SHAKE256_RATE) buf[4];
+ __m256i f;
+ keccakx4_state state;
+
+ // correcting cast-align and cast-qual errors
+ // old version: f = _mm256_loadu_si256((__m256i *)seed);
+ f = _mm256_loadu_si256((const void *)seed);
+ _mm256_store_si256(buf[0].vec, f);
+ _mm256_store_si256(buf[1].vec, f);
+ _mm256_store_si256(buf[2].vec, f);
+ _mm256_store_si256(buf[3].vec, f);
+
+ buf[0].coeffs[32] = nonce0;
+ buf[1].coeffs[32] = nonce1;
+ buf[2].coeffs[32] = nonce2;
+ buf[3].coeffs[32] = nonce3;
+
+ shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
+ shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
+
+ poly_cbd_eta1_avx2(r0, buf[0].vec);
+ poly_cbd_eta1_avx2(r1, buf[1].vec);
+ poly_cbd_eta1_avx2(r2, buf[2].vec);
+ poly_cbd_eta1_avx2(r3, buf[3].vec);
+}
+
+void poly_getnoise_eta1122_4x(poly *r0,
+ poly *r1,
+ poly *r2,
+ poly *r3,
+ const uint8_t seed[32],
+ uint8_t nonce0,
+ uint8_t nonce1,
+ uint8_t nonce2,
+ uint8_t nonce3)
+{
+ ALIGNED_UINT8(NOISE_NBLOCKS*S2N_KYBER_512_R3_SHAKE256_RATE) buf[4];
+ __m256i f;
+ keccakx4_state state;
+
+ // correcting cast-align and cast-qual errors
+ // old version: f = _mm256_loadu_si256((__m256i *)seed);
+ f = _mm256_loadu_si256((const void *)seed);
+ _mm256_store_si256(buf[0].vec, f);
+ _mm256_store_si256(buf[1].vec, f);
+ _mm256_store_si256(buf[2].vec, f);
+ _mm256_store_si256(buf[3].vec, f);
+
+ buf[0].coeffs[32] = nonce0;
+ buf[1].coeffs[32] = nonce1;
+ buf[2].coeffs[32] = nonce2;
+ buf[3].coeffs[32] = nonce3;
+
+ shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
+ shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
+
+ poly_cbd_eta1_avx2(r0, buf[0].vec);
+ poly_cbd_eta1_avx2(r1, buf[1].vec);
+ poly_cbd_eta2_avx2(r2, buf[2].vec);
+ poly_cbd_eta2_avx2(r3, buf[3].vec);
+}
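
The "1122" in the name records that the first two outputs are sampled with
eta1 = 3 (cbd_eta1) and the last two with eta2 = 2 (cbd_eta2), which lets an
encryption path batch its eta1 and eta2 noise polynomials in one pass. Both
batched samplers absorb four 33-byte seed||nonce blocks through one 4-way
SHAKE-256 state; a standalone sketch of that input layout (not part of the
diff, zero seed for illustration only):

#include <stdint.h>
#include <string.h>

int main(void) {
    uint8_t seed[32] = {0};       /* shared 32-byte seed (random in real use) */
    uint8_t lane[4][33];
    for (int i = 0; i < 4; i++) {
        memcpy(lane[i], seed, 32);
        lane[i][32] = (uint8_t)i; /* per-lane nonce, as buf[i].coeffs[32] above */
    }
    (void)lane; /* each 33-byte block feeds one lane of shake256x4_absorb_once */
    return 0;
}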
+
+/*************************************************
+* Name: poly_ntt_avx2
+*
+* Description: Computes negacyclic number-theoretic transform (NTT) of
+* a polynomial in place.
+* Input coefficients assumed to be in normal order,
+* output coefficients are in special order that is natural
+* for the vectorization. Input coefficients are assumed to be
+* bounded by q in absolute value, output coefficients are bounded
+* by 16118 in absolute value.
+*
+* Arguments: - poly *r: pointer to in/output polynomial
+**************************************************/
+void poly_ntt_avx2(poly *r)
+{
+ ntt_avx2_asm(r->vec, qdata.vec);
+}
+
+/*************************************************
+* Name: poly_invntt_tomont_avx2
+*
+* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+* of a polynomial in place;
+* Input coefficients assumed to be in special order from vectorized
+* forward ntt, output in normal order. Input coefficients can be
+* arbitrary 16-bit integers, output coefficients are bounded by 14870
+* in absolute value.
+*
+* Arguments: - poly *a: pointer to in/output polynomial
+**************************************************/
+void poly_invntt_tomont_avx2(poly *r)
+{
+ invntt_avx2_asm(r->vec, qdata.vec);
+}
+
+void poly_nttunpack_avx2(poly *r)
+{
+ nttunpack_avx2_asm(r->vec, qdata.vec);
+}
+
+/*************************************************
+* Name: poly_basemul_montgomery_avx2
+*
+* Description: Multiplication of two polynomials in NTT domain.
+* One of the input polynomials needs to have coefficients
+* bounded by q, the other polynomial can have arbitrary
+* coefficients. Output coefficients are bounded by 6656.
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const poly *a: pointer to first input polynomial
+* - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_basemul_montgomery_avx2(poly *r, const poly *a, const poly *b)
+{
+ basemul_avx2_asm(r->vec, a->vec, b->vec, qdata.vec);
+}
+
+/*************************************************
+* Name: poly_tomont_avx2
+*
+* Description: Inplace conversion of all coefficients of a polynomial
+* from normal domain to Montgomery domain
+*
+* Arguments: - poly *r: pointer to input/output polynomial
+**************************************************/
+void poly_tomont_avx2(poly *r)
+{
+ tomont_avx2_asm(r->vec, qdata.vec);
+}
+
+/*************************************************
+* Name: poly_reduce_avx2
+*
+* Description: Applies Barrett reduction to all coefficients of a polynomial
+* for details of the Barrett reduction see comments in reduce.c
+*
+* Arguments: - poly *r: pointer to input/output polynomial
+**************************************************/
+void poly_reduce_avx2(poly *r)
+{
+ reduce_avx2_asm(r->vec, qdata.vec);
+}
+
+/*************************************************
+* Name: poly_add_avx2
+*
+* Description: Add two polynomials. No modular reduction
+* is performed.
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const poly *a: pointer to first input polynomial
+* - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_add_avx2(poly *r, const poly *a, const poly *b)
+{
+ unsigned int i;
+ __m256i f0, f1;
+
+ for(i=0;i<S2N_KYBER_512_R3_N/16;i++) {
+ f0 = _mm256_load_si256(&a->vec[i]);
+ f1 = _mm256_load_si256(&b->vec[i]);
+ f0 = _mm256_add_epi16(f0, f1);
+ _mm256_store_si256(&r->vec[i], f0);
+ }
+}
+
+/*************************************************
+* Name: poly_sub_avx2
+*
+* Description: Subtract two polynomials. No modular reduction
+* is performed.
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const poly *a: pointer to first input polynomial
+* - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_sub_avx2(poly *r, const poly *a, const poly *b)
+{
+ unsigned int i;
+ __m256i f0, f1;
+
+ for(i=0;i<S2N_KYBER_512_R3_N/16;i++) {
+ f0 = _mm256_load_si256(&a->vec[i]);
+ f1 = _mm256_load_si256(&b->vec[i]);
+ f0 = _mm256_sub_epi16(f0, f1);
+ _mm256_store_si256(&r->vec[i], f0);
+ }
+}
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.h
new file mode 100644
index 0000000000..bd6e857f79
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include <stdint.h>
+#include "kyber512r3_align_avx2.h"
+#include "kyber512r3_params.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#define poly S2N_KYBER_512_R3_NAMESPACE(poly)
+typedef ALIGNED_INT16(S2N_KYBER_512_R3_N) poly;
+
+#define poly_compress_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_compress_avx2)
+void poly_compress_avx2(uint8_t r[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES], const poly *a);
+
+#define poly_decompress_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_decompress_avx2)
+void poly_decompress_avx2(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES]);
+
+#define poly_tobytes_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_tobytes_avx2)
+void poly_tobytes_avx2(uint8_t r[S2N_KYBER_512_R3_POLYBYTES], const poly *a);
+
+#define poly_frombytes_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_frombytes_avx2)
+void poly_frombytes_avx2(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYBYTES]);
+
+#define poly_frommsg_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_frommsg_avx2)
+void poly_frommsg_avx2(poly *r, const uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES]);
+
+#define poly_tomsg_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_tomsg_avx2)
+void poly_tomsg_avx2(uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES], const poly *r);
+
+#define poly_getnoise_eta1_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta1_avx2)
+void poly_getnoise_eta1_avx2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce);
+
+#define poly_getnoise_eta2_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta2_avx2)
+void poly_getnoise_eta2_avx2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce);
+
+#define poly_getnoise_eta1_4x S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta1_4x)
+void poly_getnoise_eta1_4x(poly *r0,
+ poly *r1,
+ poly *r2,
+ poly *r3,
+ const uint8_t seed[32],
+ uint8_t nonce0,
+ uint8_t nonce1,
+ uint8_t nonce2,
+ uint8_t nonce3);
+
+#define poly_getnoise_eta1122_4x S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta1122_4x)
+void poly_getnoise_eta1122_4x(poly *r0,
+ poly *r1,
+ poly *r2,
+ poly *r3,
+ const uint8_t seed[32],
+ uint8_t nonce0,
+ uint8_t nonce1,
+ uint8_t nonce2,
+ uint8_t nonce3);
+
+#define poly_ntt_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_ntt_avx2)
+void poly_ntt_avx2(poly *r);
+
+#define poly_invntt_tomont_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_invntt_tomont_avx2)
+void poly_invntt_tomont_avx2(poly *r);
+
+#define poly_nttunpack_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_nttunpack_avx2)
+void poly_nttunpack_avx2(poly *r);
+
+#define poly_basemul_montgomery_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_basemul_montgomery_avx2)
+void poly_basemul_montgomery_avx2(poly *r, const poly *a, const poly *b);
+
+#define poly_tomont_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_tomont_avx2)
+void poly_tomont_avx2(poly *r);
+
+#define poly_reduce_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_reduce_avx2)
+void poly_reduce_avx2(poly *r);
+
+#define poly_add_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_add_avx2)
+void poly_add_avx2(poly *r, const poly *a, const poly *b);
+
+#define poly_sub_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_sub_avx2)
+void poly_sub_avx2(poly *r, const poly *a, const poly *b);
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.c
new file mode 100644
index 0000000000..0a84cd092a
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.c
@@ -0,0 +1,186 @@
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_poly.h"
+#include "kyber512r3_polyvec.h"
+
+/*************************************************
+* Name: polyvec_compress
+*
+* Description: Compress and serialize vector of polynomials
+*
+* Arguments: - uint8_t *r: pointer to output byte array
+* (needs space for S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES)
+* - polyvec *a: pointer to input vector of polynomials
+**************************************************/
+void polyvec_compress(uint8_t r[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES], polyvec *a) {
+ polyvec_csubq(a);
+
+ uint16_t t[4];
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ for (unsigned int j = 0; j < S2N_KYBER_512_R3_N / 4; j++) {
+ for (unsigned int k = 0; k < 4; k++)
+ t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + S2N_KYBER_512_R3_Q / 2)
+ / S2N_KYBER_512_R3_Q) & 0x3ff;
+
+ r[0] = (t[0] >> 0);
+ r[1] = (t[0] >> 8) | (t[1] << 2);
+ r[2] = (t[1] >> 6) | (t[2] << 4);
+ r[3] = (t[2] >> 4) | (t[3] << 6);
+ r[4] = (t[3] >> 2);
+ r += 5;
+ }
+ }
+}
+
+/*************************************************
+* Name: polyvec_decompress
+*
+* Description: De-serialize and decompress vector of polynomials;
+* approximate inverse of polyvec_compress
+*
+* Arguments: - polyvec *r: pointer to output vector of polynomials
+* - const uint8_t *a: pointer to input byte array
+* (of length S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES)
+**************************************************/
+void polyvec_decompress(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES]) {
+ uint16_t t[4];
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ for (unsigned int j = 0; j < S2N_KYBER_512_R3_N / 4; j++) {
+ t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8);
+ t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6);
+ t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4);
+ t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2);
+ a += 5;
+
+ for (unsigned int k = 0; k < 4; k++) {
+ r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * S2N_KYBER_512_R3_Q + 512) >> 10;
+ }
+ }
+ }
+}
+
+/*************************************************
+* Name: polyvec_tobytes
+*
+* Description: Serialize vector of polynomials
+*
+* Arguments: - uint8_t *r: pointer to output byte array
+* (needs space for S2N_KYBER_512_R3_POLYVECBYTES)
+* - polyvec *a: pointer to input vector of polynomials
+**************************************************/
+void polyvec_tobytes(uint8_t r[S2N_KYBER_512_R3_POLYVECBYTES], polyvec *a) {
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ poly_tobytes(r + i * S2N_KYBER_512_R3_POLYBYTES, &a->vec[i]);
+ }
+}
+
+/*************************************************
+* Name: polyvec_frombytes
+*
+* Description: De-serialize vector of polynomials;
+* inverse of polyvec_tobytes
+*
+* Arguments: - polyvec *r: pointer to output vector of polynomials
+* - const uint8_t *a: pointer to input byte array
+* (of length S2N_KYBER_512_R3_POLYVECBYTES)
+**************************************************/
+void polyvec_frombytes(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECBYTES]) {
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ poly_frombytes(&r->vec[i], a + i * S2N_KYBER_512_R3_POLYBYTES);
+ }
+}
+
+/*************************************************
+* Name: polyvec_ntt
+*
+* Description: Apply forward NTT to all elements of a vector of polynomials
+*
+* Arguments: - polyvec *r: pointer to in/output vector of polynomials
+**************************************************/
+void polyvec_ntt(polyvec *r) {
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ poly_ntt(&r->vec[i]);
+ }
+}
+
+/*************************************************
+* Name: polyvec_invntt_tomont
+*
+* Description: Apply inverse NTT to all elements of a vector of polynomials
+* and multiply by Montgomery factor 2^16
+*
+* Arguments: - polyvec *r: pointer to in/output vector of polynomials
+**************************************************/
+void polyvec_invntt_tomont(polyvec *r) {
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ poly_invntt_tomont(&r->vec[i]);
+ }
+}
+
+/*************************************************
+* Name: polyvec_pointwise_acc_montgomery
+*
+* Description: Pointwise multiply elements of a and b, accumulate into r,
+* and multiply by 2^-16.
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const polyvec *a: pointer to first input vector of polynomials
+* - const polyvec *b: pointer to second input vector of polynomials
+**************************************************/
+void polyvec_pointwise_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) {
+ poly t;
+
+ poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]);
+ for (unsigned int i = 1; i < S2N_KYBER_512_R3_K; i++) {
+ poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]);
+ poly_add(r, r, &t);
+ }
+
+ poly_reduce(r);
+}
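
This is the inner product <a, b> = sum_i a_i * b_i evaluated in the NTT
domain: accumulate the k pointwise products, then Barrett-reduce once at the
end. The standalone model below (not part of the diff) shows only that
accumulate-then-reduce shape over toy arrays; it multiplies coefficient-wise
and ignores the degree-1 basemul pairing and Montgomery factors of the real
code.

#include <stdio.h>
#include <stdint.h>

#define Q 3329
#define K 2
#define N 8 /* tiny stand-in for the 256 coefficients */

int main(void) {
    int64_t a[K][N], b[K][N], r[N] = {0};
    for (int i = 0; i < K; i++)
        for (int j = 0; j < N; j++) {
            a[i][j] = (7 * i + j) % Q;       /* toy inputs */
            b[i][j] = (3 * j + i + 1) % Q;
        }
    for (int i = 0; i < K; i++)              /* accumulate k products ...   */
        for (int j = 0; j < N; j++)
            r[j] += a[i][j] * b[i][j];
    for (int j = 0; j < N; j++)              /* ... reduce once at the end, */
        r[j] %= Q;                           /* like the final poly_reduce  */
    printf("r[0] = %lld\n", (long long)r[0]);
    return 0;
}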
+
+/*************************************************
+* Name: polyvec_reduce
+*
+* Description: Applies Barrett reduction to each coefficient
+* of each element of a vector of polynomials
+* for details of the Barrett reduction see comments in reduce.c
+*
+* Arguments: - polyvec *r: pointer to input/output vector of polynomials
+**************************************************/
+void polyvec_reduce(polyvec *r) {
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ poly_reduce(&r->vec[i]);
+ }
+}
+
+/*************************************************
+* Name: polyvec_csubq
+*
+* Description: Applies conditional subtraction of q to each coefficient
+* of each element of a vector of polynomials
+* for details of conditional subtraction of q see comments in
+* reduce.c
+*
+* Arguments: - polyvec *r: pointer to input/output vector of polynomials
+**************************************************/
+void polyvec_csubq(polyvec *r) {
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ poly_csubq(&r->vec[i]);
+ }
+}
+
+/*************************************************
+* Name: polyvec_add
+*
+* Description: Add vectors of polynomials
+*
+* Arguments: - polyvec *r: pointer to output vector of polynomials
+* - const polyvec *a: pointer to first input vector of polynomials
+* - const polyvec *b: pointer to second input vector of polynomials
+**************************************************/
+void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) {
+ for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) {
+ poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
+ }
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.h
new file mode 100644
index 0000000000..797f3c0d31
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_poly.h"
+
+#define polyvec S2N_KYBER_512_R3_NAMESPACE(polyvec)
+typedef struct {
+ poly vec[S2N_KYBER_512_R3_K];
+} polyvec;
+
+#define polyvec_compress S2N_KYBER_512_R3_NAMESPACE(polyvec_compress)
+void polyvec_compress(uint8_t r[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES], polyvec *a);
+
+#define polyvec_decompress S2N_KYBER_512_R3_NAMESPACE(polyvec_decompress)
+void polyvec_decompress(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES]);
+
+#define polyvec_tobytes S2N_KYBER_512_R3_NAMESPACE(polyvec_tobytes)
+void polyvec_tobytes(uint8_t r[S2N_KYBER_512_R3_POLYVECBYTES], polyvec *a);
+
+#define polyvec_frombytes S2N_KYBER_512_R3_NAMESPACE(polyvec_frombytes)
+void polyvec_frombytes(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECBYTES]);
+
+#define polyvec_ntt S2N_KYBER_512_R3_NAMESPACE(polyvec_ntt)
+void polyvec_ntt(polyvec *r);
+
+#define polyvec_invntt_tomont S2N_KYBER_512_R3_NAMESPACE(polyvec_invntt_tomont)
+void polyvec_invntt_tomont(polyvec *r);
+
+#define polyvec_pointwise_acc_montgomery S2N_KYBER_512_R3_NAMESPACE(polyvec_pointwise_acc_montgomery)
+void polyvec_pointwise_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);
+
+#define polyvec_reduce S2N_KYBER_512_R3_NAMESPACE(polyvec_reduce)
+void polyvec_reduce(polyvec *r);
+
+#define polyvec_csubq S2N_KYBER_512_R3_NAMESPACE(polyvec_csubq)
+void polyvec_csubq(polyvec *r);
+
+#define polyvec_add S2N_KYBER_512_R3_NAMESPACE(polyvec_add)
+void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.c
new file mode 100644
index 0000000000..8434b96d76
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.c
@@ -0,0 +1,227 @@
+#include <stdint.h>
+#include <string.h>
+#include "kyber512r3_polyvec_avx2.h"
+#include "kyber512r3_poly_avx2.h"
+#include "kyber512r3_consts_avx2.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#include <immintrin.h>
+
+static void poly_compress10(uint8_t r[320], const poly * restrict a)
+{
+ unsigned int i;
+ __m256i f0, f1, f2;
+ __m128i t0, t1;
+ const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
+ const __m256i v8 = _mm256_slli_epi16(v,3);
+ const __m256i off = _mm256_set1_epi16(15);
+ const __m256i shift1 = _mm256_set1_epi16(1 << 12);
+ const __m256i mask = _mm256_set1_epi16(1023);
+ const __m256i shift2 = _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1);
+ const __m256i sllvdidx = _mm256_set1_epi64x(12);
+ const __m256i shufbidx = _mm256_set_epi8( 8, 4, 3, 2, 1, 0,-1,-1,-1,-1,-1,-1,12,11,10, 9,
+ -1,-1,-1,-1,-1,-1,12,11,10, 9, 8, 4, 3, 2, 1, 0);
+
+ for(i=0;i<S2N_KYBER_512_R3_N/16;i++) {
+ f0 = _mm256_load_si256(&a->vec[i]);
+ f1 = _mm256_mullo_epi16(f0,v8);
+ f2 = _mm256_add_epi16(f0,off);
+ f0 = _mm256_slli_epi16(f0,3);
+ f0 = _mm256_mulhi_epi16(f0,v);
+ f2 = _mm256_sub_epi16(f1,f2);
+ f1 = _mm256_andnot_si256(f1,f2);
+ f1 = _mm256_srli_epi16(f1,15);
+ f0 = _mm256_sub_epi16(f0,f1);
+ f0 = _mm256_mulhrs_epi16(f0,shift1);
+ f0 = _mm256_and_si256(f0,mask);
+ f0 = _mm256_madd_epi16(f0,shift2);
+ f0 = _mm256_sllv_epi32(f0,sllvdidx);
+ f0 = _mm256_srli_epi64(f0,12);
+ f0 = _mm256_shuffle_epi8(f0,shufbidx);
+ t0 = _mm256_castsi256_si128(f0);
+ t1 = _mm256_extracti128_si256(f0,1);
+ t0 = _mm_blend_epi16(t0,t1,0xE0);
+ // correcting cast-align error
+ // old version: _mm_storeu_si128((__m128i *)&r[20*i+ 0],t0);
+ _mm_storeu_si128((void *)&r[20*i+ 0],t0);
+ memcpy(&r[20*i+16],&t1,4);
+ }
+}
+
+static void poly_decompress10(poly * restrict r, const uint8_t a[320+12])
+{
+ unsigned int i;
+ __m256i f;
+ const __m256i q = _mm256_set1_epi32((S2N_KYBER_512_R3_Q << 16) + 4*S2N_KYBER_512_R3_Q);
+ const __m256i shufbidx = _mm256_set_epi8(11,10,10, 9, 9, 8, 8, 7,
+ 6, 5, 5, 4, 4, 3, 3, 2,
+ 9, 8, 8, 7, 7, 6, 6, 5,
+ 4, 3, 3, 2, 2, 1, 1, 0);
+ const __m256i sllvdidx = _mm256_set1_epi64x(4);
+ const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184);
+
+ for(i=0;i<S2N_KYBER_512_R3_N/16;i++) {
+ // correcting cast-align and cast-qual errors
+ // old version: f = _mm256_loadu_si256((__m256i *)&a[20*i]);
+ f = _mm256_loadu_si256((const void *)&a[20*i]);
+ f = _mm256_permute4x64_epi64(f,0x94);
+ f = _mm256_shuffle_epi8(f,shufbidx);
+ f = _mm256_sllv_epi32(f,sllvdidx);
+ f = _mm256_srli_epi16(f,1);
+ f = _mm256_and_si256(f,mask);
+ f = _mm256_mulhrs_epi16(f,q);
+ _mm256_store_si256(&r->vec[i],f);
+ }
+}
+
+/*************************************************
+* Name: polyvec_compress_avx2
+*
+* Description: Compress and serialize vector of polynomials
+*
+* Arguments: - uint8_t *r: pointer to output byte array
+* (needs space for S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES)
+* - polyvec *a: pointer to input vector of polynomials
+**************************************************/
+void polyvec_compress_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+2], const polyvec *a)
+{
+ unsigned int i;
+
+ for(i=0;i<S2N_KYBER_512_R3_K;i++)
+ poly_compress10(&r[320*i],&a->vec[i]);
+}
+
+/*************************************************
+* Name: polyvec_decompress_avx2
+*
+* Description: De-serialize and decompress vector of polynomials;
+* approximate inverse of polyvec_compress_avx2
+*
+* Arguments: - polyvec *r: pointer to output vector of polynomials
+* - const uint8_t *a: pointer to input byte array
+* (of length S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES)
+**************************************************/
+void polyvec_decompress_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+12])
+{
+ unsigned int i;
+
+ for(i=0;i<S2N_KYBER_512_R3_K;i++)
+ poly_decompress10(&r->vec[i],&a[320*i]);
+}
+
+/*************************************************
+* Name: polyvec_tobytes_avx2
+*
+* Description: Serialize vector of polynomials
+*
+* Arguments: - uint8_t *r: pointer to output byte array
+* (needs space for S2N_KYBER_512_R3_POLYVECBYTES)
+* - polyvec *a: pointer to input vector of polynomials
+**************************************************/
+void polyvec_tobytes_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECBYTES], const polyvec *a)
+{
+ unsigned int i;
+ for(i=0;i<S2N_KYBER_512_R3_K;i++)
+ poly_tobytes_avx2(r+i*S2N_KYBER_512_R3_POLYBYTES, &a->vec[i]);
+}
+
+/*************************************************
+* Name: polyvec_frombytes_avx2
+*
+* Description: De-serialize vector of polynomials;
+* inverse of polyvec_tobytes_avx2
+*
+* Arguments: - polyvec *r: pointer to output vector of polynomials
+* - const uint8_t *a: pointer to input byte array
+* (of length S2N_KYBER_512_R3_POLYVECBYTES)
+**************************************************/
+void polyvec_frombytes_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECBYTES])
+{
+ unsigned int i;
+ for(i=0;i<S2N_KYBER_512_R3_K;i++)
+ poly_frombytes_avx2(&r->vec[i], a+i*S2N_KYBER_512_R3_POLYBYTES);
+}
+
+/*************************************************
+* Name: polyvec_ntt_avx2
+*
+* Description: Apply forward NTT to all elements of a vector of polynomials
+*
+* Arguments: - polyvec *r: pointer to in/output vector of polynomials
+**************************************************/
+void polyvec_ntt_avx2(polyvec *r)
+{
+ unsigned int i;
+ for(i=0;i<S2N_KYBER_512_R3_K;i++)
+ poly_ntt_avx2(&r->vec[i]);
+}
+
+/*************************************************
+* Name: polyvec_invntt_tomont_avx2
+*
+* Description: Apply inverse NTT to all elements of a vector of polynomials
+* and multiply by Montgomery factor 2^16
+*
+* Arguments: - polyvec *r: pointer to in/output vector of polynomials
+**************************************************/
+void polyvec_invntt_tomont_avx2(polyvec *r)
+{
+ unsigned int i;
+ for(i=0;i<S2N_KYBER_512_R3_K;i++)
+ poly_invntt_tomont_avx2(&r->vec[i]);
+}
+
+/*************************************************
+* Name: polyvec_basemul_acc_montgomery_avx2
+*
+* Description: Multiply elements in a and b in NTT domain, accumulate into r,
+* and multiply by 2^-16.
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const polyvec *a: pointer to first input vector of polynomials
+* - const polyvec *b: pointer to second input vector of polynomials
+**************************************************/
+void polyvec_basemul_acc_montgomery_avx2(poly *r, const polyvec *a, const polyvec *b)
+{
+ unsigned int i;
+ poly tmp;
+
+ poly_basemul_montgomery_avx2(r,&a->vec[0],&b->vec[0]);
+ for(i=1;i<S2N_KYBER_512_R3_K;i++) {
+ poly_basemul_montgomery_avx2(&tmp,&a->vec[i],&b->vec[i]);
+ poly_add_avx2(r,r,&tmp);
+ }
+}
+
+/*************************************************
+* Name: polyvec_reduce_avx2
+*
+* Description: Applies Barrett reduction to each coefficient
+* of each element of a vector of polynomials;
+* for details of the Barrett reduction see comments in reduce.c
+*
+* Arguments: - polyvec *r: pointer to input/output vector of polynomials
+**************************************************/
+void polyvec_reduce_avx2(polyvec *r)
+{
+ unsigned int i;
+ for(i=0;i<S2N_KYBER_512_R3_K;i++)
+ poly_reduce_avx2(&r->vec[i]);
+}
+
+/*************************************************
+* Name: polyvec_add_avx2
+*
+* Description: Add vectors of polynomials
+*
+* Arguments: - polyvec *r: pointer to output vector of polynomials
+* - const polyvec *a: pointer to first input vector of polynomials
+* - const polyvec *b: pointer to second input vector of polynomials
+**************************************************/
+void polyvec_add_avx2(polyvec *r, const polyvec *a, const polyvec *b)
+{
+ unsigned int i;
+ for(i=0;i<S2N_KYBER_512_R3_K;i++)
+ poly_add_avx2(&r->vec[i], &a->vec[i], &b->vec[i]);
+}
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.h
new file mode 100644
index 0000000000..536e1b23d0
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_poly_avx2.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#define polyvec S2N_KYBER_512_R3_NAMESPACE(polyvec)
+typedef struct{
+ poly vec[S2N_KYBER_512_R3_K];
+} polyvec;
+
+#define polyvec_compress_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_compress_avx2)
+void polyvec_compress_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+2], const polyvec *a);
+
+#define polyvec_decompress_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_decompress_avx2)
+void polyvec_decompress_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+12]);
+
+#define polyvec_tobytes_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_tobytes_avx2)
+void polyvec_tobytes_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECBYTES], const polyvec *a);
+
+#define polyvec_frombytes_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_frombytes_avx2)
+void polyvec_frombytes_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECBYTES]);
+
+#define polyvec_ntt_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_ntt_avx2)
+void polyvec_ntt_avx2(polyvec *r);
+
+#define polyvec_invntt_tomont_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_invntt_tomont_avx2)
+void polyvec_invntt_tomont_avx2(polyvec *r);
+
+#define polyvec_basemul_acc_montgomery_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_basemul_acc_montgomery_avx2)
+void polyvec_basemul_acc_montgomery_avx2(poly *r, const polyvec *a, const polyvec *b);
+
+#define polyvec_reduce_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_reduce_avx2)
+void polyvec_reduce_avx2(polyvec *r);
+
+#define polyvec_add_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_add_avx2)
+void polyvec_add_avx2(polyvec *r, const polyvec *a, const polyvec *b);
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.c
new file mode 100644
index 0000000000..6219ad7e88
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.c
@@ -0,0 +1,60 @@
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_reduce.h"
+
+/*************************************************
+* Name: montgomery_reduce
+*
+* Description: Montgomery reduction; given a 32-bit integer a, computes
+* 16-bit integer congruent to a * R^-1 mod q,
+* where R=2^16
+*
+* Arguments: - int32_t a: input integer to be reduced;
+* has to be in {-q*2^15,...,q*2^15-1}
+*
+* Returns: integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q.
+**************************************************/
+int16_t montgomery_reduce(int32_t a) {
+ int32_t t;
+ int16_t u;
+
+ u = a * S2N_KYBER_512_R3_QINV;
+ t = (int32_t)u * S2N_KYBER_512_R3_Q;
+ t = a - t;
+ t >>= 16;
+ return t;
+}
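
Montgomery reduction returns r with r * 2^16 congruent to a mod q, which is what makes it cheap: one low-half multiply, one full multiply, and a shift. A minimal standalone sketch using the q = 3329 and QINV = 62209 values defined in this diff (the unsigned multiply is an overflow-safe adaptation for the sketch):

    #include <assert.h>
    #include <stdint.h>

    static int16_t montgomery_demo(int32_t a) {
        /* u = a * q^-1, kept modulo 2^16 */
        int16_t u = (int16_t)(uint16_t)((uint32_t)a * 62209u);
        int32_t t = (int32_t)u * 3329;
        return (int16_t)((a - t) >> 16); /* a - t is an exact multiple of 2^16 */
    }

    int main(void) {
        int32_t a = 12345;
        int16_t r = montgomery_demo(a); /* r = -978 */
        assert(((int32_t)r * 65536 - a) % 3329 == 0); /* r * 2^16 == a (mod q) */
        return 0;
    }
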
+
+/*************************************************
+* Name: barrett_reduce
+*
+* Description: Barrett reduction; given a 16-bit integer a, computes
+* 16-bit integer congruent to a mod q in {0,...,q}
+*
+* Arguments: - int16_t a: input integer to be reduced
+*
+* Returns: integer in {0,...,q} congruent to a modulo q.
+**************************************************/
+int16_t barrett_reduce(int16_t a) {
+ int16_t t;
+ const int16_t v = ((1U << 26) + S2N_KYBER_512_R3_Q / 2) / S2N_KYBER_512_R3_Q;
+
+ t = (int32_t)v * a >> 26;
+ t *= S2N_KYBER_512_R3_Q;
+ return a - t;
+}
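
Here v = floor((2^26 + q/2) / q) = 20159, so (v*a) >> 26 approximates round(a/q) and a - t*q lands in {0,...,q}; note that q itself is reachable, which is why the range above is inclusive. A small standalone check (values hard-coded from this diff):

    #include <assert.h>
    #include <stdint.h>

    static int16_t barrett_demo(int16_t a) {
        const int16_t v = ((1U << 26) + 3329 / 2) / 3329; /* = 20159 */
        int16_t t = (int16_t)(((int32_t)v * a) >> 26);    /* ~ round(a / q) */
        return a - t * 3329;
    }

    int main(void) {
        assert(barrett_demo(30000) == 39);   /* 30000 = 9*3329 + 39 */
        assert(barrett_demo(-3329) == 3329); /* congruent to 0; q itself can occur */
        return 0;
    }
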
+
+/*************************************************
+* Name: csubq
+*
+* Description: Conditionally subtract q
+*
+* Arguments: - int16_t a: input integer
+*
+* Returns: a - q if a >= q, else a
+**************************************************/
+int16_t csubq(int16_t a) {
+ a -= S2N_KYBER_512_R3_Q;
+ a += (a >> 15) & S2N_KYBER_512_R3_Q;
+ return a;
+}
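
The conditional subtraction is branchless: after a -= q, the arithmetic shift a >> 15 smears the sign bit into an all-ones mask exactly when the subtraction undershot, so q is added back without a data-dependent branch. A standalone check (q = 3329 hard-coded):

    #include <assert.h>
    #include <stdint.h>

    static int16_t csubq_demo(int16_t a) {
        a -= 3329;
        a += (a >> 15) & 3329; /* mask is all ones only when a went negative */
        return a;
    }

    int main(void) {
        assert(csubq_demo(3330) == 1); /* a >= q: result is a - q */
        assert(csubq_demo(5) == 5);    /* a <  q: result is unchanged */
        return 0;
    }
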
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.h
new file mode 100644
index 0000000000..bab9fa54f9
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <stdint.h>
+#include "kyber512r3_params.h"
+
+#define S2N_KYBER_512_R3_QINV 62209 /* q^-1 mod 2^16 */
+
+#define montgomery_reduce S2N_KYBER_512_R3_NAMESPACE(montgomery_reduce)
+int16_t montgomery_reduce(int32_t a);
+
+#define barrett_reduce S2N_KYBER_512_R3_NAMESPACE(barrett_reduce)
+int16_t barrett_reduce(int16_t a);
+
+#define csubq S2N_KYBER_512_R3_NAMESPACE(csubq)
+int16_t csubq(int16_t x);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce_avx2.h
new file mode 100644
index 0000000000..24f0ede4e0
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce_avx2.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "kyber512r3_params.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#include <immintrin.h>
+
+#define reduce_avx2_asm S2N_KYBER_512_R3_NAMESPACE(reduce_avx2_asm)
+void reduce_avx2_asm(__m256i *r, const __m256i *qdata);
+
+#define tomont_avx2_asm S2N_KYBER_512_R3_NAMESPACE(tomont_avx2_asm)
+void tomont_avx2_asm(__m256i *r, const __m256i *qdata);
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.c
new file mode 100644
index 0000000000..1461e0b9b1
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.c
@@ -0,0 +1,420 @@
+#include <stdint.h>
+#include <string.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_consts_avx2.h"
+#include "kyber512r3_rejsample_avx2.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#include <immintrin.h>
+
+//#define BMI
+
+#ifndef BMI
+static const uint8_t idx[256][8] = {
+ {-1, -1, -1, -1, -1, -1, -1, -1},
+ { 0, -1, -1, -1, -1, -1, -1, -1},
+ { 2, -1, -1, -1, -1, -1, -1, -1},
+ { 0, 2, -1, -1, -1, -1, -1, -1},
+ { 4, -1, -1, -1, -1, -1, -1, -1},
+ { 0, 4, -1, -1, -1, -1, -1, -1},
+ { 2, 4, -1, -1, -1, -1, -1, -1},
+ { 0, 2, 4, -1, -1, -1, -1, -1},
+ { 6, -1, -1, -1, -1, -1, -1, -1},
+ { 0, 6, -1, -1, -1, -1, -1, -1},
+ { 2, 6, -1, -1, -1, -1, -1, -1},
+ { 0, 2, 6, -1, -1, -1, -1, -1},
+ { 4, 6, -1, -1, -1, -1, -1, -1},
+ { 0, 4, 6, -1, -1, -1, -1, -1},
+ { 2, 4, 6, -1, -1, -1, -1, -1},
+ { 0, 2, 4, 6, -1, -1, -1, -1},
+ { 8, -1, -1, -1, -1, -1, -1, -1},
+ { 0, 8, -1, -1, -1, -1, -1, -1},
+ { 2, 8, -1, -1, -1, -1, -1, -1},
+ { 0, 2, 8, -1, -1, -1, -1, -1},
+ { 4, 8, -1, -1, -1, -1, -1, -1},
+ { 0, 4, 8, -1, -1, -1, -1, -1},
+ { 2, 4, 8, -1, -1, -1, -1, -1},
+ { 0, 2, 4, 8, -1, -1, -1, -1},
+ { 6, 8, -1, -1, -1, -1, -1, -1},
+ { 0, 6, 8, -1, -1, -1, -1, -1},
+ { 2, 6, 8, -1, -1, -1, -1, -1},
+ { 0, 2, 6, 8, -1, -1, -1, -1},
+ { 4, 6, 8, -1, -1, -1, -1, -1},
+ { 0, 4, 6, 8, -1, -1, -1, -1},
+ { 2, 4, 6, 8, -1, -1, -1, -1},
+ { 0, 2, 4, 6, 8, -1, -1, -1},
+ {10, -1, -1, -1, -1, -1, -1, -1},
+ { 0, 10, -1, -1, -1, -1, -1, -1},
+ { 2, 10, -1, -1, -1, -1, -1, -1},
+ { 0, 2, 10, -1, -1, -1, -1, -1},
+ { 4, 10, -1, -1, -1, -1, -1, -1},
+ { 0, 4, 10, -1, -1, -1, -1, -1},
+ { 2, 4, 10, -1, -1, -1, -1, -1},
+ { 0, 2, 4, 10, -1, -1, -1, -1},
+ { 6, 10, -1, -1, -1, -1, -1, -1},
+ { 0, 6, 10, -1, -1, -1, -1, -1},
+ { 2, 6, 10, -1, -1, -1, -1, -1},
+ { 0, 2, 6, 10, -1, -1, -1, -1},
+ { 4, 6, 10, -1, -1, -1, -1, -1},
+ { 0, 4, 6, 10, -1, -1, -1, -1},
+ { 2, 4, 6, 10, -1, -1, -1, -1},
+ { 0, 2, 4, 6, 10, -1, -1, -1},
+ { 8, 10, -1, -1, -1, -1, -1, -1},
+ { 0, 8, 10, -1, -1, -1, -1, -1},
+ { 2, 8, 10, -1, -1, -1, -1, -1},
+ { 0, 2, 8, 10, -1, -1, -1, -1},
+ { 4, 8, 10, -1, -1, -1, -1, -1},
+ { 0, 4, 8, 10, -1, -1, -1, -1},
+ { 2, 4, 8, 10, -1, -1, -1, -1},
+ { 0, 2, 4, 8, 10, -1, -1, -1},
+ { 6, 8, 10, -1, -1, -1, -1, -1},
+ { 0, 6, 8, 10, -1, -1, -1, -1},
+ { 2, 6, 8, 10, -1, -1, -1, -1},
+ { 0, 2, 6, 8, 10, -1, -1, -1},
+ { 4, 6, 8, 10, -1, -1, -1, -1},
+ { 0, 4, 6, 8, 10, -1, -1, -1},
+ { 2, 4, 6, 8, 10, -1, -1, -1},
+ { 0, 2, 4, 6, 8, 10, -1, -1},
+ {12, -1, -1, -1, -1, -1, -1, -1},
+ { 0, 12, -1, -1, -1, -1, -1, -1},
+ { 2, 12, -1, -1, -1, -1, -1, -1},
+ { 0, 2, 12, -1, -1, -1, -1, -1},
+ { 4, 12, -1, -1, -1, -1, -1, -1},
+ { 0, 4, 12, -1, -1, -1, -1, -1},
+ { 2, 4, 12, -1, -1, -1, -1, -1},
+ { 0, 2, 4, 12, -1, -1, -1, -1},
+ { 6, 12, -1, -1, -1, -1, -1, -1},
+ { 0, 6, 12, -1, -1, -1, -1, -1},
+ { 2, 6, 12, -1, -1, -1, -1, -1},
+ { 0, 2, 6, 12, -1, -1, -1, -1},
+ { 4, 6, 12, -1, -1, -1, -1, -1},
+ { 0, 4, 6, 12, -1, -1, -1, -1},
+ { 2, 4, 6, 12, -1, -1, -1, -1},
+ { 0, 2, 4, 6, 12, -1, -1, -1},
+ { 8, 12, -1, -1, -1, -1, -1, -1},
+ { 0, 8, 12, -1, -1, -1, -1, -1},
+ { 2, 8, 12, -1, -1, -1, -1, -1},
+ { 0, 2, 8, 12, -1, -1, -1, -1},
+ { 4, 8, 12, -1, -1, -1, -1, -1},
+ { 0, 4, 8, 12, -1, -1, -1, -1},
+ { 2, 4, 8, 12, -1, -1, -1, -1},
+ { 0, 2, 4, 8, 12, -1, -1, -1},
+ { 6, 8, 12, -1, -1, -1, -1, -1},
+ { 0, 6, 8, 12, -1, -1, -1, -1},
+ { 2, 6, 8, 12, -1, -1, -1, -1},
+ { 0, 2, 6, 8, 12, -1, -1, -1},
+ { 4, 6, 8, 12, -1, -1, -1, -1},
+ { 0, 4, 6, 8, 12, -1, -1, -1},
+ { 2, 4, 6, 8, 12, -1, -1, -1},
+ { 0, 2, 4, 6, 8, 12, -1, -1},
+ {10, 12, -1, -1, -1, -1, -1, -1},
+ { 0, 10, 12, -1, -1, -1, -1, -1},
+ { 2, 10, 12, -1, -1, -1, -1, -1},
+ { 0, 2, 10, 12, -1, -1, -1, -1},
+ { 4, 10, 12, -1, -1, -1, -1, -1},
+ { 0, 4, 10, 12, -1, -1, -1, -1},
+ { 2, 4, 10, 12, -1, -1, -1, -1},
+ { 0, 2, 4, 10, 12, -1, -1, -1},
+ { 6, 10, 12, -1, -1, -1, -1, -1},
+ { 0, 6, 10, 12, -1, -1, -1, -1},
+ { 2, 6, 10, 12, -1, -1, -1, -1},
+ { 0, 2, 6, 10, 12, -1, -1, -1},
+ { 4, 6, 10, 12, -1, -1, -1, -1},
+ { 0, 4, 6, 10, 12, -1, -1, -1},
+ { 2, 4, 6, 10, 12, -1, -1, -1},
+ { 0, 2, 4, 6, 10, 12, -1, -1},
+ { 8, 10, 12, -1, -1, -1, -1, -1},
+ { 0, 8, 10, 12, -1, -1, -1, -1},
+ { 2, 8, 10, 12, -1, -1, -1, -1},
+ { 0, 2, 8, 10, 12, -1, -1, -1},
+ { 4, 8, 10, 12, -1, -1, -1, -1},
+ { 0, 4, 8, 10, 12, -1, -1, -1},
+ { 2, 4, 8, 10, 12, -1, -1, -1},
+ { 0, 2, 4, 8, 10, 12, -1, -1},
+ { 6, 8, 10, 12, -1, -1, -1, -1},
+ { 0, 6, 8, 10, 12, -1, -1, -1},
+ { 2, 6, 8, 10, 12, -1, -1, -1},
+ { 0, 2, 6, 8, 10, 12, -1, -1},
+ { 4, 6, 8, 10, 12, -1, -1, -1},
+ { 0, 4, 6, 8, 10, 12, -1, -1},
+ { 2, 4, 6, 8, 10, 12, -1, -1},
+ { 0, 2, 4, 6, 8, 10, 12, -1},
+ {14, -1, -1, -1, -1, -1, -1, -1},
+ { 0, 14, -1, -1, -1, -1, -1, -1},
+ { 2, 14, -1, -1, -1, -1, -1, -1},
+ { 0, 2, 14, -1, -1, -1, -1, -1},
+ { 4, 14, -1, -1, -1, -1, -1, -1},
+ { 0, 4, 14, -1, -1, -1, -1, -1},
+ { 2, 4, 14, -1, -1, -1, -1, -1},
+ { 0, 2, 4, 14, -1, -1, -1, -1},
+ { 6, 14, -1, -1, -1, -1, -1, -1},
+ { 0, 6, 14, -1, -1, -1, -1, -1},
+ { 2, 6, 14, -1, -1, -1, -1, -1},
+ { 0, 2, 6, 14, -1, -1, -1, -1},
+ { 4, 6, 14, -1, -1, -1, -1, -1},
+ { 0, 4, 6, 14, -1, -1, -1, -1},
+ { 2, 4, 6, 14, -1, -1, -1, -1},
+ { 0, 2, 4, 6, 14, -1, -1, -1},
+ { 8, 14, -1, -1, -1, -1, -1, -1},
+ { 0, 8, 14, -1, -1, -1, -1, -1},
+ { 2, 8, 14, -1, -1, -1, -1, -1},
+ { 0, 2, 8, 14, -1, -1, -1, -1},
+ { 4, 8, 14, -1, -1, -1, -1, -1},
+ { 0, 4, 8, 14, -1, -1, -1, -1},
+ { 2, 4, 8, 14, -1, -1, -1, -1},
+ { 0, 2, 4, 8, 14, -1, -1, -1},
+ { 6, 8, 14, -1, -1, -1, -1, -1},
+ { 0, 6, 8, 14, -1, -1, -1, -1},
+ { 2, 6, 8, 14, -1, -1, -1, -1},
+ { 0, 2, 6, 8, 14, -1, -1, -1},
+ { 4, 6, 8, 14, -1, -1, -1, -1},
+ { 0, 4, 6, 8, 14, -1, -1, -1},
+ { 2, 4, 6, 8, 14, -1, -1, -1},
+ { 0, 2, 4, 6, 8, 14, -1, -1},
+ {10, 14, -1, -1, -1, -1, -1, -1},
+ { 0, 10, 14, -1, -1, -1, -1, -1},
+ { 2, 10, 14, -1, -1, -1, -1, -1},
+ { 0, 2, 10, 14, -1, -1, -1, -1},
+ { 4, 10, 14, -1, -1, -1, -1, -1},
+ { 0, 4, 10, 14, -1, -1, -1, -1},
+ { 2, 4, 10, 14, -1, -1, -1, -1},
+ { 0, 2, 4, 10, 14, -1, -1, -1},
+ { 6, 10, 14, -1, -1, -1, -1, -1},
+ { 0, 6, 10, 14, -1, -1, -1, -1},
+ { 2, 6, 10, 14, -1, -1, -1, -1},
+ { 0, 2, 6, 10, 14, -1, -1, -1},
+ { 4, 6, 10, 14, -1, -1, -1, -1},
+ { 0, 4, 6, 10, 14, -1, -1, -1},
+ { 2, 4, 6, 10, 14, -1, -1, -1},
+ { 0, 2, 4, 6, 10, 14, -1, -1},
+ { 8, 10, 14, -1, -1, -1, -1, -1},
+ { 0, 8, 10, 14, -1, -1, -1, -1},
+ { 2, 8, 10, 14, -1, -1, -1, -1},
+ { 0, 2, 8, 10, 14, -1, -1, -1},
+ { 4, 8, 10, 14, -1, -1, -1, -1},
+ { 0, 4, 8, 10, 14, -1, -1, -1},
+ { 2, 4, 8, 10, 14, -1, -1, -1},
+ { 0, 2, 4, 8, 10, 14, -1, -1},
+ { 6, 8, 10, 14, -1, -1, -1, -1},
+ { 0, 6, 8, 10, 14, -1, -1, -1},
+ { 2, 6, 8, 10, 14, -1, -1, -1},
+ { 0, 2, 6, 8, 10, 14, -1, -1},
+ { 4, 6, 8, 10, 14, -1, -1, -1},
+ { 0, 4, 6, 8, 10, 14, -1, -1},
+ { 2, 4, 6, 8, 10, 14, -1, -1},
+ { 0, 2, 4, 6, 8, 10, 14, -1},
+ {12, 14, -1, -1, -1, -1, -1, -1},
+ { 0, 12, 14, -1, -1, -1, -1, -1},
+ { 2, 12, 14, -1, -1, -1, -1, -1},
+ { 0, 2, 12, 14, -1, -1, -1, -1},
+ { 4, 12, 14, -1, -1, -1, -1, -1},
+ { 0, 4, 12, 14, -1, -1, -1, -1},
+ { 2, 4, 12, 14, -1, -1, -1, -1},
+ { 0, 2, 4, 12, 14, -1, -1, -1},
+ { 6, 12, 14, -1, -1, -1, -1, -1},
+ { 0, 6, 12, 14, -1, -1, -1, -1},
+ { 2, 6, 12, 14, -1, -1, -1, -1},
+ { 0, 2, 6, 12, 14, -1, -1, -1},
+ { 4, 6, 12, 14, -1, -1, -1, -1},
+ { 0, 4, 6, 12, 14, -1, -1, -1},
+ { 2, 4, 6, 12, 14, -1, -1, -1},
+ { 0, 2, 4, 6, 12, 14, -1, -1},
+ { 8, 12, 14, -1, -1, -1, -1, -1},
+ { 0, 8, 12, 14, -1, -1, -1, -1},
+ { 2, 8, 12, 14, -1, -1, -1, -1},
+ { 0, 2, 8, 12, 14, -1, -1, -1},
+ { 4, 8, 12, 14, -1, -1, -1, -1},
+ { 0, 4, 8, 12, 14, -1, -1, -1},
+ { 2, 4, 8, 12, 14, -1, -1, -1},
+ { 0, 2, 4, 8, 12, 14, -1, -1},
+ { 6, 8, 12, 14, -1, -1, -1, -1},
+ { 0, 6, 8, 12, 14, -1, -1, -1},
+ { 2, 6, 8, 12, 14, -1, -1, -1},
+ { 0, 2, 6, 8, 12, 14, -1, -1},
+ { 4, 6, 8, 12, 14, -1, -1, -1},
+ { 0, 4, 6, 8, 12, 14, -1, -1},
+ { 2, 4, 6, 8, 12, 14, -1, -1},
+ { 0, 2, 4, 6, 8, 12, 14, -1},
+ {10, 12, 14, -1, -1, -1, -1, -1},
+ { 0, 10, 12, 14, -1, -1, -1, -1},
+ { 2, 10, 12, 14, -1, -1, -1, -1},
+ { 0, 2, 10, 12, 14, -1, -1, -1},
+ { 4, 10, 12, 14, -1, -1, -1, -1},
+ { 0, 4, 10, 12, 14, -1, -1, -1},
+ { 2, 4, 10, 12, 14, -1, -1, -1},
+ { 0, 2, 4, 10, 12, 14, -1, -1},
+ { 6, 10, 12, 14, -1, -1, -1, -1},
+ { 0, 6, 10, 12, 14, -1, -1, -1},
+ { 2, 6, 10, 12, 14, -1, -1, -1},
+ { 0, 2, 6, 10, 12, 14, -1, -1},
+ { 4, 6, 10, 12, 14, -1, -1, -1},
+ { 0, 4, 6, 10, 12, 14, -1, -1},
+ { 2, 4, 6, 10, 12, 14, -1, -1},
+ { 0, 2, 4, 6, 10, 12, 14, -1},
+ { 8, 10, 12, 14, -1, -1, -1, -1},
+ { 0, 8, 10, 12, 14, -1, -1, -1},
+ { 2, 8, 10, 12, 14, -1, -1, -1},
+ { 0, 2, 8, 10, 12, 14, -1, -1},
+ { 4, 8, 10, 12, 14, -1, -1, -1},
+ { 0, 4, 8, 10, 12, 14, -1, -1},
+ { 2, 4, 8, 10, 12, 14, -1, -1},
+ { 0, 2, 4, 8, 10, 12, 14, -1},
+ { 6, 8, 10, 12, 14, -1, -1, -1},
+ { 0, 6, 8, 10, 12, 14, -1, -1},
+ { 2, 6, 8, 10, 12, 14, -1, -1},
+ { 0, 2, 6, 8, 10, 12, 14, -1},
+ { 4, 6, 8, 10, 12, 14, -1, -1},
+ { 0, 4, 6, 8, 10, 12, 14, -1},
+ { 2, 4, 6, 8, 10, 12, 14, -1},
+ { 0, 2, 4, 6, 8, 10, 12, 14}
+};
+#endif
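
Row g of the table lists, in order, the byte offsets (2 * lane) of the 16-bit lanes whose bit is set in the 8-bit mask g, padded with -1 (0xFF); the loads below turn each row into a byte-shuffle control that compacts accepted coefficients to the front of a vector. A sketch that regenerates the table (illustrative, not part of the diff):

    #include <stdio.h>

    int main(void) {
        for (int g = 0; g < 256; g++) {
            int row[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
            int k = 0;
            for (int lane = 0; lane < 8; lane++)
                if (g & (1 << lane))
                    row[k++] = 2 * lane; /* byte offset of an accepted 16-bit lane */
            printf("    {");
            for (int j = 0; j < 8; j++)
                printf("%2d%s", row[j], j < 7 ? ", " : "");
            printf("},\n");
        }
        return 0;
    }
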
+
+#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a)
+#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)
+
+unsigned int rej_uniform_avx2(int16_t * restrict r, const uint8_t *buf)
+{
+ unsigned int ctr, pos;
+ uint16_t val0, val1;
+ uint32_t good;
+#ifdef BMI
+ uint64_t idx0, idx1, idx2, idx3;
+#endif
+ const __m256i bound = _mm256_load_si256(&qdata.vec[_16XQ/16]);
+ const __m256i ones = _mm256_set1_epi8(1);
+ const __m256i mask = _mm256_set1_epi16(0xFFF);
+ const __m256i idx8 = _mm256_set_epi8(15,14,14,13,12,11,11,10,
+ 9, 8, 8, 7, 6, 5, 5, 4,
+ 11,10,10, 9, 8, 7, 7, 6,
+ 5, 4, 4, 3, 2, 1, 1, 0);
+ __m256i f0, f1, g0, g1, g2, g3;
+ __m128i f, t, pilo, pihi;
+
+ ctr = pos = 0;
+ while(ctr <= S2N_KYBER_512_R3_N - 32 && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 48) {
+ // correcting cast-align and cast-qual errors
+ // old version: f0 = _mm256_loadu_si256((__m256i *)&buf[pos]);
+ f0 = _mm256_loadu_si256((const void *)&buf[pos]);
+ // old version: f1 = _mm256_loadu_si256((__m256i *)&buf[pos+24]);
+ f1 = _mm256_loadu_si256((const void *)&buf[pos+24]);
+ f0 = _mm256_permute4x64_epi64(f0, 0x94);
+ f1 = _mm256_permute4x64_epi64(f1, 0x94);
+ f0 = _mm256_shuffle_epi8(f0, idx8);
+ f1 = _mm256_shuffle_epi8(f1, idx8);
+ g0 = _mm256_srli_epi16(f0, 4);
+ g1 = _mm256_srli_epi16(f1, 4);
+ f0 = _mm256_blend_epi16(f0, g0, 0xAA);
+ f1 = _mm256_blend_epi16(f1, g1, 0xAA);
+ f0 = _mm256_and_si256(f0, mask);
+ f1 = _mm256_and_si256(f1, mask);
+ pos += 48;
+
+ g0 = _mm256_cmpgt_epi16(bound, f0);
+ g1 = _mm256_cmpgt_epi16(bound, f1);
+
+ g0 = _mm256_packs_epi16(g0, g1);
+ good = _mm256_movemask_epi8(g0);
+
+#ifdef BMI
+ idx0 = _pdep_u64(good >> 0, 0x0101010101010101);
+ idx1 = _pdep_u64(good >> 8, 0x0101010101010101);
+ idx2 = _pdep_u64(good >> 16, 0x0101010101010101);
+ idx3 = _pdep_u64(good >> 24, 0x0101010101010101);
+ idx0 = (idx0 << 8) - idx0;
+ idx0 = _pext_u64(0x0E0C0A0806040200, idx0);
+ idx1 = (idx1 << 8) - idx1;
+ idx1 = _pext_u64(0x0E0C0A0806040200, idx1);
+ idx2 = (idx2 << 8) - idx2;
+ idx2 = _pext_u64(0x0E0C0A0806040200, idx2);
+ idx3 = (idx3 << 8) - idx3;
+ idx3 = _pext_u64(0x0E0C0A0806040200, idx3);
+
+ g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0));
+ g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1));
+ g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1);
+ g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1);
+#else
+ // correcting cast-align and cast-qual errors
+ // old version: g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >> 0) & 0xFF]));
+ g0 = _mm256_castsi128_si256(_mm_loadl_epi64((const void *)&idx[(good >> 0) & 0xFF]));
+ // old version: g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >> 8) & 0xFF]));
+ g1 = _mm256_castsi128_si256(_mm_loadl_epi64((const void *)&idx[(good >> 8) & 0xFF]));
+ // old version: g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx[(good >> 16) & 0xFF]), 1);
+ g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((const void *)&idx[(good >> 16) & 0xFF]), 1);
+ // old version: g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx[(good >> 24) & 0xFF]), 1);
+ g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((const void *)&idx[(good >> 24) & 0xFF]), 1);
+#endif
+
+ g2 = _mm256_add_epi8(g0, ones);
+ g3 = _mm256_add_epi8(g1, ones);
+ g0 = _mm256_unpacklo_epi8(g0, g2);
+ g1 = _mm256_unpacklo_epi8(g1, g3);
+
+ f0 = _mm256_shuffle_epi8(f0, g0);
+ f1 = _mm256_shuffle_epi8(f1, g1);
+
+ // correcting cast-align errors
+ // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0));
+ _mm_storeu_si128((void *)&r[ctr], _mm256_castsi256_si128(f0));
+ ctr += _mm_popcnt_u32((good >> 0) & 0xFF);
+ // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1));
+ _mm_storeu_si128((void *)&r[ctr], _mm256_extracti128_si256(f0, 1));
+ ctr += _mm_popcnt_u32((good >> 16) & 0xFF);
+ // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1));
+ _mm_storeu_si128((void *)&r[ctr], _mm256_castsi256_si128(f1));
+ ctr += _mm_popcnt_u32((good >> 8) & 0xFF);
+ // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1));
+ _mm_storeu_si128((void *)&r[ctr], _mm256_extracti128_si256(f1, 1));
+ ctr += _mm_popcnt_u32((good >> 24) & 0xFF);
+ }
+
+ while(ctr <= S2N_KYBER_512_R3_N - 8 && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 12) {
+ // correcting cast-align and cast-qual errors
+ // old version: f = _mm_loadu_si128((__m128i *)&buf[pos]);
+ f = _mm_loadu_si128((const void *)&buf[pos]);
+ f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8));
+ t = _mm_srli_epi16(f, 4);
+ f = _mm_blend_epi16(f, t, 0xAA);
+ f = _mm_and_si128(f, _mm256_castsi256_si128(mask));
+ pos += 12;
+
+ t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f);
+ good = _mm_movemask_epi8(t);
+
+#ifdef BMI
+ good &= 0x5555;
+ idx0 = _pdep_u64(good, 0x1111111111111111);
+ idx0 = (idx0 << 8) - idx0;
+ idx0 = _pext_u64(0x0E0C0A0806040200, idx0);
+ pilo = _mm_cvtsi64_si128(idx0);
+#else
+ good = _pext_u32(good, 0x5555);
+ // correcting cast-align and cast-qual errors
+ // old version: pilo = _mm_loadl_epi64((__m128i *)&idx[good]);
+ pilo = _mm_loadl_epi64((const void *)&idx[good]);
+#endif
+
+ pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
+ pilo = _mm_unpacklo_epi8(pilo, pihi);
+ f = _mm_shuffle_epi8(f, pilo);
+ // correcting cast-align error
+ // old version: _mm_storeu_si128((__m128i *)&r[ctr], f);
+ _mm_storeu_si128((void *)&r[ctr], f);
+ ctr += _mm_popcnt_u32(good);
+ }
+
+ while(ctr < S2N_KYBER_512_R3_N && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 3) {
+ val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
+ val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4));
+ pos += 3;
+
+ if(val0 < S2N_KYBER_512_R3_Q)
+ r[ctr++] = val0;
+ if(val1 < S2N_KYBER_512_R3_Q && ctr < S2N_KYBER_512_R3_N)
+ r[ctr++] = val1;
+ }
+
+ return ctr;
+}
+#endif
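
The scalar tail decodes each 3-byte group into two 12-bit candidates (the vector loops above do the same unpacking 32 and 8 coefficients at a time) and keeps a candidate only if it is below q. A standalone illustration with made-up bytes:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        const uint8_t buf[3] = {0xAB, 0xCD, 0xEF};
        uint16_t val0 = (buf[0] | ((uint16_t)buf[1] << 8)) & 0xFFF; /* low 12 bits  */
        uint16_t val1 = (buf[1] >> 4) | ((uint16_t)buf[2] << 4);    /* high 12 bits */
        assert(val0 == 0xDAB); /* 3499: >= q = 3329, so it would be rejected */
        assert(val1 == 0xEFC); /* 3836: likewise rejected */
        return 0;
    }
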
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.h
new file mode 100644
index 0000000000..bd8a970464
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <stdint.h>
+#include "kyber512r3_params.h"
+#include "kyber512r3_fips202.h"
+
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+#define S2N_KYBER_512_R3_XOF_BLOCKBYTES S2N_KYBER_512_R3_SHAKE128_RATE
+#define S2N_KYBER_512_R3_REJ_UNIFORM_AVX_NBLOCKS ((12*S2N_KYBER_512_R3_N/8*(1 << 12)/S2N_KYBER_512_R3_Q + S2N_KYBER_512_R3_XOF_BLOCKBYTES)/S2N_KYBER_512_R3_XOF_BLOCKBYTES)
+#define S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN (S2N_KYBER_512_R3_REJ_UNIFORM_AVX_NBLOCKS*S2N_KYBER_512_R3_XOF_BLOCKBYTES)
+
+#define rej_uniform_avx2 S2N_KYBER_512_R3_NAMESPACE(rej_uniform_avx2)
+unsigned int rej_uniform_avx2(int16_t *r, const uint8_t *buf);
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_shuffle_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_shuffle_avx2.S
new file mode 100644
index 0000000000..ce7200e5ca
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_shuffle_avx2.S
@@ -0,0 +1,272 @@
+#include "kyber512r3_consts_avx2.h"
+
+// The small macros (.inc files) are combined with .S files directly
+/*****.include "fq.inc"*****/
+/***************************/
+.macro red16 r,rs=0,x=12
+vpmulhw %ymm1,%ymm\r,%ymm\x
+.if \rs
+vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
+.else
+vpsraw $10,%ymm\x,%ymm\x
+.endif
+vpmullw %ymm0,%ymm\x,%ymm\x
+vpsubw %ymm\x,%ymm\r,%ymm\r
+.endm
+
+.macro csubq r,x=12
+vpsubw %ymm0,%ymm\r,%ymm\r
+vpsraw $15,%ymm\r,%ymm\x
+vpand %ymm0,%ymm\x,%ymm\x
+vpaddw %ymm\x,%ymm\r,%ymm\r
+.endm
+
+.macro caddq r,x=12
+vpsraw $15,%ymm\r,%ymm\x
+vpand %ymm0,%ymm\x,%ymm\x
+vpaddw %ymm\x,%ymm\r,%ymm\r
+.endm
+
+.macro fqmulprecomp al,ah,b,x=12
+vpmullw %ymm\al,%ymm\b,%ymm\x
+vpmulhw %ymm\ah,%ymm\b,%ymm\b
+vpmulhw %ymm0,%ymm\x,%ymm\x
+vpsubw %ymm\x,%ymm\b,%ymm\b
+.endm
+/***************************/
+
+/*****.include "shuffle.inc"*****/
+/********************************/
+.macro shuffle8 r0,r1,r2,r3
+vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
+vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
+.endm
+
+.macro shuffle4 r0,r1,r2,r3
+vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
+vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
+.endm
+
+.macro shuffle2 r0,r1,r2,r3
+#vpsllq $32,%ymm\r1,%ymm\r2
+vmovsldup %ymm\r1,%ymm\r2
+vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
+vpsrlq $32,%ymm\r0,%ymm\r0
+#vmovshdup %ymm\r0,%ymm\r0
+vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
+.endm
+
+.macro shuffle1 r0,r1,r2,r3
+vpslld $16,%ymm\r1,%ymm\r2
+vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
+vpsrld $16,%ymm\r0,%ymm\r0
+vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
+.endm
+/********************************/
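
The shuffle8/4/2/1 macros interleave register pairs at 128-, 64-, 32- and 16-bit granularity; applied in sequence they transpose the layout of 16-bit coefficients across ymm registers. As a reading aid, a C-intrinsics model of shuffle1 (a model only, not code from this diff):

    #include <immintrin.h>

    /* shuffle1: r2 receives the even 16-bit lanes of r0 and r1 interleaved,
     * r3 the odd lanes, mirroring the vpslld/vpblendw/vpsrld sequence above. */
    static void shuffle1_model(__m256i r0, __m256i r1, __m256i *r2, __m256i *r3) {
        __m256i t = _mm256_slli_epi32(r1, 16);
        *r2 = _mm256_blend_epi16(r0, t, 0xAA);
        r0  = _mm256_srli_epi32(r0, 16);
        *r3 = _mm256_blend_epi16(r0, r1, 0xAA);
    }
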
+
+.text
+nttunpack128_avx:
+#load
+vmovdqa (%rdi),%ymm4
+vmovdqa 32(%rdi),%ymm5
+vmovdqa 64(%rdi),%ymm6
+vmovdqa 96(%rdi),%ymm7
+vmovdqa 128(%rdi),%ymm8
+vmovdqa 160(%rdi),%ymm9
+vmovdqa 192(%rdi),%ymm10
+vmovdqa 224(%rdi),%ymm11
+
+shuffle8 4,8,3,8
+shuffle8 5,9,4,9
+shuffle8 6,10,5,10
+shuffle8 7,11,6,11
+
+shuffle4 3,5,7,5
+shuffle4 8,10,3,10
+shuffle4 4,6,8,6
+shuffle4 9,11,4,11
+
+shuffle2 7,8,9,8
+shuffle2 5,6,7,6
+shuffle2 3,4,5,4
+shuffle2 10,11,3,11
+
+shuffle1 9,5,10,5
+shuffle1 8,4,9,4
+shuffle1 7,3,8,3
+shuffle1 6,11,7,11
+
+#store
+vmovdqa %ymm10,(%rdi)
+vmovdqa %ymm5,32(%rdi)
+vmovdqa %ymm9,64(%rdi)
+vmovdqa %ymm4,96(%rdi)
+vmovdqa %ymm8,128(%rdi)
+vmovdqa %ymm3,160(%rdi)
+vmovdqa %ymm7,192(%rdi)
+vmovdqa %ymm11,224(%rdi)
+
+ret
+
+.global cdecl(nttunpack_avx2_asm)
+cdecl(nttunpack_avx2_asm):
+call nttunpack128_avx
+add $256,%rdi
+call nttunpack128_avx
+ret
+
+ntttobytes128_avx:
+#load
+vmovdqa (%rsi),%ymm5
+vmovdqa 32(%rsi),%ymm6
+vmovdqa 64(%rsi),%ymm7
+vmovdqa 96(%rsi),%ymm8
+vmovdqa 128(%rsi),%ymm9
+vmovdqa 160(%rsi),%ymm10
+vmovdqa 192(%rsi),%ymm11
+vmovdqa 224(%rsi),%ymm12
+
+#csubq
+csubq 5,13
+csubq 6,13
+csubq 7,13
+csubq 8,13
+csubq 9,13
+csubq 10,13
+csubq 11,13
+csubq 12,13
+
+#bitpack
+vpsllw $12,%ymm6,%ymm4
+vpor %ymm4,%ymm5,%ymm4
+
+vpsrlw $4,%ymm6,%ymm5
+vpsllw $8,%ymm7,%ymm6
+vpor %ymm5,%ymm6,%ymm5
+
+vpsrlw $8,%ymm7,%ymm6
+vpsllw $4,%ymm8,%ymm7
+vpor %ymm6,%ymm7,%ymm6
+
+vpsllw $12,%ymm10,%ymm7
+vpor %ymm7,%ymm9,%ymm7
+
+vpsrlw $4,%ymm10,%ymm8
+vpsllw $8,%ymm11,%ymm9
+vpor %ymm8,%ymm9,%ymm8
+
+vpsrlw $8,%ymm11,%ymm9
+vpsllw $4,%ymm12,%ymm10
+vpor %ymm9,%ymm10,%ymm9
+
+shuffle1 4,5,3,5
+shuffle1 6,7,4,7
+shuffle1 8,9,6,9
+
+shuffle2 3,4,8,4
+shuffle2 6,5,3,5
+shuffle2 7,9,6,9
+
+shuffle4 8,3,7,3
+shuffle4 6,4,8,4
+shuffle4 5,9,6,9
+
+shuffle8 7,8,5,8
+shuffle8 6,3,7,3
+shuffle8 4,9,6,9
+
+#store
+vmovdqu %ymm5,(%rdi)
+vmovdqu %ymm7,32(%rdi)
+vmovdqu %ymm6,64(%rdi)
+vmovdqu %ymm8,96(%rdi)
+vmovdqu %ymm3,128(%rdi)
+vmovdqu %ymm9,160(%rdi)
+
+ret
+
+.global cdecl(ntttobytes_avx2_asm)
+cdecl(ntttobytes_avx2_asm):
+#consts
+vmovdqa _16XQ*2(%rdx),%ymm0
+call ntttobytes128_avx
+add $256,%rsi
+add $192,%rdi
+call ntttobytes128_avx
+ret
+
+nttfrombytes128_avx:
+#load
+vmovdqu (%rsi),%ymm4
+vmovdqu 32(%rsi),%ymm5
+vmovdqu 64(%rsi),%ymm6
+vmovdqu 96(%rsi),%ymm7
+vmovdqu 128(%rsi),%ymm8
+vmovdqu 160(%rsi),%ymm9
+
+shuffle8 4,7,3,7
+shuffle8 5,8,4,8
+shuffle8 6,9,5,9
+
+shuffle4 3,8,6,8
+shuffle4 7,5,3,5
+shuffle4 4,9,7,9
+
+shuffle2 6,5,4,5
+shuffle2 8,7,6,7
+shuffle2 3,9,8,9
+
+shuffle1 4,7,10,7
+shuffle1 5,8,4,8
+shuffle1 6,9,5,9
+
+#bitunpack
+vpsrlw $12,%ymm10,%ymm11
+vpsllw $4,%ymm7,%ymm12
+vpor %ymm11,%ymm12,%ymm11
+vpand %ymm0,%ymm10,%ymm10
+vpand %ymm0,%ymm11,%ymm11
+
+vpsrlw $8,%ymm7,%ymm12
+vpsllw $8,%ymm4,%ymm13
+vpor %ymm12,%ymm13,%ymm12
+vpand %ymm0,%ymm12,%ymm12
+
+vpsrlw $4,%ymm4,%ymm13
+vpand %ymm0,%ymm13,%ymm13
+
+vpsrlw $12,%ymm8,%ymm14
+vpsllw $4,%ymm5,%ymm15
+vpor %ymm14,%ymm15,%ymm14
+vpand %ymm0,%ymm8,%ymm8
+vpand %ymm0,%ymm14,%ymm14
+
+vpsrlw $8,%ymm5,%ymm15
+vpsllw $8,%ymm9,%ymm1
+vpor %ymm15,%ymm1,%ymm15
+vpand %ymm0,%ymm15,%ymm15
+
+vpsrlw $4,%ymm9,%ymm1
+vpand %ymm0,%ymm1,%ymm1
+
+#store
+vmovdqa %ymm10,(%rdi)
+vmovdqa %ymm11,32(%rdi)
+vmovdqa %ymm12,64(%rdi)
+vmovdqa %ymm13,96(%rdi)
+vmovdqa %ymm8,128(%rdi)
+vmovdqa %ymm14,160(%rdi)
+vmovdqa %ymm15,192(%rdi)
+vmovdqa %ymm1,224(%rdi)
+
+ret
+
+.global cdecl(nttfrombytes_avx2_asm)
+cdecl(nttfrombytes_avx2_asm):
+#consts
+vmovdqa _16XMASK*2(%rdx),%ymm0
+call nttfrombytes128_avx
+add $256,%rdi
+add $192,%rsi
+call nttfrombytes128_avx
+ret
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric-shake.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric-shake.c
new file mode 100644
index 0000000000..390a2a4e38
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric-shake.c
@@ -0,0 +1,49 @@
+#include "kyber512r3_params.h"
+#include "kyber512r3_fips202.h"
+#include "kyber512r3_symmetric.h"
+#include <stdlib.h>
+
+/*************************************************
+* Name: kyber_shake128_absorb
+*
+* Description: Absorb step of SHAKE128, specialized for the Kyber context.
+*
+* Arguments: - keccak_state *s: pointer to (uninitialized) output Keccak state
+* - const uint8_t *input: pointer to S2N_KYBER_512_R3_SYMBYTES input to be absorbed into s
+* - uint8_t x: additional byte of input
+* - uint8_t y: additional byte of input
+**************************************************/
+void kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y) {
+ size_t i;
+ uint8_t extseed[S2N_KYBER_512_R3_SYMBYTES + 2];
+
+ for (i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) {
+ extseed[i] = input[i];
+ }
+ extseed[i++] = x;
+ extseed[i] = y;
+ shake128_absorb(s, extseed, S2N_KYBER_512_R3_SYMBYTES + 2);
+}
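
The two trailing bytes give every consumer a domain-separated XOF stream from a single seed; in Kyber's matrix expansion they are the coordinates of the entry being generated. A hedged usage sketch (seed is an assumed input; shake128_squeezeblocks is the block-squeeze helper expected from kyber512r3_fips202.h):

    keccak_state state;
    uint8_t stream[S2N_KYBER_512_R3_SHAKE128_RATE];
    /* One stream per matrix entry; whether (x, y) is (row, column) or the
     * transpose depends on the caller. */
    kyber_shake128_absorb(&state, seed, x, y);
    shake128_squeezeblocks(stream, 1, &state);
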
+
+/*************************************************
+* Name: shake256_prf
+*
+* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
+* and then generates outlen bytes of SHAKE256 output
+*
+* Arguments: - uint8_t *output: pointer to output
+* - size_t outlen: number of requested output bytes
+* - const uint8_t * key: pointer to the key (of length S2N_KYBER_512_R3_SYMBYTES)
+* - uint8_t nonce: single-byte nonce (public PRF input)
+**************************************************/
+void shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) {
+ uint8_t extkey[S2N_KYBER_512_R3_SYMBYTES + 1];
+ size_t i;
+
+ for (i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) {
+ extkey[i] = key[i];
+ }
+ extkey[i] = nonce;
+
+ shake256(output, outlen, extkey, S2N_KYBER_512_R3_SYMBYTES + 1);
+}
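
Each secret polynomial is sampled from its own PRF stream by varying only the public nonce byte. A hedged caller sketch (noiseseed and nonce are assumed inputs; 192 = eta1*N/4 is the byte count Kyber-512's centered-binomial sampler consumes for eta1 = 3):

    uint8_t buf[192];
    shake256_prf(buf, sizeof(buf), noiseseed, nonce);
    nonce++; /* a fresh nonce for every polynomial keeps the streams distinct */
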
diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric.h
new file mode 100644
index 0000000000..e898a29450
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "kyber512r3_params.h"
+#include "kyber512r3_fips202.h"
+#include <stdint.h>
+
+#define keccak_state S2N_KYBER_512_R3_NAMESPACE(keccak_state)
+typedef shake128ctx keccak_state;
+
+#define xof_state S2N_KYBER_512_R3_NAMESPACE(xof_state)
+typedef keccak_state xof_state;
+
+#define kyber_shake128_absorb S2N_KYBER_512_R3_NAMESPACE(kyber_shake128_absorb)
+void kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y);
+
+#define shake256_prf S2N_KYBER_512_R3_NAMESPACE(shake256_prf)
+void shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.c b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.c
index 7381deed4e..8eda65be59 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.c
@@ -14,13 +14,23 @@
*/
#include "s2n_pq.h"
+#include "crypto/s2n_openssl.h"
-static bool sikep434r2_asm_enabled = false;
+static bool sikep434r3_asm_enabled = false;
+
+/* BIKE Round-3 code supports several levels of optimization */
+static bool bike_r3_avx2_enabled = false;
+static bool bike_r3_avx512_enabled = false;
+static bool bike_r3_pclmul_enabled = false;
+static bool bike_r3_vpclmul_enabled = false;
+
+static bool kyber512r3_avx2_bmi2_enabled = false;
#if defined(S2N_CPUID_AVAILABLE)
/* https://en.wikipedia.org/wiki/CPUID */
#include <cpuid.h>
+#define PROCESSOR_INFO_AND_FEATURES 1
#define EXTENDED_FEATURES_LEAF 7
#define EXTENDED_FEATURES_SUBLEAF_ZERO 0
@@ -35,6 +45,12 @@ static bool sikep434r2_asm_enabled = false;
#define bit_BMI2 (1 << 8)
#endif
+/* BIKE related CPU features */
+#define EBX_BIT_AVX2 (1 << 5)
+#define EBX_BIT_AVX512 (1 << 16)
+#define ECX_BIT_VPCLMUL (1 << 10)
+#define ECX_BIT_PCLMUL (1 << 1)
+
bool s2n_get_cpuid_count(uint32_t leaf, uint32_t sub_leaf, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) {
/* 0x80000000 probes for extended cpuid info */
uint32_t max_level = __get_cpuid_max(leaf & 0x80000000, 0);
@@ -67,56 +83,228 @@ bool s2n_cpu_supports_adx() {
return (ebx & bit_ADX);
}
-bool s2n_cpu_supports_sikep434r2_asm() {
-#if defined(S2N_SIKEP434R2_ASM)
- /* The sikep434r2 assembly code always requires BMI2. If the assembly
+bool s2n_cpu_supports_avx2() {
+ uint32_t eax, ebx, ecx, edx;
+ if (!s2n_get_cpuid_count(EXTENDED_FEATURES_LEAF, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) {
+ return false;
+ }
+
+ return (ebx & EBX_BIT_AVX2);
+}
+
+bool s2n_cpu_supports_sikep434r3_asm() {
+#if defined(S2N_SIKE_P434_R3_ASM)
+ /* The sikep434r3 assembly code always requires BMI2. If the assembly
* was compiled with support for ADX, we also require ADX at runtime. */
- #if defined(S2N_ADX)
- return s2n_cpu_supports_bmi2() && s2n_cpu_supports_adx();
- #else
- return s2n_cpu_supports_bmi2();
- #endif
+#if defined(S2N_ADX)
+ return s2n_cpu_supports_bmi2() && s2n_cpu_supports_adx();
+#else
+ return s2n_cpu_supports_bmi2();
+#endif
+#else
+ /* sikep434r3 assembly was not supported at compile time */
+ return false;
+#endif /* defined(S2N_SIKE_P434_R3_ASM) */
+}
+
+bool s2n_cpu_supports_bike_r3_avx2() {
+#if defined(S2N_BIKE_R3_AVX2)
+ uint32_t eax, ebx, ecx, edx;
+ if (!s2n_get_cpuid_count(EXTENDED_FEATURES_LEAF, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) {
+ return false;
+ }
+ return ((ebx & EBX_BIT_AVX2) != 0);
+#else
+ return false;
+#endif
+}
+
+bool s2n_cpu_supports_bike_r3_avx512() {
+#if defined(S2N_BIKE_R3_AVX512)
+ uint32_t eax, ebx, ecx, edx;
+ if (!s2n_get_cpuid_count(EXTENDED_FEATURES_LEAF, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) {
+ return false;
+ }
+ return ((ebx & EBX_BIT_AVX512) != 0);
+#else
+ return false;
+#endif
+}
+
+bool s2n_cpu_supports_bike_r3_pclmul() {
+#if defined(S2N_BIKE_R3_PCLMUL)
+ uint32_t eax, ebx, ecx, edx;
+ if (!s2n_get_cpuid_count(PROCESSOR_INFO_AND_FEATURES, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) {
+ return false;
+ }
+ return ((ecx & ECX_BIT_PCLMUL) != 0);
#else
- /* sikep434r2 assembly was not supported at compile time */
return false;
-#endif /* defined(S2N_SIKEP434R2_ASM) */
+#endif
+}
+
+bool s2n_cpu_supports_bike_r3_vpclmul() {
+#if defined(S2N_BIKE_R3_AVX512)
+ uint32_t eax, ebx, ecx, edx;
+ if (!s2n_get_cpuid_count(EXTENDED_FEATURES_LEAF, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) {
+ return false;
+ }
+ return ((ecx & ECX_BIT_VPCLMUL) != 0);
+#else
+ return false;
+#endif
+}
+
+bool s2n_cpu_supports_kyber512r3_avx2_bmi2() {
+#if defined(S2N_KYBER512R3_AVX2_BMI2)
+ return s2n_cpu_supports_bmi2() && s2n_cpu_supports_avx2();
+#else
+ return false;
+#endif
}
#else /* defined(S2N_CPUID_AVAILABLE) */
/* If CPUID is not available, we cannot perform necessary run-time checks. */
-bool s2n_cpu_supports_sikep434r2_asm() {
+bool s2n_cpu_supports_sikep434r3_asm() {
+ return false;
+}
+
+bool s2n_cpu_supports_bike_r3_avx2() {
+ return false;
+}
+
+bool s2n_cpu_supports_bike_r3_avx512() {
+ return false;
+}
+
+bool s2n_cpu_supports_bike_r3_pclmul() {
+ return false;
+}
+
+bool s2n_cpu_supports_bike_r3_vpclmul() {
+ return false;
+}
+
+bool s2n_cpu_supports_kyber512r3_avx2_bmi2() {
return false;
}
#endif /* defined(S2N_CPUID_AVAILABLE) */
-bool s2n_sikep434r2_asm_is_enabled() {
- return sikep434r2_asm_enabled;
+bool s2n_sikep434r3_asm_is_enabled() {
+ return sikep434r3_asm_enabled;
+}
+
+bool s2n_bike_r3_is_avx2_enabled() {
+ return bike_r3_avx2_enabled;
+}
+
+bool s2n_bike_r3_is_avx512_enabled() {
+ return bike_r3_avx512_enabled;
+}
+
+bool s2n_bike_r3_is_pclmul_enabled() {
+ return bike_r3_pclmul_enabled;
+}
+
+bool s2n_bike_r3_is_vpclmul_enabled() {
+ return bike_r3_vpclmul_enabled;
+}
+
+bool s2n_kyber512r3_is_avx2_bmi2_enabled() {
+ return kyber512r3_avx2_bmi2_enabled;
}
bool s2n_pq_is_enabled() {
#if defined(S2N_NO_PQ)
return false;
#else
- return !s2n_is_in_fips_mode();
+ /* aws-lc is currently the only supported FIPS library known to support PQ. */
+ return s2n_libcrypto_is_awslc() || (!s2n_is_in_fips_mode());
#endif
}
-S2N_RESULT s2n_disable_sikep434r2_asm() {
- sikep434r2_asm_enabled = false;
+S2N_RESULT s2n_disable_sikep434r3_asm() {
+ sikep434r3_asm_enabled = false;
+ return S2N_RESULT_OK;
+}
+
+S2N_RESULT s2n_disable_bike_r3_opt_all() {
+ bike_r3_avx2_enabled = false;
+ bike_r3_avx512_enabled = false;
+ bike_r3_pclmul_enabled = false;
+ bike_r3_vpclmul_enabled = false;
return S2N_RESULT_OK;
}
-S2N_RESULT s2n_try_enable_sikep434r2_asm() {
- if (s2n_pq_is_enabled() && s2n_cpu_supports_sikep434r2_asm()) {
- sikep434r2_asm_enabled = true;
+S2N_RESULT s2n_disable_kyber512r3_opt_avx2_bmi2() {
+ kyber512r3_avx2_bmi2_enabled = false;
+ return S2N_RESULT_OK;
+}
+
+S2N_RESULT s2n_try_enable_bike_r3_opt_pclmul() {
+ if (s2n_pq_is_enabled() && s2n_cpu_supports_bike_r3_pclmul()) {
+ bike_r3_pclmul_enabled = true;
}
return S2N_RESULT_OK;
}
-S2N_RESULT s2n_pq_init() {
- ENSURE_OK(s2n_try_enable_sikep434r2_asm(), S2N_ERR_SAFETY);
+S2N_RESULT s2n_try_enable_bike_r3_opt_avx2() {
+ /* Any CPU that supports AVX2 also supports PCLMUL, so try to enable PCLMUL as well. */
+ RESULT_ENSURE_OK(s2n_try_enable_bike_r3_opt_pclmul(), S2N_ERR_SAFETY);
+ if (s2n_pq_is_enabled() && s2n_cpu_supports_bike_r3_avx2()) {
+ bike_r3_avx2_enabled = true;
+ }
+ return S2N_RESULT_OK;
+}
+
+S2N_RESULT s2n_try_enable_bike_r3_opt_avx512() {
+ /* Any CPU that supports AVX512 also supports AVX2, so try to enable AVX2 as well. */
+ RESULT_ENSURE_OK(s2n_try_enable_bike_r3_opt_avx2(), S2N_ERR_SAFETY);
+ if (s2n_pq_is_enabled() && s2n_cpu_supports_bike_r3_avx512()) {
+ bike_r3_avx512_enabled = true;
+ }
+ return S2N_RESULT_OK;
+}
+
+S2N_RESULT s2n_try_enable_bike_r3_opt_vpclmul() {
+ RESULT_ENSURE_OK(s2n_try_enable_bike_r3_opt_avx512(), S2N_ERR_SAFETY);
+ /* Only enable VPCLMUL if AVX512 is also supported. This is because the BIKE R3 code requires the 512-bit
+ * version of VPCLMUL, not the 256-bit version that is available on AMD Zen 3 processors. */
+ if (s2n_pq_is_enabled() && s2n_cpu_supports_bike_r3_vpclmul() && s2n_bike_r3_is_avx512_enabled()) {
+ bike_r3_vpclmul_enabled = true;
+ }
+ return S2N_RESULT_OK;
+}
+S2N_RESULT s2n_try_enable_sikep434r3_asm() {
+ if (s2n_pq_is_enabled() && s2n_cpu_supports_sikep434r3_asm()) {
+ sikep434r3_asm_enabled = true;
+ }
+ return S2N_RESULT_OK;
+}
+
+S2N_RESULT s2n_try_enable_kyber512r3_opt_avx2_bmi2() {
+ if (s2n_pq_is_enabled() && s2n_cpu_supports_kyber512r3_avx2_bmi2()) {
+ kyber512r3_avx2_bmi2_enabled = true;
+ }
+ return S2N_RESULT_OK;
+}
+
+S2N_RESULT s2n_bike_r3_x86_64_opt_init()
+{
+ /* s2n_try_enable_bike_r3_opt_vpclmul() transitively tries to enable
+ * all of the optimizations (pclmul, avx2, avx512, vpclmul),
+ * so it is sufficient to call that one function. */
+ RESULT_ENSURE_OK(s2n_try_enable_bike_r3_opt_vpclmul(), S2N_ERR_SAFETY);
+ return S2N_RESULT_OK;
+}
+
+S2N_RESULT s2n_pq_init() {
+ RESULT_ENSURE_OK(s2n_try_enable_sikep434r3_asm(), S2N_ERR_SAFETY);
+ RESULT_ENSURE_OK(s2n_bike_r3_x86_64_opt_init(), S2N_ERR_SAFETY);
+ RESULT_ENSURE_OK(s2n_try_enable_kyber512r3_opt_avx2_bmi2(), S2N_ERR_SAFETY);
+
return S2N_RESULT_OK;
}
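
Because every optimization has a disable hook, a single machine can exercise both the accelerated and the portable paths. A hedged test-flow sketch using only functions declared in this diff (the KEM round-trip itself is assumed):

    RESULT_GUARD(s2n_pq_init());                 /* enable whatever the CPU supports */
    /* ... run a BIKE KEM round-trip with the accelerated kernels ... */
    RESULT_GUARD(s2n_disable_bike_r3_opt_all()); /* force the portable fallback */
    /* ... run the same round-trip again and compare the results ... */
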
diff --git a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.h b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.h
index 7e5d93e991..2af5c4c940 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.h
+++ b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.h
@@ -20,8 +20,23 @@
#include "utils/s2n_safety.h"
#include "crypto/s2n_fips.h"
-bool s2n_sikep434r2_asm_is_enabled(void);
+bool s2n_sikep434r3_asm_is_enabled(void);
+S2N_RESULT s2n_disable_sikep434r3_asm(void);
+S2N_RESULT s2n_try_enable_sikep434r3_asm(void);
+
+bool s2n_bike_r3_is_avx2_enabled(void);
+bool s2n_bike_r3_is_avx512_enabled(void);
+bool s2n_bike_r3_is_pclmul_enabled(void);
+bool s2n_bike_r3_is_vpclmul_enabled(void);
+S2N_RESULT s2n_disable_bike_r3_opt_all(void);
+S2N_RESULT s2n_try_enable_bike_r3_opt_pclmul(void);
+S2N_RESULT s2n_try_enable_bike_r3_opt_avx2(void);
+S2N_RESULT s2n_try_enable_bike_r3_opt_avx512(void);
+S2N_RESULT s2n_try_enable_bike_r3_opt_vpclmul(void);
+
+bool s2n_kyber512r3_is_avx2_bmi2_enabled(void);
+S2N_RESULT s2n_try_enable_kyber512r3_opt_avx2_bmi2(void);
+S2N_RESULT s2n_disable_kyber512r3_opt_avx2_bmi2(void);
+
bool s2n_pq_is_enabled(void);
-S2N_RESULT s2n_disable_sikep434r2_asm(void);
-S2N_RESULT s2n_try_enable_sikep434r2_asm(void);
S2N_RESULT s2n_pq_init(void);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq_random.c b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq_random.c
index 845def4a31..275a3e132d 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq_random.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq_random.c
@@ -23,21 +23,21 @@ static S2N_RESULT s2n_get_random_bytes_default(uint8_t *buffer, uint32_t num_byt
static s2n_get_random_bytes_callback s2n_get_random_bytes_cb = s2n_get_random_bytes_default;
S2N_RESULT s2n_get_random_bytes(uint8_t *buffer, uint32_t num_bytes) {
- ENSURE_REF(buffer);
- GUARD_RESULT(s2n_get_random_bytes_cb(buffer, num_bytes));
+ RESULT_ENSURE_REF(buffer);
+ RESULT_GUARD(s2n_get_random_bytes_cb(buffer, num_bytes));
return S2N_RESULT_OK;
}
static S2N_RESULT s2n_get_random_bytes_default(uint8_t *buffer, uint32_t num_bytes) {
struct s2n_blob out = { .data = buffer, .size = num_bytes };
- GUARD_RESULT(s2n_get_private_random_data(&out));
+ RESULT_GUARD(s2n_get_private_random_data(&out));
return S2N_RESULT_OK;
}
S2N_RESULT s2n_set_rand_bytes_callback_for_testing(s2n_get_random_bytes_callback rand_bytes_callback) {
- ENSURE(s2n_in_unit_test(), S2N_ERR_NOT_IN_UNIT_TEST);
+ RESULT_ENSURE(s2n_in_unit_test(), S2N_ERR_NOT_IN_UNIT_TEST);
s2n_get_random_bytes_cb = rand_bytes_callback;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/P503_internal_r1.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/P503_internal_r1.h
index f6674fa2bc..64465f19ed 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/P503_internal_r1.h
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/P503_internal_r1.h
@@ -150,7 +150,7 @@ void fpdiv2_503(const digit_t* a, digit_t* c);
void fpcorrection503(digit_t* a);
// 503-bit Montgomery reduction, c = a mod p
-void rdc_mont(const digit_t* a, digit_t* c);
+void rdc_mont(const dfelm_t ma, felm_t mc);
// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^768
void fpmul503_mont(const felm_t a, const felm_t b, felm_t c);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/fips202_r1.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/fips202_r1.h
index 128a0127bf..983537c2ca 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/fips202_r1.h
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/fips202_r1.h
@@ -7,7 +7,7 @@
#define SHAKE128_RATE 168
#define SHAKE256_RATE 136
-void cshake256_simple_absorb(uint64_t *s, uint16_t cstm, const unsigned char *in, unsigned long long inlen);
+void cshake256_simple_absorb(uint64_t s[25], uint16_t cstm, const unsigned char *in, unsigned long long inlen);
void cshake256_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen);
#endif // FIPS202_R1_H
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sidh_r1.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sidh_r1.c
index 7f3c63fd85..bdf2834121 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sidh_r1.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sidh_r1.c
@@ -63,7 +63,7 @@ int random_mod_order_B(unsigned char* random_digits)
unsigned long long nbytes = NBITS_TO_NBYTES(OBOB_BITS-1);
clear_words((void*)random_digits, MAXWORDS_ORDER);
- GUARD_AS_POSIX(s2n_get_random_bytes(random_digits, nbytes));
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(random_digits, nbytes));
random_digits[nbytes-1] &= MASK_BOB; // Masking last byte
return S2N_SUCCESS;
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sike_r1_kem.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sike_r1_kem.c
index 3122eb6539..ee905ca74a 100644
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sike_r1_kem.c
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sike_r1_kem.c
@@ -16,13 +16,13 @@ int SIKE_P503_r1_crypto_kem_keypair(unsigned char *pk, unsigned char *sk)
{ // SIKE's key generation
// Outputs: secret key sk (SIKE_P503_R1_SECRET_KEY_BYTES = MSG_BYTES + SECRETKEY_B_BYTES + SIKE_P503_R1_PUBLIC_KEY_BYTES bytes)
// public key pk (SIKE_P503_R1_PUBLIC_KEY_BYTES bytes)
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
digit_t _sk[SECRETKEY_B_BYTES/sizeof(digit_t)];
// Generate lower portion of secret key sk <- s||SK
- GUARD_AS_POSIX(s2n_get_random_bytes(sk, MSG_BYTES));
- GUARD(random_mod_order_B((unsigned char*)_sk));
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(sk, MSG_BYTES));
+ POSIX_GUARD(random_mod_order_B((unsigned char*)_sk));
// Generate public key pk
EphemeralKeyGeneration_B(_sk, pk);
@@ -40,7 +40,7 @@ int SIKE_P503_r1_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsi
// Input: public key pk (SIKE_P503_R1_PUBLIC_KEY_BYTES bytes)
// Outputs: shared secret ss (SIKE_P503_R1_SHARED_SECRET_BYTES bytes)
// ciphertext message ct (SIKE_P503_R1_CIPHERTEXT_BYTES = SIKE_P503_R1_PUBLIC_KEY_BYTES + MSG_BYTES bytes)
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
const uint16_t G = 0;
const uint16_t H = 1;
@@ -55,7 +55,7 @@ int SIKE_P503_r1_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsi
unsigned int i;
// Generate ephemeralsk <- G(m||pk) mod oA
- GUARD_AS_POSIX(s2n_get_random_bytes(temp, MSG_BYTES));
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(temp, MSG_BYTES));
memcpy(&temp[MSG_BYTES], pk, SIKE_P503_R1_PUBLIC_KEY_BYTES);
cshake256_simple(ephemeralsk.b, SECRETKEY_A_BYTES, G, temp, SIKE_P503_R1_PUBLIC_KEY_BYTES+MSG_BYTES);
@@ -82,7 +82,7 @@ int SIKE_P503_r1_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, cons
// Input: secret key sk (SIKE_P503_R1_SECRET_KEY_BYTES = MSG_BYTES + SECRETKEY_B_BYTES + SIKE_P503_R1_PUBLIC_KEY_BYTES bytes)
// ciphertext message ct (SIKE_P503_R1_CIPHERTEXT_BYTES = SIKE_P503_R1_PUBLIC_KEY_BYTES + MSG_BYTES bytes)
// Outputs: shared secret ss (SIKE_P503_R1_SHARED_SECRET_BYTES bytes)
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
const uint16_t G = 0;
const uint16_t H = 1;
@@ -117,9 +117,13 @@ int SIKE_P503_r1_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, cons
// Generate shared secret ss <- H(m||ct) or output ss <- H(s||ct)
EphemeralKeyGeneration_A(ephemeralsk_.d, c0_);
- if (memcmp(c0_, ct, SIKE_P503_R1_PUBLIC_KEY_BYTES) != 0) {
- memcpy(temp, sk, MSG_BYTES);
- }
+
+ // Note: This step deviates from the NIST-supplied code by using constant-time operations.
+ // The data should be copied only if c0_ and ct differ.
+ bool dont_copy = s2n_constant_time_equals(c0_, ct, SIKE_P503_R1_PUBLIC_KEY_BYTES);
+ // The last argument to s2n_constant_time_copy_or_dont is "dont": it suppresses the copy when non-zero/true.
+ s2n_constant_time_copy_or_dont(temp, sk, MSG_BYTES, dont_copy);
+
memcpy(&temp[MSG_BYTES], ct, SIKE_P503_R1_CIPHERTEXT_BYTES);
cshake256_simple(ss, SIKE_P503_R1_SHARED_SECRET_BYTES, H, temp, SIKE_P503_R1_CIPHERTEXT_BYTES+MSG_BYTES);
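
The branchless copy behind this fix can be modeled as below: a mask of all ones is derived from dont without branching, so the memory access pattern is identical whether or not the copy happens (an illustrative sketch of the technique, not s2n's exact implementation):

    #include <stdint.h>

    /* Copy src into dst only when dont == 0, with no data-dependent branch. */
    static void copy_or_dont_model(uint8_t *dst, const uint8_t *src,
                                   uint32_t len, uint8_t dont) {
        uint8_t mask = (uint8_t)(((uint16_t)dont - 1) >> 8); /* 0xFF iff dont == 0 */
        for (uint32_t i = 0; i < len; i++) {
            dst[i] = (uint8_t)((src[i] & mask) | (dst[i] & (uint8_t)~mask));
        }
    }
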
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434.c
deleted file mode 100644
index 4288a5d186..0000000000
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/********************************************************************************************
-* SIDH: an efficient supersingular isogeny cryptography library
-*
-* Abstract: supersingular isogeny parameters and generation of functions for P434
-*********************************************************************************************/
-
-#include "P434_api.h"
-#include "P434_internal.h"
-#include "pq-crypto/s2n_pq.h"
-
-// Encoding of field elements, elements over Z_order, elements over GF(p^2) and elliptic curve points:
-// --------------------------------------------------------------------------------------------------
-// Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at the leftmost position (i.e., little endian format).
-// Elements (a+b*i) over GF(p^2), where a and b are defined over GF(p), are encoded as {a, b}, with a in the least significant position.
-// Elliptic curve points P = (x,y) are encoded as {x, y}, with x in the least significant position.
-// Internally, the number of digits used to represent all these elements is obtained by approximating the number of bits to the immediately greater multiple of 32.
-// For example, a 434-bit field element is represented with Ceil(434 / 64) = 7 64-bit digits or Ceil(434 / 32) = 14 32-bit digits.
-
-//
-// Curve isogeny system "SIDHp434". Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over GF(p434^2), where A=6, B=1, C=1 and p434 = 2^216*3^137-1
-//
-
-
-// The constants p434, p434p1, and p434x2 have been duplicated in
-// sikep434r2_fp_x64_asm.S. If, for any reason, the constants are changed in
-// one file, they should be updated in the other file as well.
-const uint64_t p434[NWORDS64_FIELD] = {0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFDC1767AE2FFFFFF,
- 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x0002341F27177344};
-const uint64_t p434p1[NWORDS64_FIELD] = {0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFDC1767AE3000000,
- 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x0002341F27177344};
-const uint64_t p434x2[NWORDS64_FIELD] = {0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFB82ECF5C5FFFFFF,
- 0xF78CB8F062B15D47, 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688};
-// Order of Alice's subgroup
-const uint64_t Alice_order[NWORDS64_ORDER] = {0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000001000000};
-// Order of Bob's subgroup
-const uint64_t Bob_order[NWORDS64_ORDER] = {0x58AEA3FDC1767AE3, 0xC520567BC65C7831, 0x1773446CFC5FD681, 0x0000000002341F27};
-// Alice's generator values {XPA0 + XPA1*i, XQA0 + xQA1*i, XRA0 + XRA1*i} in GF(p434^2), expressed in Montgomery representation
-const uint64_t A_gen[6 * NWORDS64_FIELD] = {0x05ADF455C5C345BF, 0x91935C5CC767AC2B, 0xAFE4E879951F0257, 0x70E792DC89FA27B1,
- 0xF797F526BB48C8CD, 0x2181DB6131AF621F, 0x00000A1C08B1ECC4, // XPA0
- 0x74840EB87CDA7788, 0x2971AA0ECF9F9D0B, 0xCB5732BDF41715D5, 0x8CD8E51F7AACFFAA,
- 0xA7F424730D7E419F, 0xD671EB919A179E8C, 0x0000FFA26C5A924A, // XPA1
- 0xFEC6E64588B7273B, 0xD2A626D74CBBF1C6, 0xF8F58F07A78098C7, 0xE23941F470841B03,
- 0x1B63EDA2045538DD, 0x735CFEB0FFD49215, 0x0001C4CB77542876, // XQA0
- 0xADB0F733C17FFDD6, 0x6AFFBD037DA0A050, 0x680EC43DB144E02F, 0x1E2E5D5FF524E374,
- 0xE2DDA115260E2995, 0xA6E4B552E2EDE508, 0x00018ECCDDF4B53E, // XQA1
- 0x01BA4DB518CD6C7D, 0x2CB0251FE3CC0611, 0x259B0C6949A9121B, 0x60E17AC16D2F82AD,
- 0x3AA41F1CE175D92D, 0x413FBE6A9B9BC4F3, 0x00022A81D8D55643, // XRA0
- 0xB8ADBC70FC82E54A, 0xEF9CDDB0D5FADDED, 0x5820C734C80096A0, 0x7799994BAA96E0E4,
- 0x044961599E379AF8, 0xDB2B94FBF09F27E2, 0x0000B87FC716C0C6}; // XRA1
-// Bob's generator values {XPB0, XQB0, XRB0 + XRB1*i} in GF(p434^2), expressed in Montgomery representation
-const uint64_t B_gen[6 * NWORDS64_FIELD] = {0x6E5497556EDD48A3, 0x2A61B501546F1C05, 0xEB919446D049887D, 0x5864A4A69D450C4F,
- 0xB883F276A6490D2B, 0x22CC287022D5F5B9, 0x0001BED4772E551F, // XPB0
- 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
- 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, // XPB1
- 0xFAE2A3F93D8B6B8E, 0x494871F51700FE1C, 0xEF1A94228413C27C, 0x498FF4A4AF60BD62,
- 0xB00AD2A708267E8A, 0xF4328294E017837F, 0x000034080181D8AE, // XQB0
- 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
- 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, // XQB1
- 0x283B34FAFEFDC8E4, 0x9208F44977C3E647, 0x7DEAE962816F4E9A, 0x68A2BA8AA262EC9D,
- 0x8176F112EA43F45B, 0x02106D022634F504, 0x00007E8A50F02E37, // XRB0
- 0xB378B7C1DA22CCB1, 0x6D089C99AD1D9230, 0xEBE15711813E2369, 0x2B35A68239D48A53,
- 0x445F6FD138407C93, 0xBEF93B29A3F6B54B, 0x000173FA910377D3}; // XRB1
-// Montgomery constant Montgomery_R2 = (2^448)^2 mod p434
-const uint64_t Montgomery_R2[NWORDS64_FIELD] = {0x28E55B65DCD69B30, 0xACEC7367768798C2, 0xAB27973F8311688D, 0x175CC6AF8D6C7C0B,
- 0xABCD92BF2DDE347E, 0x69E16A61C7686D9A, 0x000025A89BCDD12A};
-// Value one in Montgomery representation
-const uint64_t Montgomery_one[NWORDS64_FIELD] = {0x000000000000742C, 0x0000000000000000, 0x0000000000000000, 0xB90FF404FC000000,
- 0xD801A4FB559FACD4, 0xE93254545F77410C, 0x0000ECEEA7BD2EDA};
-
-// Fixed parameters for isogeny tree computation
-const unsigned int strat_Alice[MAX_Alice - 1] = {
- 48, 28, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 13, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1,
- 1, 1, 5, 4, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 21, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1,
- 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1};
-
-const unsigned int strat_Bob[MAX_Bob - 1] = {
- 66, 33, 17, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 1,
- 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 32, 16, 8, 4, 3, 1, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2,
- 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1};
-
-// Setting up macro defines and including GF(p), GF(p^2), curve, isogeny and kex functions
-#define fpcopy fpcopy434
-#define fpzero fpzero434
-#define fpadd fpadd434
-#define fpsub fpsub434
-#define fpneg fpneg434
-#define fpdiv2 fpdiv2_434
-#define fpcorrection fpcorrection434
-#define fpmul_mont fpmul434_mont
-#define fpsqr_mont fpsqr434_mont
-#define fpinv_mont fpinv434_mont
-#define fpinv_chain_mont fpinv434_chain_mont
-#define fp2copy fp2copy434
-#define fp2zero fp2zero434
-#define fp2add fp2add434
-#define fp2sub fp2sub434
-#define fp2neg fp2neg434
-#define fp2div2 fp2div2_434
-#define fp2correction fp2correction434
-#define fp2mul_mont fp2mul434_mont
-#define fp2sqr_mont fp2sqr434_mont
-#define fp2inv_mont fp2inv434_mont
-#define mp_add_asm mp_add434_asm
-#define mp_subaddx2_asm mp_subadd434x2_asm
-#define mp_dblsubx2_asm mp_dblsub434x2_asm
-#define random_mod_order_A oqs_kem_sidh_p434_random_mod_order_A
-#define random_mod_order_B oqs_kem_sidh_p434_random_mod_order_B
-#define EphemeralKeyGeneration_A oqs_kem_sidh_p434_EphemeralKeyGeneration_A
-#define EphemeralKeyGeneration_B oqs_kem_sidh_p434_EphemeralKeyGeneration_B
-#define EphemeralSecretAgreement_A oqs_kem_sidh_p434_EphemeralSecretAgreement_A
-#define EphemeralSecretAgreement_B oqs_kem_sidh_p434_EphemeralSecretAgreement_B
-
-#include "fp.c"
-#include "fpx.c"
-#include "ec_isogeny.c"
-#include "sidh.c"
-#include "sike_r2_kem.c"
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_api.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_api.h
deleted file mode 100644
index bdf3eee8cd..0000000000
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_api.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/********************************************************************************************
-* SIDH: an efficient supersingular isogeny cryptography library
-*
-* Abstract: API header file for P434
-*********************************************************************************************/
-
-#ifndef P434_API_H
-#define P434_API_H
-
-#include "P434_internal.h"
-
-/*********************** Key encapsulation mechanism API ***********************/
-
-#define CRYPTO_SECRETKEYBYTES 374 // MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes
-#define CRYPTO_PUBLICKEYBYTES 330
-#define CRYPTO_BYTES 16
-#define CRYPTO_CIPHERTEXTBYTES 346 // CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes
-
-// Encoding of keys for KEM-based isogeny system "SIKEp434" (wire format):
-// ----------------------------------------------------------------------
-// Elements over GF(p434) are encoded in 55 octets in little endian format (i.e., the least significant octet is located in the lowest memory address).
-// Elements (a+b*i) over GF(p434^2), where a and b are defined over GF(p434), are encoded as {a, b}, with a in the lowest memory portion.
-//
-// Private keys sk consist of the concatenation of a 16-byte random value, a value in the range [0, 2^217-1] and the public key pk. In the SIKE API,
-// private keys are encoded in 374 octets in little endian format.
-// Public keys pk consist of 3 elements in GF(p434^2). In the SIKE API, pk is encoded in 330 octets.
-// Ciphertexts ct consist of the concatenation of a public key value and a 16-byte value. In the SIKE API, ct is encoded in 330 + 16 = 346 octets.
-// Shared keys ss consist of a value of 16 octets.
-
-/*********************** Ephemeral key exchange API ***********************/
-
-// SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. It is NOT secure to use it with static keys.
-// See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith, C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016.
-// Extended version available at: http://eprint.iacr.org/2016/859
-
-// Generation of Alice's secret key
-// Outputs random value in [0, 2^216 - 1] to be used as Alice's private key
-int oqs_kem_sidh_p434_random_mod_order_A(unsigned char *random_digits);
-
-// Generation of Bob's secret key
-// Outputs random value in [0, 2^Floor(Log(2,3^137)) - 1] to be used as Bob's private key
-int oqs_kem_sidh_p434_random_mod_order_B(unsigned char *random_digits);
-
-// Alice's ephemeral public key generation
-// Input: a private key PrivateKeyA in the range [0, 2^216 - 1], stored in 27 bytes.
-// Output: the public key PublicKeyA consisting of 3 GF(p434^2) elements encoded in 330 bytes.
-int oqs_kem_sidh_p434_EphemeralKeyGeneration_A(const digit_t *PrivateKeyA, unsigned char *PublicKeyA);
-
-// Bob's ephemeral key-pair generation
-// It produces a private key PrivateKeyB and computes the public key PublicKeyB.
-// The private key is an integer in the range [0, 2^Floor(Log(2,3^137)) - 1], stored in 28 bytes.
-// The public key consists of 3 GF(p434^2) elements encoded in 330 bytes.
-int oqs_kem_sidh_p434_EphemeralKeyGeneration_B(const digit_t *PrivateKeyB, unsigned char *PublicKeyB);
-
-// Alice's ephemeral shared secret computation
-// It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB
-// Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^216 - 1], stored in 27 bytes.
-// Bob's PublicKeyB consists of 3 GF(p434^2) elements encoded in 330 bytes.
-// Output: a shared secret SharedSecretA that consists of one element in GF(p434^2) encoded in 110 bytes.
-int oqs_kem_sidh_p434_EphemeralSecretAgreement_A(const digit_t *PrivateKeyA, const unsigned char *PublicKeyB, unsigned char *SharedSecretA);
-
-// Bob's ephemeral shared secret computation
-// It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA
-// Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^137)) - 1], stored in 28 bytes.
-// Alice's PublicKeyA consists of 3 GF(p434^2) elements encoded in 330 bytes.
-// Output: a shared secret SharedSecretB that consists of one element in GF(p434^2) encoded in 110 bytes.
-int oqs_kem_sidh_p434_EphemeralSecretAgreement_B(const digit_t *PrivateKeyB, const unsigned char *PublicKeyA, unsigned char *SharedSecretB);
-
-
-#endif
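
The size constants in the deleted header above are not free parameters: the sk, pk, and ct lengths all follow from MSG_BYTES and the GF(p434^2) encoding described in the comments. A minimal standalone C sketch (illustrative only, not part of the deleted sources; all names are local re-derivations) that checks the arithmetic:

#include <assert.h>
#include <stdio.h>

int main(void) {
    /* Re-derived locally from the comments above (hypothetical helper values). */
    const int msg_bytes = 16;                            /* random value prepended to sk   */
    const int secretkey_b_bytes = (218 - 1 + 7) / 8;     /* 28 bytes for a 217-bit value   */
    const int fp2_encoded_bytes = 2 * ((434 + 7) / 8);   /* 110: one GF(p434^2) element    */
    const int pk_bytes = 3 * fp2_encoded_bytes;          /* 330: three GF(p434^2) elements */
    const int sk_bytes = msg_bytes + secretkey_b_bytes + pk_bytes; /* 374 */
    const int ct_bytes = pk_bytes + msg_bytes;           /* 346 */

    assert(pk_bytes == 330 && sk_bytes == 374 && ct_bytes == 346);
    printf("pk=%d sk=%d ct=%d ss=%d\n", pk_bytes, sk_bytes, ct_bytes, msg_bytes);
    return 0;
}
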
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_internal.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_internal.h
deleted file mode 100644
index 30056d455b..0000000000
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_internal.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/********************************************************************************************
-* SIDH: an efficient supersingular isogeny cryptography library
-*
-* Abstract: internal header file for P434
-*********************************************************************************************/
-
-#ifndef P434_INTERNAL_H
-#define P434_INTERNAL_H
-
-#include "config.h"
-
-#if (TARGET == TARGET_AMD64)
-#define NWORDS_FIELD 7 // Number of words of a 434-bit field element
-#define p434_ZERO_WORDS 3 // Number of "0" digits in the least significant part of p434 + 1
-#elif (TARGET == TARGET_x86)
-#define NWORDS_FIELD 14
-#define p434_ZERO_WORDS 6
-#elif (TARGET == TARGET_ARM)
-#define NWORDS_FIELD 14
-#define p434_ZERO_WORDS 6
-#elif (TARGET == TARGET_ARM64)
-#define NWORDS_FIELD 7
-#define p434_ZERO_WORDS 3
-#endif
-
-// Basic constants
-
-#define NBITS_FIELD 434
-#define MAXBITS_FIELD 448
-#define MAXWORDS_FIELD ((MAXBITS_FIELD + RADIX - 1) / RADIX) // Max. number of words to represent field elements
-#define NWORDS64_FIELD ((NBITS_FIELD + 63) / 64) // Number of 64-bit words of a 434-bit field element
-#define NBITS_ORDER 256
-#define NWORDS_ORDER ((NBITS_ORDER + RADIX - 1) / RADIX) // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp.
-#define NWORDS64_ORDER ((NBITS_ORDER + 63) / 64) // Number of 64-bit words of a 256-bit element
-#define MAXBITS_ORDER NBITS_ORDER
-#define ALICE 0
-#define BOB 1
-#define OALICE_BITS 216
-#define OBOB_BITS 218
-#define OBOB_EXPON 137
-#define MASK_ALICE 0xFF
-#define MASK_BOB 0x01
-#define PRIME p434
-#define PARAM_A 6
-#define PARAM_C 1
-// Fixed parameters for isogeny tree computation
-#define MAX_INT_POINTS_ALICE 7
-#define MAX_INT_POINTS_BOB 8
-#define MAX_Alice 108
-#define MAX_Bob 137
-#define MSG_BYTES 16
-#define SECRETKEY_A_BYTES ((OALICE_BITS + 7) / 8)
-#define SECRETKEY_B_BYTES ((OBOB_BITS - 1 + 7) / 8)
-#define FP2_ENCODED_BYTES 2 * ((NBITS_FIELD + 7) / 8)
-
-// SIDH's basic element definitions and point representations
-
-typedef digit_t felm_t[NWORDS_FIELD]; // Datatype for representing 434-bit field elements (448-bit max.)
-typedef digit_t dfelm_t[2 * NWORDS_FIELD]; // Datatype for representing double-precision 2x434-bit field elements (896-bit max.)
-typedef struct felm_s {
- felm_t e[2];
-} f2elm_t; // Datatype for representing quadratic extension field elements GF(p434^2)
-
-typedef struct {
- f2elm_t X;
- f2elm_t Z;
-} point_proj; // Point representation in projective XZ Montgomery coordinates.
-typedef point_proj point_proj_t[1];
-
-/**************** Function prototypes ****************/
-/************* Multiprecision functions **************/
-
-// Copy wordsize digits, c = a, where lng(a) = nwords
-void copy_words(const digit_t *a, digit_t *c, const unsigned int nwords);
-
-// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit
-unsigned int mp_add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords);
-
-// 434-bit multiprecision addition, c = a+b
-void mp_add434_asm(const digit_t *a, const digit_t *b, digit_t *c);
-
-// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit
-unsigned int mp_sub(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords);
-
-// 2x434-bit multiprecision subtraction followed by addition with p434*2^448, c = a-b+(p434*2^448) if a-b < 0, otherwise c=a-b
-void mp_subaddx2_asm(const digit_t *a, const digit_t *b, digit_t *c);
-void mp_subadd434x2_asm(const digit_t *a, const digit_t *b, digit_t *c);
-
-// Double 2x434-bit multiprecision subtraction, c = c-a-b, where c > a and c > b
-void mp_dblsub434x2_asm(const digit_t *a, const digit_t *b, digit_t *c);
-
-// Multiprecision right shift by one
-void mp_shiftr1(digit_t *x, const unsigned int nwords);
-
-// Digit multiplication, digit * digit -> 2-digit result
-void digit_x_digit(const digit_t a, const digit_t b, digit_t *c);
-
-// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords.
-void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords);
-
-/************ Field arithmetic functions *************/
-
-// Copy of a field element, c = a
-void fpcopy434(const digit_t *a, digit_t *c);
-
-// Zeroing a field element, a = 0
-void fpzero434(digit_t *a);
-
-// Modular addition, c = a+b mod p434
-extern void fpadd434(const digit_t *a, const digit_t *b, digit_t *c);
-extern void fpadd434_asm(const digit_t *a, const digit_t *b, digit_t *c);
-
-// Modular subtraction, c = a-b mod p434
-extern void fpsub434(const digit_t *a, const digit_t *b, digit_t *c);
-extern void fpsub434_asm(const digit_t *a, const digit_t *b, digit_t *c);
-
-// Modular negation, a = -a mod p434
-extern void fpneg434(digit_t *a);
-
-// Modular division by two, c = a/2 mod p434.
-void fpdiv2_434(const digit_t *a, digit_t *c);
-
-// Modular correction to reduce field element a in [0, 2*p434-1] to [0, p434-1].
-void fpcorrection434(digit_t *a);
-
-// 434-bit Montgomery reduction, c = a mod p
-void rdc_mont(const digit_t *a, digit_t *c);
-
-// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p434, where R=2^448
-void fpmul434_mont(const digit_t *a, const digit_t *b, digit_t *c);
-void mul434_asm(const digit_t *a, const digit_t *b, digit_t *c);
-void rdc434_asm(const digit_t *ma, digit_t *mc);
-
-// Field squaring using Montgomery arithmetic, c = a^2*R^-1 mod p434, where R=2^448
-void fpsqr434_mont(const digit_t *ma, digit_t *mc);
-
-// Conversion to Montgomery representation
-void to_mont(const digit_t *a, digit_t *mc);
-
-// Conversion from Montgomery representation to standard representation
-void from_mont(const digit_t *ma, digit_t *c);
-
-// Field inversion, a = a^-1 in GF(p434)
-void fpinv434_mont(digit_t *a);
-
-// Chain to compute (p434-3)/4 using Montgomery arithmetic
-void fpinv434_chain_mont(digit_t *a);
-
-/************ GF(p^2) arithmetic functions *************/
-
-// Copy of a GF(p434^2) element, c = a
-void fp2copy434(const f2elm_t *a, f2elm_t *c);
-
-// Zeroing a GF(p434^2) element, a = 0
-void fp2zero434(f2elm_t *a);
-
-// GF(p434^2) negation, a = -a in GF(p434^2)
-void fp2neg434(f2elm_t *a);
-
-// GF(p434^2) addition, c = a+b in GF(p434^2)
-void fp2add434(const f2elm_t *a, const f2elm_t *b, f2elm_t *c);
-
-// GF(p434^2) subtraction, c = a-b in GF(p434^2)
-extern void fp2sub434(const f2elm_t *a, const f2elm_t *b, f2elm_t *c);
-
-// GF(p434^2) division by two, c = a/2 in GF(p434^2)
-void fp2div2_434(const f2elm_t *a, f2elm_t *c);
-
-// Modular correction, a = a in GF(p434^2)
-void fp2correction434(f2elm_t *a);
-
-// GF(p434^2) squaring using Montgomery arithmetic, c = a^2 in GF(p434^2)
-void fp2sqr434_mont(const f2elm_t *a, f2elm_t *c);
-
-// GF(p434^2) multiplication using Montgomery arithmetic, c = a*b in GF(p434^2)
-void fp2mul434_mont(const f2elm_t *a, const f2elm_t *b, f2elm_t *c);
-
-// Conversion of a GF(p434^2) element to Montgomery representation
-void to_fp2mont(const f2elm_t *a, f2elm_t *mc);
-
-// Conversion of a GF(p434^2) element from Montgomery representation to standard representation
-void from_fp2mont(const f2elm_t *ma, f2elm_t *c);
-
-// GF(p434^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
-void fp2inv434_mont(f2elm_t *a);
-
-/************ Elliptic curve and isogeny functions *************/
-
-// Computes the j-invariant of a Montgomery curve with projective constant.
-void j_inv(const f2elm_t *A, const f2elm_t *C, f2elm_t *jinv);
-
-// Simultaneous doubling and differential addition.
-void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t *xPQ, const f2elm_t *A24);
-
-// Doubling of a Montgomery point in projective coordinates (X:Z).
-void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24);
-
-// Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings.
-void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24, const int e);
-
-// Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4.
-void get_4_isog(const point_proj_t P, f2elm_t *A24plus, f2elm_t *C24, f2elm_t *coeff);
-
-// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny.
-void eval_4_isog(point_proj_t P, f2elm_t *coeff);
-
-// Tripling of a Montgomery point in projective coordinates (X:Z).
-void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus);
-
-// Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings.
-void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus, const int e);
-
-// Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3.
-void get_3_isog(const point_proj_t P, f2elm_t *A24minus, f2elm_t *A24plus, f2elm_t *coeff);
-
-// Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and a point P with coefficients given in coeff.
-void eval_3_isog(point_proj_t Q, const f2elm_t *coeff);
-
-// 3-way simultaneous inversion
-void inv_3_way(f2elm_t *z1, f2elm_t *z2, f2elm_t *z3);
-
-// Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A.
-void get_A(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xR, f2elm_t *A);
-
-#endif
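
The per-target NWORDS_FIELD values in the header above are just ceil(NBITS_FIELD / RADIX): a 434-bit element occupies 7 words at RADIX = 64 and 14 at RADIX = 32, and MAXWORDS_FIELD rounds the 448-bit storage bound the same way. A small self-contained check of that derivation (illustrative sketch, not library code):

#include <assert.h>

/* ceil(nbits / radix), mirroring the NWORDS_* macros above. */
static unsigned words_for_bits(unsigned nbits, unsigned radix) {
    return (nbits + radix - 1) / radix;
}

int main(void) {
    assert(words_for_bits(434, 64) == 7);   /* NWORDS_FIELD on 64-bit targets */
    assert(words_for_bits(434, 32) == 14);  /* NWORDS_FIELD on 32-bit targets */
    assert(words_for_bits(448, 64) == 7);   /* MAXWORDS_FIELD at RADIX = 64   */
    assert(words_for_bits(256, 64) == 4);   /* NWORDS_ORDER at RADIX = 64     */
    return 0;
}
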
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/config.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/config.h
deleted file mode 100644
index 6199e5a708..0000000000
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/config.h
+++ /dev/null
@@ -1,218 +0,0 @@
-/********************************************************************************************
-* SIDH: an efficient supersingular isogeny cryptography library
-*
-* Abstract: configuration file and platform-dependent macros
-*********************************************************************************************/
-
-#ifndef SIKE_CONFIG_H
-#define SIKE_CONFIG_H
-
-#include <stdint.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-// Definition of operating system
-
-#define OS_WIN 1
-#define OS_LINUX 2
-
-#if defined(_WIN32) // Microsoft Windows OS
-#define OS_TARGET OS_WIN
-#else
-#define OS_TARGET OS_LINUX // default to Linux
-#endif
-
-// Definition of compiler (removed in OQS)
-
-#define COMPILER_GCC 1
-#define COMPILER_CLANG 2
-
-#if defined(__GNUC__) // GNU GCC compiler
-#define COMPILER COMPILER_GCC
-#elif defined(__clang__) // Clang compiler
-#define COMPILER COMPILER_CLANG
-#else
-#error -- "Unsupported COMPILER"
-#endif
-
-// Definition of the targeted architecture and basic data types
-#define TARGET_AMD64 1
-#define TARGET_x86 2
-#define TARGET_ARM 3
-#define TARGET_ARM64 4
-
-#if defined(__x86_64__)
-#define TARGET TARGET_AMD64
-#define RADIX 64
-#define LOG2RADIX 6
-typedef uint64_t digit_t; // Unsigned 64-bit digit
-typedef uint32_t hdigit_t; // Unsigned 32-bit digit
-#elif defined(__i386__)
-#define TARGET TARGET_x86
-#define RADIX 32
-#define LOG2RADIX 5
-typedef uint32_t digit_t; // Unsigned 32-bit digit
-typedef uint16_t hdigit_t; // Unsigned 16-bit digit
-#elif defined(__arm__)
-#define TARGET TARGET_ARM
-#define RADIX 32
-#define LOG2RADIX 5
-typedef uint32_t digit_t; // Unsigned 32-bit digit
-typedef uint16_t hdigit_t; // Unsigned 16-bit digit
-#elif defined(__aarch64__)
-#define TARGET TARGET_ARM64
-#define RADIX 64
-#define LOG2RADIX 6
-typedef uint64_t digit_t; // Unsigned 64-bit digit
-typedef uint32_t hdigit_t; // Unsigned 32-bit digit
-#else
-#error -- "Unsupported ARCHITECTURE"
-#endif
-
-#define RADIX64 64
-
-// Extended datatype support
-#if !defined(S2N_SIKEP434R2_ASM)
-typedef uint64_t uint128_t[2];
-#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_LINUX)
-typedef unsigned uint128_t __attribute__((mode(TI)));
-#elif (TARGET == TARGET_ARM64 && OS_TARGET == OS_LINUX)
-typedef unsigned uint128_t __attribute__((mode(TI)));
-#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_WIN)
-typedef uint64_t uint128_t[2];
-#endif
-
-// Macro definitions
-
-#define NBITS_TO_NBYTES(nbits) (((nbits) + 7) / 8) // Conversion macro from number of bits to number of bytes
-#define NBITS_TO_NWORDS(nbits) (((nbits) + (sizeof(digit_t) * 8) - 1) / (sizeof(digit_t) * 8)) // Conversion macro from number of bits to number of computer words
-#define NBYTES_TO_NWORDS(nbytes) (((nbytes) + sizeof(digit_t) - 1) / sizeof(digit_t)) // Conversion macro from number of bytes to number of computer words
-
-// Macro to avoid compiler warnings when detecting unreferenced parameters
-#define UNREFERENCED_PARAMETER(PAR) ((void) (PAR))
-
-/********************** Constant-time unsigned comparisons ***********************/
-
-// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise
-
-unsigned int is_digit_nonzero_ct(digit_t x) { // Is x != 0?
- return (unsigned int) ((x | (0 - x)) >> (RADIX - 1));
-}
-
-unsigned int is_digit_zero_ct(digit_t x) { // Is x = 0?
- return (unsigned int) (1 ^ is_digit_nonzero_ct(x));
-}
-
-unsigned int is_digit_lessthan_ct(digit_t x, digit_t y) { // Is x < y?
- return (unsigned int) ((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX - 1));
-}
-
-/********************** Macros for platform-dependent operations **********************/
-
-#if (!defined(S2N_SIKEP434R2_ASM)) || (TARGET == TARGET_ARM)
-
-// Digit multiplication
-#define MUL(multiplier, multiplicand, hi, lo) \
- digit_x_digit((multiplier), (multiplicand), &(lo));
-
-// Digit addition with carry
-#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \
- { \
- digit_t tempReg = (addend1) + (digit_t)(carryIn); \
- (sumOut) = (addend2) + tempReg; \
- (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); \
- }
-
-// Digit subtraction with borrow
-#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \
- { \
- digit_t tempReg = (minuend) - (subtrahend); \
- unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) &is_digit_zero_ct(tempReg))); \
- (differenceOut) = tempReg - (digit_t)(borrowIn); \
- (borrowOut) = borrowReg; \
- }
-
-// Shift right with flexible datatype
-#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \
- (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (DigitSize - (shift)));
-
-// Shift left with flexible datatype
-#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \
- (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (DigitSize - (shift)));
-
-#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_WIN)
-
-// Digit multiplication
-#define MUL(multiplier, multiplicand, hi, lo) \
- (lo) = _umul128((multiplier), (multiplicand), (hi));
-
-// Digit addition with carry
-#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \
- (carryOut) = _addcarry_u64((carryIn), (addend1), (addend2), &(sumOut));
-
-// Digit subtraction with borrow
-#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \
- (borrowOut) = _subborrow_u64((borrowIn), (minuend), (subtrahend), &(differenceOut));
-
-// Digit shift right
-#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \
- (shiftOut) = __shiftright128((lowIn), (highIn), (shift));
-
-// Digit shift left
-#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \
- (shiftOut) = __shiftleft128((lowIn), (highIn), (shift));
-
-// 64x64-bit multiplication
-#define MUL128(multiplier, multiplicand, product) \
- (product)[0] = _umul128((multiplier), (multiplicand), &(product)[1]);
-
-// 128-bit addition with output carry
-#define ADC128(addend1, addend2, carry, addition) \
- (carry) = _addcarry_u64(0, (addend1)[0], (addend2)[0], &(addition)[0]); \
- (carry) = _addcarry_u64((carry), (addend1)[1], (addend2)[1], &(addition)[1]);
-
-#define MULADD128(multiplier, multiplicand, addend, carry, result) \
- ; \
- { \
- uint128_t product; \
- MUL128(multiplier, multiplicand, product); \
- ADC128(addend, product, carry, result); \
- }
-
-#elif ((TARGET == TARGET_AMD64 || TARGET == TARGET_ARM64) && OS_TARGET == OS_LINUX)
-
-// Digit multiplication
-#define MUL(multiplier, multiplicand, hi, lo) \
- { \
- uint128_t tempReg = (uint128_t)(multiplier) * (uint128_t)(multiplicand); \
- *(hi) = (digit_t)(tempReg >> RADIX); \
- (lo) = (digit_t) tempReg; \
- }
-
-// Digit addition with carry
-#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \
- { \
- uint128_t tempReg = (uint128_t)(addend1) + (uint128_t)(addend2) + (uint128_t)(carryIn); \
- (carryOut) = (digit_t)(tempReg >> RADIX); \
- (sumOut) = (digit_t) tempReg; \
- }
-
-// Digit subtraction with borrow
-#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \
- { \
- uint128_t tempReg = (uint128_t)(minuend) - (uint128_t)(subtrahend) - (uint128_t)(borrowIn); \
- (borrowOut) = (digit_t)(tempReg >> (sizeof(uint128_t) * 8 - 1)); \
- (differenceOut) = (digit_t) tempReg; \
- }
-
-// Digit shift right
-#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \
- (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (RADIX - (shift)));
-
-// Digit shift left
-#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \
- (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (RADIX - (shift)));
-
-#endif
-
-#endif
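
The constant-time predicates and the portable ADDC macro in the configuration header above are the core of every multiprecision routine in these files: comparisons are computed from the sign bit of a borrow rather than a branch, and carries are recovered by checking whether an addition wrapped. A standalone sketch of the same two-digit carry chain at RADIX = 64 (local names, assuming nothing beyond standard C):

#include <assert.h>
#include <stdint.h>

typedef uint64_t digit_t;
#define RADIX 64

/* Same construction as is_digit_lessthan_ct() above: the borrow's sign bit,
 * extracted without branching on (potentially secret) operands. */
static unsigned lt_ct(digit_t x, digit_t y) {
    return (unsigned) ((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX - 1));
}

/* Two-digit addition following the portable ADDC pattern; returns the carry-out. */
static unsigned add2(const digit_t a[2], const digit_t b[2], digit_t c[2]) {
    unsigned carry = 0;
    for (int i = 0; i < 2; i++) {
        digit_t t = a[i] + (digit_t) carry;          /* may wrap: t < carry */
        c[i] = b[i] + t;                             /* may wrap: c[i] < t  */
        carry = lt_ct(t, (digit_t) carry) | lt_ct(c[i], t);
    }
    return carry;
}

int main(void) {
    digit_t a[2] = { UINT64_MAX, 0 }, b[2] = { 1, 0 }, c[2];
    assert(add2(a, b, c) == 0);          /* (2^64 - 1) + 1 = 2^64               */
    assert(c[0] == 0 && c[1] == 1);      /* carry propagated into the top digit */
    assert(lt_ct(3, 5) == 1 && lt_ct(5, 3) == 0);
    return 0;
}
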
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/ec_isogeny.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/ec_isogeny.c
deleted file mode 100644
index 8a3f85e92b..0000000000
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/ec_isogeny.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/********************************************************************************************
-* SIDH: an efficient supersingular isogeny cryptography library
-*
-* Abstract: elliptic curve and isogeny functions
-*********************************************************************************************/
-
-void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24) { // Doubling of a Montgomery point in projective coordinates (X:Z).
- // Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C.
- // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2).
- f2elm_t _t0, _t1;
- f2elm_t *t0=&_t0, *t1=&_t1;
-
- fp2sub(&P->X, &P->Z, t0); // t0 = X1-Z1
- fp2add(&P->X, &P->Z, t1); // t1 = X1+Z1
- fp2sqr_mont(t0, t0); // t0 = (X1-Z1)^2
- fp2sqr_mont(t1, t1); // t1 = (X1+Z1)^2
- fp2mul_mont(C24, t0, &Q->Z); // Z2 = C24*(X1-Z1)^2
- fp2mul_mont(t1, &Q->Z, &Q->X); // X2 = C24*(X1-Z1)^2*(X1+Z1)^2
- fp2sub(t1, t0, t1); // t1 = (X1+Z1)^2-(X1-Z1)^2
- fp2mul_mont(A24plus, t1, t0); // t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2]
- fp2add(&Q->Z, t0, &Q->Z); // Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2
- fp2mul_mont(&Q->Z, t1, &Q->Z); // Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2]
-}
-
-void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24, const int e) { // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings.
- // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C.
- // Output: projective Montgomery x-coordinates Q <- (2^e)*P.
- int i;
-
- copy_words((const digit_t *) P, (digit_t *) Q, 2 * 2 * NWORDS_FIELD);
-
- for (i = 0; i < e; i++) {
- xDBL(Q, Q, A24plus, C24);
- }
-}
-
-void get_4_isog(const point_proj_t P, f2elm_t *A24plus, f2elm_t *C24, f2elm_t *coeff) { // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4.
- // Input: projective point of order four P = (X4:Z4).
- // Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients
- // that are used to evaluate the isogeny at a point in eval_4_isog().
-
- fp2sub(&P->X, &P->Z, &coeff[1]); // coeff[1] = X4-Z4
- fp2add(&P->X, &P->Z, &coeff[2]); // coeff[2] = X4+Z4
- fp2sqr_mont(&P->Z, &coeff[0]); // coeff[0] = Z4^2
- fp2add(&coeff[0], &coeff[0], &coeff[0]); // coeff[0] = 2*Z4^2
- fp2sqr_mont(&coeff[0], C24); // C24 = 4*Z4^4
- fp2add(&coeff[0], &coeff[0], &coeff[0]); // coeff[0] = 4*Z4^2
- fp2sqr_mont(&P->X, A24plus); // A24plus = X4^2
- fp2add(A24plus, A24plus, A24plus); // A24plus = 2*X4^2
- fp2sqr_mont(A24plus, A24plus); // A24plus = 4*X4^4
-}
-
-void eval_4_isog(point_proj_t P, f2elm_t *coeff) { // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined
- // by the 3 coefficients in coeff (computed in the function get_4_isog()).
- // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z).
- // Output: the projective point P = phi(P) = (X:Z) in the codomain.
- f2elm_t _t0, _t1;
- f2elm_t *t0=&_t0, *t1=&_t1;
-
- fp2add(&P->X, &P->Z, t0); // t0 = X+Z
- fp2sub(&P->X, &P->Z, t1); // t1 = X-Z
- fp2mul_mont(t0, &coeff[1], &P->X); // X = (X+Z)*coeff[1]
- fp2mul_mont(t1, &coeff[2], &P->Z); // Z = (X-Z)*coeff[2]
- fp2mul_mont(t0, t1, t0); // t0 = (X+Z)*(X-Z)
- fp2mul_mont(t0, &coeff[0], t0); // t0 = coeff[0]*(X+Z)*(X-Z)
- fp2add(&P->X, &P->Z, t1); // t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1]
- fp2sub(&P->X, &P->Z, &P->Z); // Z = (X-Z)*coeff[2] - (X+Z)*coeff[1]
- fp2sqr_mont(t1, t1); // t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2
- fp2sqr_mont(&P->Z, &P->Z); // Z = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2
- fp2add(t1, t0, &P->X); // X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2
- fp2sub(&P->Z, t0, t0); // t0 = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z)
- fp2mul_mont(&P->X, t1, &P->X); // Xfinal
- fp2mul_mont(&P->Z, t0, &P->Z); // Zfinal
-}
-
-void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus) { // Tripling of a Montgomery point in projective coordinates (X:Z).
- // Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
- // Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3).
- f2elm_t _t0, _t1, _t2, _t3, _t4, _t5, _t6;
- f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3, *t4=&_t4, *t5=&_t5, *t6=&_t6;
-
- fp2sub(&P->X, &P->Z, t0); // t0 = X-Z
- fp2sqr_mont(t0, t2); // t2 = (X-Z)^2
- fp2add(&P->X, &P->Z, t1); // t1 = X+Z
- fp2sqr_mont(t1, t3); // t3 = (X+Z)^2
- fp2add(t0, t1, t4); // t4 = 2*X
- fp2sub(t1, t0, t0); // t0 = 2*Z
- fp2sqr_mont(t4, t1); // t1 = 4*X^2
- fp2sub(t1, t3, t1); // t1 = 4*X^2 - (X+Z)^2
- fp2sub(t1, t2, t1); // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2
- fp2mul_mont(t3, A24plus, t5); // t5 = A24plus*(X+Z)^2
- fp2mul_mont(t3, t5, t3); // t3 = A24plus*(X+Z)^3
- fp2mul_mont(A24minus, t2, t6); // t6 = A24minus*(X-Z)^2
- fp2mul_mont(t2, t6, t2); // t2 = A24minus*(X-Z)^3
- fp2sub(t2, t3, t3); // t3 = A24minus*(X-Z)^3 - A24plus*(X+Z)^3
- fp2sub(t5, t6, t2); // t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2
- fp2mul_mont(t1, t2, t1); // t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2]
- fp2add(t3, t1, t2); // t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^3 - A24plus*(X+Z)^3
- fp2sqr_mont(t2, t2); // t2 = t2^2
- fp2mul_mont(t4, t2, &Q->X); // X3 = 2*X*t2
- fp2sub(t3, t1, t1); // t1 = A24minus*(X-Z)^3 - A24plus*(X+Z)^3 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2]
- fp2sqr_mont(t1, t1); // t1 = t1^2
- fp2mul_mont(t0, t1, &Q->Z); // Z3 = 2*Z*t1
-}
-
-void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus, const int e) { // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings.
- // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
- // Output: projective Montgomery x-coordinates Q <- (3^e)*P.
- int i;
-
- copy_words((const digit_t *) P, (digit_t *) Q, 2 * 2 * NWORDS_FIELD);
-
- for (i = 0; i < e; i++) {
- xTPL(Q, Q, A24minus, A24plus);
- }
-}
-
-void get_3_isog(const point_proj_t P, f2elm_t *A24minus, f2elm_t *A24plus, f2elm_t *coeff) { // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3.
- // Input: projective point of order three P = (X3:Z3).
- // Output: the 3-isogenous Montgomery curve with projective coefficient A/C and the 2 coefficients used to evaluate the isogeny at a point in eval_3_isog().
- f2elm_t _t0, _t1, _t2, _t3, _t4;
- f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3, *t4=&_t4;
-
- fp2sub(&P->X, &P->Z, &coeff[0]); // coeff0 = X-Z
- fp2sqr_mont(&coeff[0], t0); // t0 = (X-Z)^2
- fp2add(&P->X, &P->Z, &coeff[1]); // coeff1 = X+Z
- fp2sqr_mont(&coeff[1], t1); // t1 = (X+Z)^2
- fp2add(t0, t1, t2); // t2 = (X+Z)^2 + (X-Z)^2
- fp2add(&coeff[0], &coeff[1], t3); // t3 = 2*X
- fp2sqr_mont(t3, t3); // t3 = 4*X^2
- fp2sub(t3, t2, t3); // t3 = 4*X^2 - (X+Z)^2 - (X-Z)^2
- fp2add(t1, t3, t2); // t2 = 4*X^2 - (X-Z)^2
- fp2add(t3, t0, t3); // t3 = 4*X^2 - (X+Z)^2
- fp2add(t0, t3, t4); // t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2
- fp2add(t4, t4, t4); // t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2)
- fp2add(t1, t4, t4); // t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2
- fp2mul_mont(t2, t4, A24minus); // A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2]
- fp2add(t1, t2, t4); // t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2
- fp2add(t4, t4, t4); // t4 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2)
- fp2add(t0, t4, t4); // t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2
- fp2mul_mont(t3, t4, A24plus); // A24plus = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2]
-}
-
-void eval_3_isog(point_proj_t Q, const f2elm_t *coeff) { // Evaluates the 3-isogeny phi at the projective point Q = (X:Z), given the
- // 2 coefficients in coeff (computed in the function get_3_isog() from a point of order 3).
- // Inputs: the projective point Q = (X:Z) and the isogeny coefficients coeff.
- // Output: the projective point Q <- phi(Q) = (X3:Z3).
- f2elm_t _t0, _t1, _t2;
- f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2;
-
- fp2add(&Q->X, &Q->Z, t0); // t0 = X+Z
- fp2sub(&Q->X, &Q->Z, t1); // t1 = X-Z
- fp2mul_mont(t0, &coeff[0], t0); // t0 = coeff0*(X+Z)
- fp2mul_mont(t1, &coeff[1], t1); // t1 = coeff1*(X-Z)
- fp2add(t0, t1, t2); // t2 = coeff0*(X+Z) + coeff1*(X-Z)
- fp2sub(t1, t0, t0); // t0 = coeff1*(X-Z) - coeff0*(X+Z)
- fp2sqr_mont(t2, t2); // t2 = [coeff0*(X+Z) + coeff1*(X-Z)]^2
- fp2sqr_mont(t0, t0); // t0 = [coeff1*(X-Z) - coeff0*(X+Z)]^2
- fp2mul_mont(&Q->X, t2, &Q->X); // X3final = X*[coeff0*(X+Z) + coeff1*(X-Z)]^2
- fp2mul_mont(&Q->Z, t0, &Q->Z); // Z3final = Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2
-}
-
-void inv_3_way(f2elm_t *z1, f2elm_t *z2, f2elm_t *z3) { // 3-way simultaneous inversion
- // Input: z1,z2,z3
- // Output: 1/z1,1/z2,1/z3 (overwriting the inputs).
- f2elm_t _t0, _t1, _t2, _t3;
- f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3;
-
- fp2mul_mont(z1, z2, t0); // t0 = z1*z2
- fp2mul_mont(z3, t0, t1); // t1 = z1*z2*z3
- fp2inv_mont(t1); // t1 = 1/(z1*z2*z3)
- fp2mul_mont(z3, t1, t2); // t2 = 1/(z1*z2)
- fp2mul_mont(t2, z2, t3); // t3 = 1/z1
- fp2mul_mont(t2, z1, z2); // z2 = 1/z2
- fp2mul_mont(t0, t1, z3); // z3 = 1/z3
- fp2copy(t3, z1); // z1 = 1/z1
-}
-
-void get_A(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xR, f2elm_t *A) { // Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A.
- // Input: the x-coordinates xP, xQ, and xR of the points P, Q and R.
- // Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x.
- f2elm_t _t0, _t1, one = {0};
- f2elm_t *t0=&_t0, *t1=&_t1;
-
- fpcopy((const digit_t *) &Montgomery_one, one.e[0]);
- fp2add(xP, xQ, t1); // t1 = xP+xQ
- fp2mul_mont(xP, xQ, t0); // t0 = xP*xQ
- fp2mul_mont(xR, t1, A); // A = xR*t1
- fp2add(t0, A, A); // A = A+t0
- fp2mul_mont(t0, xR, t0); // t0 = t0*xR
- fp2sub(A, &one, A); // A = A-1
- fp2add(t0, t0, t0); // t0 = t0+t0
- fp2add(t1, xR, t1); // t1 = t1+xR
- fp2add(t0, t0, t0); // t0 = t0+t0
- fp2sqr_mont(A, A); // A = A^2
- fp2inv_mont(t0); // t0 = 1/t0
- fp2mul_mont(A, t0, A); // A = A*t0
- fp2sub(A, t1, A); // Afinal = A-t1
-}
-
-void j_inv(const f2elm_t *A, const f2elm_t *C, f2elm_t *jinv) { // Computes the j-invariant of a Montgomery curve with projective constant.
- // Input: A,C in GF(p^2).
- // Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery curve B*y^2=x^3+(A/C)*x^2+x or (equivalently) j-invariant of B'*y^2=C*x^3+A*x^2+C*x.
- f2elm_t _t0, _t1;
- f2elm_t *t0=&_t0, *t1=&_t1;
-
- fp2sqr_mont(A, jinv); // jinv = A^2
- fp2sqr_mont(C, t1); // t1 = C^2
- fp2add(t1, t1, t0); // t0 = t1+t1
- fp2sub(jinv, t0, t0); // t0 = jinv-t0
- fp2sub(t0, t1, t0); // t0 = t0-t1
- fp2sub(t0, t1, jinv); // jinv = t0-t1
- fp2sqr_mont(t1, t1); // t1 = t1^2
- fp2mul_mont(jinv, t1, jinv); // jinv = jinv*t1
- fp2add(t0, t0, t0); // t0 = t0+t0
- fp2add(t0, t0, t0); // t0 = t0+t0
- fp2sqr_mont(t0, t1); // t1 = t0^2
- fp2mul_mont(t0, t1, t0); // t0 = t0*t1
- fp2add(t0, t0, t0); // t0 = t0+t0
- fp2add(t0, t0, t0); // t0 = t0+t0
- fp2inv_mont(jinv); // jinv = 1/jinv
- fp2mul_mont(jinv, t0, jinv); // jinv = t0*jinv
-}
-
-void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t *xPQ, const f2elm_t *A24) { // Simultaneous doubling and differential addition.
- // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4.
- // Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and Q <- P+Q = (XQP:ZQP) such that x(Q+P)=XQP/ZQP.
- f2elm_t _t0, _t1, _t2;
- f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2;
-
- fp2add(&P->X, &P->Z, t0); // t0 = XP+ZP
- fp2sub(&P->X, &P->Z, t1); // t1 = XP-ZP
- fp2sqr_mont(t0, &P->X); // XP = (XP+ZP)^2
- fp2sub(&Q->X, &Q->Z, t2); // t2 = XQ-ZQ
- fp2correction(t2);
- fp2add(&Q->X, &Q->Z, &Q->X); // XQ = XQ+ZQ
- fp2mul_mont(t0, t2, t0); // t0 = (XP+ZP)*(XQ-ZQ)
- fp2sqr_mont(t1, &P->Z); // ZP = (XP-ZP)^2
- fp2mul_mont(t1, &Q->X, t1); // t1 = (XP-ZP)*(XQ+ZQ)
- fp2sub(&P->X, &P->Z, t2); // t2 = (XP+ZP)^2-(XP-ZP)^2
- fp2mul_mont(&P->X, &P->Z, &P->X); // XP = (XP+ZP)^2*(XP-ZP)^2
- fp2mul_mont(t2, A24, &Q->X); // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2]
- fp2sub(t0, t1, &Q->Z); // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)
- fp2add(&Q->X, &P->Z, &P->Z); // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2
- fp2add(t0, t1, &Q->X); // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)
- fp2mul_mont(&P->Z, t2, &P->Z); // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2]
- fp2sqr_mont(&Q->Z, &Q->Z); // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2
- fp2sqr_mont(&Q->X, &Q->X); // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2
- fp2mul_mont(&Q->Z, xPQ, &Q->Z); // ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2
-}
-
-static void swap_points(point_proj_t P, point_proj_t Q, const digit_t option) { // Swap points.
- // If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P
- for (unsigned int i = 0; i < NWORDS_FIELD; i++) {
- digit_t temp = option & (P->X.e[0][i] ^ Q->X.e[0][i]);
- P->X.e[0][i] = temp ^ P->X.e[0][i];
- Q->X.e[0][i] = temp ^ Q->X.e[0][i];
- temp = option & (P->Z.e[0][i] ^ Q->Z.e[0][i]);
- P->Z.e[0][i] = temp ^ P->Z.e[0][i];
- Q->Z.e[0][i] = temp ^ Q->Z.e[0][i];
- temp = option & (P->X.e[1][i] ^ Q->X.e[1][i]);
- P->X.e[1][i] = temp ^ P->X.e[1][i];
- Q->X.e[1][i] = temp ^ Q->X.e[1][i];
- temp = option & (P->Z.e[1][i] ^ Q->Z.e[1][i]);
- P->Z.e[1][i] = temp ^ P->Z.e[1][i];
- Q->Z.e[1][i] = temp ^ Q->Z.e[1][i];
- }
-}
-
-void LADDER3PT(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xPQ, const digit_t *m, const unsigned int AliceOrBob, point_proj_t R, const f2elm_t *A) {
- point_proj_t R0 = {0}, R2 = {0};
- f2elm_t _A24 = {0};
- f2elm_t *A24=&_A24;
- digit_t mask;
- int i, nbits, swap, prevbit = 0;
-
- if (AliceOrBob == ALICE) {
- nbits = OALICE_BITS;
- } else {
- nbits = OBOB_BITS - 1;
- }
-
- // Initializing constant
- fpcopy((const digit_t *) &Montgomery_one, A24->e[0]);
- fp2add(A24, A24, A24);
- fp2add(A, A24, A24);
- fp2div2(A24, A24);
- fp2div2(A24, A24); // A24 = (A+2)/4
-
- // Initializing points
- fp2copy(xQ, &R0->X);
- fpcopy((const digit_t *) &Montgomery_one, (digit_t *) R0->Z.e);
- fp2copy(xPQ, &R2->X);
- fpcopy((const digit_t *) &Montgomery_one, (digit_t *) R2->Z.e);
- fp2copy(xP, &R->X);
- fpcopy((const digit_t *) &Montgomery_one, (digit_t *) R->Z.e);
- fpzero((digit_t *) (R->Z.e)[1]);
-
- // Main loop
- for (i = 0; i < nbits; i++) {
- int bit = (m[i >> LOG2RADIX] >> (i & (RADIX - 1))) & 1;
- swap = bit ^ prevbit;
- prevbit = bit;
- mask = 0 - (digit_t) swap;
-
- swap_points(R, R2, mask);
- xDBLADD(R0, R2, &R->X, A24);
- fp2mul_mont(&R2->X, &R->Z, &R2->X);
- }
- swap = 0 ^ prevbit;
- mask = 0 - (digit_t) swap;
- swap_points(R, R2, mask);
-}
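
LADDER3PT above reads the secret scalar one bit per iteration and calls swap_points() with a mask of 0 or all ones, so the sequence of loads and stores never depends on the key. A reduced sketch of that masked-XOR conditional swap on single words (illustrative only):

#include <assert.h>
#include <stdint.h>

typedef uint64_t digit_t;

/* Conditional swap via masked XOR, as in swap_points() above:
 * mask == 0 is a no-op; mask == ~0 exchanges x and y. */
static void cswap(digit_t *x, digit_t *y, digit_t mask) {
    digit_t t = mask & (*x ^ *y);
    *x ^= t;
    *y ^= t;
}

int main(void) {
    digit_t a = 7, b = 9;
    int bit = 1, prevbit = 0;
    int swap = bit ^ prevbit;               /* the ladder swaps only when the bit changes */
    cswap(&a, &b, 0 - (digit_t) swap);      /* mask = all ones here */
    assert(a == 9 && b == 7);
    cswap(&a, &b, 0);                       /* mask = 0: state unchanged */
    assert(a == 9 && b == 7);
    return 0;
}
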
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.h
deleted file mode 100644
index 1196bff2c0..0000000000
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef FIPS202_H
-#define FIPS202_H
-
-#define SHAKE256_RATE 136
-
-/** Data structure for the state of the SHAKE-256 non-incremental hashing API. */
-typedef struct {
-/** Internal state. */
- uint64_t ctx[25];
-} shake256_ctx;
-
-void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen);
-
-#endif // FIPS202_H
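
The two constants in the header above are standard Keccak parameters: the sponge state is 1600 bits (the uint64_t ctx[25] above), and SHAKE-256 absorbs 1600 - 2*256 = 1088 bits = 136 bytes per permutation call, which is SHAKE256_RATE. A quick arithmetic check (illustrative sketch only):

#include <assert.h>

int main(void) {
    const int keccak_state_bits = 1600;      /* 25 lanes x 64 bits = ctx[25]      */
    const int capacity_bits = 2 * 256;       /* twice the 256-bit security level  */
    assert(keccak_state_bits / 64 == 25);
    assert((keccak_state_bits - capacity_bits) / 8 == 136);  /* SHAKE256_RATE */
    return 0;
}
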
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fp.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fp.c
deleted file mode 100644
index 0e09ce25a0..0000000000
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fp.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/********************************************************************************************
-* SIDH: an efficient supersingular isogeny cryptography library
-*
-* Abstract: Portable C and x86_64 ASM functions for modular arithmetic for P434
-*********************************************************************************************/
-
-#include "P434_internal.h"
-
-// Modular addition, c = a+b mod p434.
-// Inputs: a, b in [0, 2*p434-1]
-// Output: c in [0, 2*p434-1]
-void fpadd434(const digit_t *a, const digit_t *b, digit_t *c) {
-#if defined(S2N_SIKEP434R2_ASM)
- if (s2n_sikep434r2_asm_is_enabled()) {
- fpadd434_asm(a, b, c);
- return;
- }
-#endif
-
- unsigned int i, carry = 0;
- digit_t mask;
-
- for (i = 0; i < NWORDS_FIELD; i++) {
- ADDC(carry, a[i], b[i], carry, c[i]);
- }
-
- carry = 0;
- for (i = 0; i < NWORDS_FIELD; i++) {
- SUBC(carry, c[i], ((const digit_t *) p434x2)[i], carry, c[i]);
- }
- mask = 0 - (digit_t) carry;
-
- carry = 0;
- for (i = 0; i < NWORDS_FIELD; i++) {
- ADDC(carry, c[i], ((const digit_t *) p434x2)[i] & mask, carry, c[i]);
- }
-}
-
-// Modular subtraction, c = a-b mod p434.
-// Inputs: a, b in [0, 2*p434-1]
-// Output: c in [0, 2*p434-1]
-void fpsub434(const digit_t *a, const digit_t *b, digit_t *c) {
-#if defined(S2N_SIKEP434R2_ASM)
- if (s2n_sikep434r2_asm_is_enabled()) {
- fpsub434_asm(a, b, c);
- return;
- }
-#endif
-
- unsigned int i, borrow = 0;
- digit_t mask;
-
- for (i = 0; i < NWORDS_FIELD; i++) {
- SUBC(borrow, a[i], b[i], borrow, c[i]);
- }
- mask = 0 - (digit_t) borrow;
-
- borrow = 0;
- for (i = 0; i < NWORDS_FIELD; i++) {
- ADDC(borrow, c[i], ((const digit_t *) p434x2)[i] & mask, borrow, c[i]);
- }
-}
-
-// Modular negation, a = -a mod p434.
-// Input/output: a in [0, 2*p434-1]
-void fpneg434(digit_t *a) {
- unsigned int i, borrow = 0;
-
- for (i = 0; i < NWORDS_FIELD; i++) {
- SUBC(borrow, ((const digit_t *) p434x2)[i], a[i], borrow, a[i]);
- }
-}
-
-// Modular division by two, c = a/2 mod p434.
-// Input : a in [0, 2*p434-1]
-// Output: c in [0, 2*p434-1]
-void fpdiv2_434(const digit_t *a, digit_t *c) {
- unsigned int i, carry = 0;
- digit_t mask;
-
- mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p434
- for (i = 0; i < NWORDS_FIELD; i++) {
- ADDC(carry, a[i], ((const digit_t *) p434)[i] & mask, carry, c[i]);
- }
-
- mp_shiftr1(c, NWORDS_FIELD);
-}
-
-// Modular correction to reduce field element a in [0, 2*p434-1] to [0, p434-1].
-void fpcorrection434(digit_t *a) {
- unsigned int i, borrow = 0;
- digit_t mask;
-
- for (i = 0; i < NWORDS_FIELD; i++) {
- SUBC(borrow, a[i], ((const digit_t *) p434)[i], borrow, a[i]);
- }
- mask = 0 - (digit_t) borrow;
-
- borrow = 0;
- for (i = 0; i < NWORDS_FIELD; i++) {
- ADDC(borrow, a[i], ((const digit_t *) p434)[i] & mask, borrow, a[i]);
- }
-}
-
-// Digit multiplication, digit * digit -> 2-digit result
-void digit_x_digit(const digit_t a, const digit_t b, digit_t *c) {
- register digit_t al, ah, bl, bh, temp;
- digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry;
- digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t) * 4), mask_high = (digit_t)(-1) << (sizeof(digit_t) * 4);
-
- al = a & mask_low; // Low part
- ah = a >> (sizeof(digit_t) * 4); // High part
- bl = b & mask_low;
- bh = b >> (sizeof(digit_t) * 4);
-
- albl = al * bl;
- albh = al * bh;
- ahbl = ah * bl;
- ahbh = ah * bh;
- c[0] = albl & mask_low; // C00
-
- res1 = albl >> (sizeof(digit_t) * 4);
- res2 = ahbl & mask_low;
- res3 = albh & mask_low;
- temp = res1 + res2 + res3;
- carry = temp >> (sizeof(digit_t) * 4);
- c[0] ^= temp << (sizeof(digit_t) * 4); // C01
-
- res1 = ahbl >> (sizeof(digit_t) * 4);
- res2 = albh >> (sizeof(digit_t) * 4);
- res3 = ahbh & mask_low;
- temp = res1 + res2 + res3 + carry;
- c[1] = temp & mask_low; // C10
- carry = temp & mask_high;
- c[1] ^= (ahbh & mask_high) + carry; // C11
-}
-
-// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords.
-void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) {
-#if defined(S2N_SIKEP434R2_ASM)
- if (s2n_sikep434r2_asm_is_enabled()) {
- UNREFERENCED_PARAMETER(nwords);
- mul434_asm(a, b, c);
- return;
- }
-#endif
-
- unsigned int i, j, carry;
- digit_t t = 0, u = 0, v = 0, UV[2];
-
- for (i = 0; i < nwords; i++) {
- for (j = 0; j <= i; j++) {
- MUL(a[j], b[i - j], UV + 1, UV[0]);
- ADDC(0, UV[0], v, carry, v);
- ADDC(carry, UV[1], u, carry, u);
- t += carry;
- }
- c[i] = v;
- v = u;
- u = t;
- t = 0;
- }
-
- for (i = nwords; i < 2 * nwords - 1; i++) {
- for (j = i - nwords + 1; j < nwords; j++) {
- MUL(a[j], b[i - j], UV + 1, UV[0]);
- ADDC(0, UV[0], v, carry, v);
- ADDC(carry, UV[1], u, carry, u);
- t += carry;
- }
- c[i] = v;
- v = u;
- u = t;
- t = 0;
- }
- c[2 * nwords - 1] = v;
-}
-
-// Efficient Montgomery reduction using comba and exploiting the special form of the prime p434.
-// mc = ma*R^-1 mod p434x2, where R = 2^448.
-// If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1].
-// ma is assumed to be in Montgomery representation.
-void rdc_mont(const digit_t *ma, digit_t *mc) {
-#if defined(S2N_SIKEP434R2_ASM)
- if (s2n_sikep434r2_asm_is_enabled()) {
- rdc434_asm(ma, mc);
- return;
- }
-#endif
-
- unsigned int i, j, carry, count = p434_ZERO_WORDS;
- digit_t UV[2], t = 0, u = 0, v = 0;
-
- for (i = 0; i < NWORDS_FIELD; i++) {
- mc[i] = 0;
- }
-
- for (i = 0; i < NWORDS_FIELD; i++) {
- for (j = 0; j < i; j++) {
- if (j < (i - p434_ZERO_WORDS + 1)) {
- MUL(mc[j], ((const digit_t *) p434p1)[i - j], UV + 1, UV[0]);
- ADDC(0, UV[0], v, carry, v);
- ADDC(carry, UV[1], u, carry, u);
- t += carry;
- }
- }
- ADDC(0, v, ma[i], carry, v);
- ADDC(carry, u, 0, carry, u);
- t += carry;
- mc[i] = v;
- v = u;
- u = t;
- t = 0;
- }
-
- for (i = NWORDS_FIELD; i < 2 * NWORDS_FIELD - 1; i++) {
- if (count > 0) {
- count -= 1;
- }
- for (j = i - NWORDS_FIELD + 1; j < NWORDS_FIELD; j++) {
- if (j < (NWORDS_FIELD - count)) {
- MUL(mc[j], ((const digit_t *) p434p1)[i - j], UV + 1, UV[0]);
- ADDC(0, UV[0], v, carry, v);
- ADDC(carry, UV[1], u, carry, u);
- t += carry;
- }
- }
- ADDC(0, v, ma[i], carry, v);
- ADDC(carry, u, 0, carry, u);
- t += carry;
- mc[i - NWORDS_FIELD] = v;
- v = u;
- u = t;
- t = 0;
- }
-
- /* `carry` isn't read after this, but it's still a necessary argument to the macro */
- /* cppcheck-suppress unreadVariable */
- ADDC(0, v, ma[2 * NWORDS_FIELD - 1], carry, v);
- mc[NWORDS_FIELD - 1] = v;
-}
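
fpadd434(), fpsub434(), and fpcorrection434() above all reduce with the same branch-free idiom: subtract the modulus unconditionally, stretch the resulting borrow into a 0-or-all-ones mask, then add the modulus back under that mask. A one-word toy version of the correction step (hypothetical helper with a small modulus, purely illustrative):

#include <assert.h>
#include <stdint.h>

/* Reduce x in [0, 2*p-1] to [0, p-1] without branching on x, mirroring the
 * subtract / mask / masked-add-back pattern of fpcorrection434() above.
 * With a single word, the multiprecision borrow collapses to x < p. */
static uint64_t correct(uint64_t x, uint64_t p) {
    uint64_t t = x - p;                      /* wraps around if x < p             */
    uint64_t mask = 0 - (uint64_t)(x < p);   /* 0 if x >= p, all ones otherwise   */
    return t + (p & mask);                   /* undo the subtraction only on wrap */
}

int main(void) {
    const uint64_t p = 1000003;
    assert(correct(5, p) == 5);              /* already reduced        */
    assert(correct(p, p) == 0);              /* exactly p maps to 0    */
    assert(correct(2 * p - 1, p) == p - 1);  /* top of the input range */
    return 0;
}
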
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fpx.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fpx.c
deleted file mode 100644
index e5b356b93b..0000000000
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fpx.c
+++ /dev/null
@@ -1,387 +0,0 @@
-/********************************************************************************************
-* SIDH: an efficient supersingular isogeny cryptography library
-*
-* Abstract: core functions over GF(p) and GF(p^2)
-*********************************************************************************************/
-
-// Conversion of GF(p^2) element from Montgomery to standard representation, and encoding by removing leading 0 bytes
-void fp2_encode(const f2elm_t *x, unsigned char *enc) {
- unsigned int i;
- f2elm_t t;
-
- from_fp2mont(x, &t);
- for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) {
- enc[i] = ((unsigned char *) t.e)[i];
- enc[i + FP2_ENCODED_BYTES / 2] = ((unsigned char *) t.e)[i + MAXBITS_FIELD / 8];
- }
-}
-
-// Parse byte sequence back into GF(p^2) element, and conversion to Montgomery representation
-void fp2_decode(const unsigned char *enc, f2elm_t *x) {
- unsigned int i;
-
- for (i = 0; i < 2 * (MAXBITS_FIELD / 8); i++)
- ((unsigned char *) x->e)[i] = 0;
- for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) {
- ((unsigned char *) x->e)[i] = enc[i];
- ((unsigned char *) x->e)[i + MAXBITS_FIELD / 8] = enc[i + FP2_ENCODED_BYTES / 2];
- }
- to_fp2mont(x, x);
-}
-
-// Copy a field element, c = a.
-__inline void fpcopy(const felm_t a, felm_t c) {
- unsigned int i;
-
- for (i = 0; i < NWORDS_FIELD; i++)
- c[i] = a[i];
-}
-
-// Zero a field element, a = 0.
-__inline void fpzero(felm_t a) {
- unsigned int i;
-
- for (i = 0; i < NWORDS_FIELD; i++)
- a[i] = 0;
-}
-
-// Conversion to Montgomery representation,
-// mc = a*R^2*R^(-1) mod p = a*R mod p, where a in [0, p-1].
-// The Montgomery constant R^2 mod p is the global value "Montgomery_R2".
-void to_mont(const felm_t a, felm_t mc) {
- fpmul_mont(a, (const digit_t *) &Montgomery_R2, mc);
-}
-
-// Conversion from Montgomery representation to standard representation,
-// c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1].
-void from_mont(const felm_t ma, felm_t c) {
- digit_t one[NWORDS_FIELD] = {0};
-
- one[0] = 1;
- fpmul_mont(ma, one, c);
- fpcorrection(c);
-}
-
-// Copy wordsize digits, c = a, where lng(a) = nwords.
-void copy_words(const digit_t *a, digit_t *c, const unsigned int nwords) {
- unsigned int i;
-
- for (i = 0; i < nwords; i++)
- c[i] = a[i];
-}
-
-// Field multiplication using Montgomery arithmetic, mc = ma*mb*R^(-1) mod p.
-void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc) {
- dfelm_t temp = {0};
-
- mp_mul(ma, mb, temp, NWORDS_FIELD);
- rdc_mont(temp, mc);
-}
-
-// Field squaring using Montgomery arithmetic, mc = ma^2*R^(-1) mod p.
-void fpsqr_mont(const felm_t ma, felm_t mc) {
- dfelm_t temp = {0};
-
- mp_mul(ma, ma, temp, NWORDS_FIELD);
- rdc_mont(temp, mc);
-}
-
-// Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p.
-void fpinv_mont(felm_t a) {
- felm_t tt;
-
- fpcopy(a, tt);
- fpinv_chain_mont(tt);
- fpsqr_mont(tt, tt);
- fpsqr_mont(tt, tt);
- fpmul_mont(a, tt, a);
-}
-
-// Copy a GF(p^2) element, c = a.
-void fp2copy(const f2elm_t *a, f2elm_t *c) {
- fpcopy(a->e[0], c->e[0]);
- fpcopy(a->e[1], c->e[1]);
-}
-
-// Zero a GF(p^2) element, a = 0.
-void fp2zero(f2elm_t *a) {
- fpzero(a->e[0]);
- fpzero(a->e[1]);
-}
-
-// GF(p^2) negation, a = -a in GF(p^2).
-void fp2neg(f2elm_t *a) {
- fpneg(a->e[0]);
- fpneg(a->e[1]);
-}
-
-// GF(p^2) addition, c = a+b in GF(p^2).
-__inline void fp2add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) {
- fpadd(a->e[0], b->e[0], c->e[0]);
- fpadd(a->e[1], b->e[1], c->e[1]);
-}
-
-// GF(p^2) subtraction, c = a-b in GF(p^2).
-__inline void fp2sub(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) {
- fpsub(a->e[0], b->e[0], c->e[0]);
- fpsub(a->e[1], b->e[1], c->e[1]);
-}
-
-// GF(p^2) division by two, c = a/2 in GF(p^2).
-void fp2div2(const f2elm_t *a, f2elm_t *c) {
- fpdiv2(a->e[0], c->e[0]);
- fpdiv2(a->e[1], c->e[1]);
-}
-
-// Modular correction, a = a in GF(p^2).
-void fp2correction(f2elm_t *a) {
- fpcorrection(a->e[0]);
- fpcorrection(a->e[1]);
-}
-
-// Multiprecision addition, c = a+b.
-__inline static void mp_addfast(const digit_t *a, const digit_t *b, digit_t *c) {
-#if defined(S2N_SIKEP434R2_ASM)
- if (s2n_sikep434r2_asm_is_enabled()) {
- mp_add_asm(a, b, c);
- return;
- }
-#endif
-
- mp_add(a, b, c, NWORDS_FIELD);
-}
-
-// GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2).
-// Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1]
-// Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1]
-void fp2sqr_mont(const f2elm_t *a, f2elm_t *c) {
- felm_t t1, t2, t3;
-
- mp_addfast(a->e[0], a->e[1], t1); // t1 = a0+a1
- fpsub(a->e[0], a->e[1], t2); // t2 = a0-a1
- mp_addfast(a->e[0], a->e[0], t3); // t3 = 2a0
- fpmul_mont(t1, t2, c->e[0]); // c0 = (a0+a1)(a0-a1)
- fpmul_mont(t3, a->e[1], c->e[1]); // c1 = 2a0*a1
-}
-
-// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit.
-unsigned int mp_sub(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) {
- unsigned int i, borrow = 0;
-
- for (i = 0; i < nwords; i++)
- SUBC(borrow, a[i], b[i], borrow, c[i]);
-
- return borrow;
-}
-
-// Multiprecision subtraction followed by addition with p*2^MAXBITS_FIELD, c = a-b+(p*2^MAXBITS_FIELD) if a-b < 0, otherwise c=a-b.
-__inline static void mp_subaddfast(const digit_t *a, const digit_t *b, digit_t *c) {
-#if defined(S2N_SIKEP434R2_ASM)
- if (s2n_sikep434r2_asm_is_enabled()) {
- mp_subaddx2_asm(a, b, c);
- return;
- }
-#endif
-
- felm_t t1;
-
- digit_t mask = 0 - (digit_t) mp_sub(a, b, c, 2 * NWORDS_FIELD);
- for (int i = 0; i < NWORDS_FIELD; i++)
- t1[i] = ((const digit_t *) PRIME)[i] & mask;
- mp_addfast((digit_t *) &c[NWORDS_FIELD], t1, (digit_t *) &c[NWORDS_FIELD]);
-}
-
-// Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD.
-__inline static void mp_dblsubfast(const digit_t *a, const digit_t *b, digit_t *c) {
-#if defined(S2N_SIKEP434R2_ASM)
- if (s2n_sikep434r2_asm_is_enabled()) {
- mp_dblsubx2_asm(a, b, c);
- return;
- }
-#endif
-
- mp_sub(c, a, c, 2 * NWORDS_FIELD);
- mp_sub(c, b, c, 2 * NWORDS_FIELD);
-}
-
-// GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2).
-// Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1]
-// Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1]
-void fp2mul_mont(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) {
- felm_t t1, t2;
- dfelm_t tt1, tt2, tt3;
-
- mp_addfast(a->e[0], a->e[1], t1); // t1 = a0+a1
- mp_addfast(b->e[0], b->e[1], t2); // t2 = b0+b1
- mp_mul(a->e[0], b->e[0], tt1, NWORDS_FIELD); // tt1 = a0*b0
- mp_mul(a->e[1], b->e[1], tt2, NWORDS_FIELD); // tt2 = a1*b1
- mp_mul(t1, t2, tt3, NWORDS_FIELD); // tt3 = (a0+a1)*(b0+b1)
- mp_dblsubfast(tt1, tt2, tt3); // tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1
- mp_subaddfast(tt1, tt2, tt1); // tt1 = a0*b0 - a1*b1 + p*2^MAXBITS_FIELD if a0*b0 - a1*b1 < 0, else tt1 = a0*b0 - a1*b1
- rdc_mont(tt3, c->e[1]); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1
- rdc_mont(tt1, c->e[0]); // c[0] = a0*b0 - a1*b1
-}
-
-// Chain to compute a^(p-3)/4 using Montgomery arithmetic.
-void fpinv_chain_mont(felm_t a) {
- unsigned int i, j;
-
- felm_t t[31], tt;
-
- // Precomputed table
- fpsqr_mont(a, tt);
- fpmul_mont(a, tt, t[0]);
- for (i = 0; i <= 29; i++)
- fpmul_mont(t[i], tt, t[i + 1]);
-
- fpcopy(a, tt);
- for (i = 0; i < 7; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[5], tt, tt);
- for (i = 0; i < 10; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[14], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[3], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[23], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[13], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[24], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[7], tt, tt);
- for (i = 0; i < 8; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[12], tt, tt);
- for (i = 0; i < 8; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[30], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[1], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[30], tt, tt);
- for (i = 0; i < 7; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[21], tt, tt);
- for (i = 0; i < 9; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[2], tt, tt);
- for (i = 0; i < 9; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[19], tt, tt);
- for (i = 0; i < 9; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[1], tt, tt);
- for (i = 0; i < 7; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[24], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[26], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[16], tt, tt);
- for (i = 0; i < 7; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[10], tt, tt);
- for (i = 0; i < 7; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[6], tt, tt);
- for (i = 0; i < 7; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[0], tt, tt);
- for (i = 0; i < 9; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[20], tt, tt);
- for (i = 0; i < 8; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[9], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[25], tt, tt);
- for (i = 0; i < 9; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[30], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[26], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(a, tt, tt);
- for (i = 0; i < 7; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[28], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[6], tt, tt);
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[10], tt, tt);
- for (i = 0; i < 9; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[22], tt, tt);
- for (j = 0; j < 35; j++) {
- for (i = 0; i < 6; i++)
- fpsqr_mont(tt, tt);
- fpmul_mont(t[30], tt, tt);
- }
- fpcopy(tt, a);
-}
-
-// GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2).
-void fp2inv_mont(f2elm_t *a) {
- f2elm_t t1;
-
- fpsqr_mont(a->e[0], t1.e[0]); // t10 = a0^2
- fpsqr_mont(a->e[1], t1.e[1]); // t11 = a1^2
- fpadd(t1.e[0], t1.e[1], t1.e[0]); // t10 = a0^2+a1^2
- fpinv_mont(t1.e[0]); // t10 = (a0^2+a1^2)^-1
- fpneg(a->e[1]); // a = a0-i*a1
- fpmul_mont(a->e[0], t1.e[0], a->e[0]);
- fpmul_mont(a->e[1], t1.e[0], a->e[1]); // a = (a0-i*a1)*(a0^2+a1^2)^-1
-}
-
-// Conversion of a GF(p^2) element to Montgomery representation,
-// mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2).
-void to_fp2mont(const f2elm_t *a, f2elm_t *mc) {
- to_mont(a->e[0], mc->e[0]);
- to_mont(a->e[1], mc->e[1]);
-}
-
-// Conversion of a GF(p^2) element from Montgomery representation to standard representation,
-// c_i = ma_i*R^(-1) = a_i in GF(p^2).
-void from_fp2mont(const f2elm_t *ma, f2elm_t *c) {
- from_mont(ma->e[0], c->e[0]);
- from_mont(ma->e[1], c->e[1]);
-}
-
-// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit.
-unsigned int mp_add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) {
- unsigned int i, carry = 0;
-
- for (i = 0; i < nwords; i++) {
- /* cppcheck-suppress shiftTooManyBits */
- /* cppcheck-suppress unmatchedSuppression */
- ADDC(carry, a[i], b[i], carry, c[i]);
- }
-
- return carry;
-}
-
-// Multiprecision right shift by one.
-void mp_shiftr1(digit_t *x, const unsigned int nwords) {
- unsigned int i;
-
- for (i = 0; i < nwords - 1; i++) {
- SHIFTR(x[i + 1], x[i], 1, x[i], RADIX);
- }
- x[nwords - 1] >>= 1;
-}
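
fp2mul_mont() above implements the classic three-multiplication GF(p^2) product: with i^2 = -1, the real part is a0*b0 - a1*b1 and the imaginary part is (a0+a1)(b0+b1) - a0*b0 - a1*b1, so three big multiplications replace four. An integer-only sketch of the identity with the same scheduling (no modular reduction, illustrative only):

#include <assert.h>
#include <stdint.h>

/* (a0 + a1*i) * (b0 + b1*i) over the integers with i^2 = -1, using three
 * products in the same order fp2mul_mont() above schedules them. */
static void fp2mul_3m(int64_t a0, int64_t a1, int64_t b0, int64_t b1,
                      int64_t *c0, int64_t *c1) {
    int64_t t1  = a0 + a1;      /* mp_addfast(a0, a1, t1)        */
    int64_t t2  = b0 + b1;      /* mp_addfast(b0, b1, t2)        */
    int64_t tt1 = a0 * b0;      /* mp_mul: product 1             */
    int64_t tt2 = a1 * b1;      /* mp_mul: product 2             */
    int64_t tt3 = t1 * t2;      /* mp_mul: product 3             */
    *c1 = tt3 - tt1 - tt2;      /* mp_dblsubfast: imaginary part */
    *c0 = tt1 - tt2;            /* mp_subaddfast, minus the mod  */
}

int main(void) {
    int64_t c0, c1;
    fp2mul_3m(3, 4, 5, 6, &c0, &c1);
    /* Schoolbook check: real = 3*5 - 4*6 = -9, imag = 3*6 + 4*5 = 38. */
    assert(c0 == -9 && c1 == 38);
    return 0;
}
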
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sidh.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sidh.c
deleted file mode 100644
index d3fdbe722c..0000000000
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sidh.c
+++ /dev/null
@@ -1,286 +0,0 @@
-/********************************************************************************************
-* SIDH: an efficient supersingular isogeny cryptography library
-*
-* Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH)
-*********************************************************************************************/
-
-#include "../s2n_pq_random.h"
-#include "utils/s2n_safety.h"
-
-static void init_basis(const digit_t *gen, f2elm_t *XP, f2elm_t *XQ, f2elm_t *XR) { // Initialization of basis points
-
- fpcopy(gen, XP->e[0]);
- fpcopy(gen + NWORDS_FIELD, XP->e[1]);
- fpcopy(gen + 2 * NWORDS_FIELD, XQ->e[0]);
- fpcopy(gen + 3 * NWORDS_FIELD, XQ->e[1]);
- fpcopy(gen + 4 * NWORDS_FIELD, XR->e[0]);
- fpcopy(gen + 5 * NWORDS_FIELD, XR->e[1]);
-}
-
-int random_mod_order_A(unsigned char *random_digits) { // Generation of Alice's secret key
- // Outputs random value in [0, 2^eA - 1]
- GUARD_AS_POSIX(s2n_get_random_bytes(random_digits, SECRETKEY_A_BYTES));
- random_digits[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; // Masking last byte
- return S2N_SUCCESS;
-}
-
-int random_mod_order_B(unsigned char *random_digits) { // Generation of Bob's secret key
- // Outputs random value in [0, 2^Floor(Log(2, oB)) - 1]
- GUARD_AS_POSIX(s2n_get_random_bytes(random_digits, SECRETKEY_B_BYTES));
- random_digits[SECRETKEY_B_BYTES - 1] &= MASK_BOB; // Masking last byte
- return S2N_SUCCESS;
-}
-
-int EphemeralKeyGeneration_A(const digit_t *PrivateKeyA, unsigned char *PublicKeyA) { // Alice's ephemeral public key generation
- // Input: a private key PrivateKeyA in the range [0, 2^eA - 1].
- // Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes.
- point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_ALICE];
- f2elm_t _XPA, _XQA, _XRA, coeff[3], _A24plus = {0}, _C24 = {0}, _A = {0};
- f2elm_t *XPA=&_XPA, *XQA=&_XQA, *XRA=&_XRA, *A24plus=&_A24plus, *C24=&_C24, *A=&_A;
- unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0;
-
- // Initialize basis points
- init_basis((const digit_t *) A_gen, XPA, XQA, XRA);
- init_basis((const digit_t *) B_gen, &phiP->X, &phiQ->X, &phiR->X);
- fpcopy((const digit_t *) &Montgomery_one, (phiP->Z.e)[0]);
- fpcopy((const digit_t *) &Montgomery_one, (phiQ->Z.e)[0]);
- fpcopy((const digit_t *) &Montgomery_one, (phiR->Z.e)[0]);
-
- // Initialize constants: A24plus = A+2C, C24 = 4C, where A=6, C=1
- fpcopy((const digit_t *) &Montgomery_one, A24plus->e[0]);
- fp2add(A24plus, A24plus, A24plus);
- fp2add(A24plus, A24plus, C24);
- fp2add(A24plus, C24, A);
- fp2add(C24, C24, A24plus);
-
- // Retrieve kernel point
- LADDER3PT(XPA, XQA, XRA, PrivateKeyA, ALICE, R, A);
-
- // Traverse tree
- index = 0;
- for (row = 1; row < MAX_Alice; row++) {
- while (index < MAX_Alice - row) {
- fp2copy(&R->X, &pts[npts]->X);
- fp2copy(&R->Z, &pts[npts]->Z);
- pts_index[npts++] = index;
- m = strat_Alice[ii++];
- xDBLe(R, R, A24plus, C24, (int) (2 * m));
- index += m;
- }
- get_4_isog(R, A24plus, C24, coeff);
-
- for (i = 0; i < npts; i++) {
- eval_4_isog(pts[i], coeff);
- }
- eval_4_isog(phiP, coeff);
- eval_4_isog(phiQ, coeff);
- eval_4_isog(phiR, coeff);
-
- fp2copy(&pts[npts - 1]->X, &R->X);
- fp2copy(&pts[npts - 1]->Z, &R->Z);
- index = pts_index[npts - 1];
- npts -= 1;
- }
-
- get_4_isog(R, A24plus, C24, coeff);
- eval_4_isog(phiP, coeff);
- eval_4_isog(phiQ, coeff);
- eval_4_isog(phiR, coeff);
-
- inv_3_way(&phiP->Z, &phiQ->Z, &phiR->Z);
- fp2mul_mont(&phiP->X, &phiP->Z, &phiP->X);
- fp2mul_mont(&phiQ->X, &phiQ->Z, &phiQ->X);
- fp2mul_mont(&phiR->X, &phiR->Z, &phiR->X);
-
- // Format public key
- fp2_encode(&phiP->X, PublicKeyA);
- fp2_encode(&phiQ->X, PublicKeyA + FP2_ENCODED_BYTES);
- fp2_encode(&phiR->X, PublicKeyA + 2 * FP2_ENCODED_BYTES);
-
- return 0;
-}
-
-int EphemeralKeyGeneration_B(const digit_t *PrivateKeyB, unsigned char *PublicKeyB) { // Bob's ephemeral public key generation
- // Input: a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1].
- // Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes.
- point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_BOB];
- f2elm_t _XPB, _XQB, _XRB, coeff[3], _A24plus = {0}, _A24minus = {0}, _A = {0};
- f2elm_t *XPB=&_XPB, *XQB=&_XQB, *XRB=&_XRB, *A24plus=&_A24plus, *A24minus=&_A24minus, *A=&_A;
- unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0;
-
- // Initialize basis points
- init_basis((const digit_t *) B_gen, XPB, XQB, XRB);
- init_basis((const digit_t *) A_gen, &phiP->X, &phiQ->X, &phiR->X);
- fpcopy((const digit_t *) &Montgomery_one, (phiP->Z.e)[0]);
- fpcopy((const digit_t *) &Montgomery_one, (phiQ->Z.e)[0]);
- fpcopy((const digit_t *) &Montgomery_one, (phiR->Z.e)[0]);
-
- // Initialize constants: A24minus = A-2C, A24plus = A+2C, where A=6, C=1
- fpcopy((const digit_t *) &Montgomery_one, A24plus->e[0]);
- fp2add(A24plus, A24plus, A24plus);
- fp2add(A24plus, A24plus, A24minus);
- fp2add(A24plus, A24minus, A);
- fp2add(A24minus, A24minus, A24plus);
-
- // Retrieve kernel point
- LADDER3PT(XPB, XQB, XRB, PrivateKeyB, BOB, R, A);
-
- // Traverse tree
- index = 0;
- for (row = 1; row < MAX_Bob; row++) {
- while (index < MAX_Bob - row) {
- fp2copy(&R->X, &pts[npts]->X);
- fp2copy(&R->Z, &pts[npts]->Z);
- pts_index[npts++] = index;
- m = strat_Bob[ii++];
- xTPLe(R, R, A24minus, A24plus, (int) m);
- index += m;
- }
- get_3_isog(R, A24minus, A24plus, coeff);
-
- for (i = 0; i < npts; i++) {
- eval_3_isog(pts[i], coeff);
- }
- eval_3_isog(phiP, coeff);
- eval_3_isog(phiQ, coeff);
- eval_3_isog(phiR, coeff);
-
- fp2copy(&pts[npts - 1]->X, &R->X);
- fp2copy(&pts[npts - 1]->Z, &R->Z);
- index = pts_index[npts - 1];
- npts -= 1;
- }
-
- get_3_isog(R, A24minus, A24plus, coeff);
- eval_3_isog(phiP, coeff);
- eval_3_isog(phiQ, coeff);
- eval_3_isog(phiR, coeff);
-
- inv_3_way(&phiP->Z, &phiQ->Z, &phiR->Z);
- fp2mul_mont(&phiP->X, &phiP->Z, &phiP->X);
- fp2mul_mont(&phiQ->X, &phiQ->Z, &phiQ->X);
- fp2mul_mont(&phiR->X, &phiR->Z, &phiR->X);
-
- // Format public key
- fp2_encode(&phiP->X, PublicKeyB);
- fp2_encode(&phiQ->X, PublicKeyB + FP2_ENCODED_BYTES);
- fp2_encode(&phiR->X, PublicKeyB + 2 * FP2_ENCODED_BYTES);
-
- return 0;
-}
-
-int EphemeralSecretAgreement_A(const digit_t *PrivateKeyA, const unsigned char *PublicKeyB, unsigned char *SharedSecretA) { // Alice's ephemeral shared secret computation
- // It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB
- // Inputs: Alice's PrivateKeyA is an integer in the range [0, oA-1].
- // Bob's PublicKeyB consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes.
- // Output: a shared secret SharedSecretA that consists of one element in GF(p^2) encoded by removing leading 0 bytes.
- point_proj_t R, pts[MAX_INT_POINTS_ALICE];
- f2elm_t coeff[3], PKB[3], _jinv;
- f2elm_t _A24plus = {0}, _C24 = {0}, _A = {0};
- f2elm_t *jinv=&_jinv, *A24plus=&_A24plus, *C24=&_C24, *A=&_A;
- unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0;
-
- // Initialize images of Bob's basis
- fp2_decode(PublicKeyB, &PKB[0]);
- fp2_decode(PublicKeyB + FP2_ENCODED_BYTES, &PKB[1]);
- fp2_decode(PublicKeyB + 2 * FP2_ENCODED_BYTES, &PKB[2]);
-
- // Initialize constants: A24plus = A+2C, C24 = 4C, where C=1
- get_A(&PKB[0], &PKB[1], &PKB[2], A);
- fpadd((const digit_t *) &Montgomery_one, (const digit_t *) &Montgomery_one, C24->e[0]);
- fp2add(A, C24, A24plus);
- fpadd(C24->e[0], C24->e[0], C24->e[0]);
-
- // Retrieve kernel point
- LADDER3PT(&PKB[0], &PKB[1], &PKB[2], PrivateKeyA, ALICE, R, A);
-
- // Traverse tree
- index = 0;
- for (row = 1; row < MAX_Alice; row++) {
- while (index < MAX_Alice - row) {
- fp2copy(&R->X, &pts[npts]->X);
- fp2copy(&R->Z, &pts[npts]->Z);
- pts_index[npts++] = index;
- m = strat_Alice[ii++];
- xDBLe(R, R, A24plus, C24, (int) (2 * m));
- index += m;
- }
- get_4_isog(R, A24plus, C24, coeff);
-
- for (i = 0; i < npts; i++) {
- eval_4_isog(pts[i], coeff);
- }
-
- fp2copy(&pts[npts - 1]->X, &R->X);
- fp2copy(&pts[npts - 1]->Z, &R->Z);
- index = pts_index[npts - 1];
- npts -= 1;
- }
-
- get_4_isog(R, A24plus, C24, coeff);
- fp2add(A24plus, A24plus, A24plus);
- fp2sub(A24plus, C24, A24plus);
- fp2add(A24plus, A24plus, A24plus);
- j_inv(A24plus, C24, jinv);
- fp2_encode(jinv, SharedSecretA); // Format shared secret
-
- return 0;
-}
-
-int EphemeralSecretAgreement_B(const digit_t *PrivateKeyB, const unsigned char *PublicKeyA, unsigned char *SharedSecretB) { // Bob's ephemeral shared secret computation
- // It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA
- // Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,oB)) - 1].
- // Alice's PublicKeyA consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes.
- // Output: a shared secret SharedSecretB that consists of one element in GF(p^2) encoded by removing leading 0 bytes.
- point_proj_t R, pts[MAX_INT_POINTS_BOB];
- f2elm_t coeff[3], PKB[3], _jinv;
- f2elm_t _A24plus = {0}, _A24minus = {0}, _A = {0};
- f2elm_t *jinv=&_jinv, *A24plus=&_A24plus, *A24minus=&_A24minus, *A=&_A;
- unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0;
-
- // Initialize images of Alice's basis
- fp2_decode(PublicKeyA, &PKB[0]);
- fp2_decode(PublicKeyA + FP2_ENCODED_BYTES, &PKB[1]);
- fp2_decode(PublicKeyA + 2 * FP2_ENCODED_BYTES, &PKB[2]);
-
- // Initialize constants: A24plus = A+2C, A24minus = A-2C, where C=1
- get_A(&PKB[0], &PKB[1], &PKB[2], A);
- fpadd((const digit_t *) &Montgomery_one, (const digit_t *) &Montgomery_one, A24minus->e[0]);
- fp2add(A, A24minus, A24plus);
- fp2sub(A, A24minus, A24minus);
-
- // Retrieve kernel point
- LADDER3PT(&PKB[0], &PKB[1], &PKB[2], PrivateKeyB, BOB, R, A);
-
- // Traverse tree
- index = 0;
- for (row = 1; row < MAX_Bob; row++) {
- while (index < MAX_Bob - row) {
- fp2copy(&R->X, &pts[npts]->X);
- fp2copy(&R->Z, &pts[npts]->Z);
- pts_index[npts++] = index;
- m = strat_Bob[ii++];
- xTPLe(R, R, A24minus, A24plus, (int) m);
- index += m;
- }
- get_3_isog(R, A24minus, A24plus, coeff);
-
- for (i = 0; i < npts; i++) {
- eval_3_isog(pts[i], coeff);
- }
-
- fp2copy(&pts[npts - 1]->X, &R->X);
- fp2copy(&pts[npts - 1]->Z, &R->Z);
- index = pts_index[npts - 1];
- npts -= 1;
- }
-
- get_3_isog(R, A24minus, A24plus, coeff);
- fp2add(A24plus, A24minus, A);
- fp2add(A, A, A);
- fp2sub(A24plus, A24minus, A24plus);
- j_inv(A, A24plus, jinv);
- fp2_encode(jinv, SharedSecretB); // Format shared secret
-
- return 0;
-}
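
The four routines above share one traversal pattern: walk down the isogeny tree with
repeated doublings (xDBLe) or triplings (xTPLe), stashing intermediate points at the
split positions dictated by the strategy arrays strat_Alice/strat_Bob, then compute an
isogeny at each leaf, push every stashed point through it, and pop. A minimal
stand-alone C sketch of that walk, with curve points reduced to their tree indices
(MAX_ROWS and strat[] are toy values, not the s2n constants):

#include <stdio.h>

#define MAX_ROWS 8                       /* toy tree depth; the real code uses MAX_Alice / MAX_Bob */

int main(void)
{
    /* Toy strategy of length MAX_ROWS-1; the real arrays are strat_Alice / strat_Bob. */
    static const unsigned strat[MAX_ROWS - 1] = {4, 2, 1, 1, 2, 1, 1};
    unsigned pts_index[MAX_ROWS];        /* the real code also stashes the point R itself */
    unsigned npts = 0, ii = 0, index = 0;

    for (unsigned row = 1; row < MAX_ROWS; row++) {
        while (index < MAX_ROWS - row) {
            pts_index[npts++] = index;   /* push the current node */
            unsigned m = strat[ii++];
            index += m;                  /* stands in for m x-only doublings/triplings of R */
        }
        printf("row %u: isogeny at leaf %u, evaluated on %u stashed points\n",
               row, index, npts);
        index = pts_index[--npts];       /* pop the most recent node */
    }
    return 0;
}

The strategy array encodes where each subtree is split so that the combined cost of
point multiplications and isogeny evaluations stays near optimal.
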
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sike_r2_kem.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sike_r2_kem.c
deleted file mode 100644
index 7768ad3650..0000000000
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sike_r2_kem.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/********************************************************************************************
-* SIDH: an efficient supersingular isogeny cryptography library
-*
-* Abstract: supersingular isogeny key encapsulation (SIKE) protocol
-*********************************************************************************************/
-
-#include <string.h>
-#include "../s2n_pq_random.h"
-#include "fips202.h"
-#include "utils/s2n_safety.h"
-#include "tls/s2n_kem.h"
-#include "pq-crypto/s2n_pq.h"
-
-int SIKE_P434_r2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
- // SIKE's key generation
- // Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes)
- // public key pk (CRYPTO_PUBLICKEYBYTES bytes)
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
-
- digit_t _sk[(SECRETKEY_B_BYTES / sizeof(digit_t)) + 1];
-
- // Generate lower portion of secret key sk <- s||SK
- GUARD_AS_POSIX(s2n_get_random_bytes(sk, MSG_BYTES));
- GUARD(random_mod_order_B((unsigned char *)_sk));
-
- // Generate public key pk
- EphemeralKeyGeneration_B(_sk, pk);
-
- memcpy(sk + MSG_BYTES, _sk, SECRETKEY_B_BYTES);
-
- // Append public key pk to secret key sk
- memcpy(&sk[MSG_BYTES + SECRETKEY_B_BYTES], pk, CRYPTO_PUBLICKEYBYTES);
-
- return 0;
-}
-
-int SIKE_P434_r2_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) {
- // SIKE's encapsulation
- // Input: public key pk (CRYPTO_PUBLICKEYBYTES bytes)
- // Outputs: shared secret ss (CRYPTO_BYTES bytes)
- // ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes)
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
-
- union {
- unsigned char b[SECRETKEY_A_BYTES];
- digit_t d[SECRETKEY_A_BYTES/sizeof(digit_t)];
- } ephemeralsk;
- unsigned char jinvariant[FP2_ENCODED_BYTES];
- unsigned char h[MSG_BYTES];
- unsigned char temp[CRYPTO_CIPHERTEXTBYTES + MSG_BYTES];
-
- // Generate ephemeralsk <- G(m||pk) mod oA
- GUARD_AS_POSIX(s2n_get_random_bytes(temp, MSG_BYTES));
- memcpy(&temp[MSG_BYTES], pk, CRYPTO_PUBLICKEYBYTES);
- shake256(ephemeralsk.b, SECRETKEY_A_BYTES, temp, CRYPTO_PUBLICKEYBYTES + MSG_BYTES);
-
- /* ephemeralsk is a union; the memory set here through .b will get accessed through the .d member later */
- /* cppcheck-suppress unreadVariable */
- /* cppcheck-suppress unmatchedSuppression */
- ephemeralsk.b[SECRETKEY_A_BYTES - 1] &= MASK_ALICE;
-
- // Encrypt
- EphemeralKeyGeneration_A(ephemeralsk.d, ct);
- EphemeralSecretAgreement_A(ephemeralsk.d, pk, jinvariant);
- shake256(h, MSG_BYTES, jinvariant, FP2_ENCODED_BYTES);
- for (int i = 0; i < MSG_BYTES; i++) {
- ct[i + CRYPTO_PUBLICKEYBYTES] = temp[i] ^ h[i];
- }
- // Generate shared secret ss <- H(m||ct)
- memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES);
- shake256(ss, CRYPTO_BYTES, temp, CRYPTO_CIPHERTEXTBYTES + MSG_BYTES);
-
- return 0;
-}
-
-int SIKE_P434_r2_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) {
- // SIKE's decapsulation
- // Input: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes)
- // ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes)
- // Outputs: shared secret ss (CRYPTO_BYTES bytes)
- ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
-
- union {
- unsigned char b[SECRETKEY_A_BYTES];
- digit_t d[SECRETKEY_A_BYTES/sizeof(digit_t)];
- } ephemeralsk_;
- unsigned char jinvariant_[FP2_ENCODED_BYTES];
- unsigned char h_[MSG_BYTES];
- unsigned char c0_[CRYPTO_PUBLICKEYBYTES];
- unsigned char temp[CRYPTO_CIPHERTEXTBYTES + MSG_BYTES];
-
- digit_t _sk[(SECRETKEY_B_BYTES / sizeof(digit_t)) + 1];
- memcpy(_sk, sk + MSG_BYTES, SECRETKEY_B_BYTES);
-
- // Decrypt
- EphemeralSecretAgreement_B(_sk, ct, jinvariant_);
- shake256(h_, MSG_BYTES, jinvariant_, FP2_ENCODED_BYTES);
- for (int i = 0; i < MSG_BYTES; i++) {
- temp[i] = ct[i + CRYPTO_PUBLICKEYBYTES] ^ h_[i];
- }
- // Generate ephemeralsk_ <- G(m||pk) mod oA
- memcpy(&temp[MSG_BYTES], &sk[MSG_BYTES + SECRETKEY_B_BYTES], CRYPTO_PUBLICKEYBYTES);
- shake256(ephemeralsk_.b, SECRETKEY_A_BYTES, temp, CRYPTO_PUBLICKEYBYTES + MSG_BYTES);
-
- /* ephemeralsk_ is a union; the memory set here through .b will get accessed through the .d member later */
- /* cppcheck-suppress unreadVariable */
- /* cppcheck-suppress uninitvar */
- /* cppcheck-suppress unmatchedSuppression */
- ephemeralsk_.b[SECRETKEY_A_BYTES - 1] &= MASK_ALICE;
-
- // Generate shared secret ss <- H(m||ct) or output ss <- H(s||ct)
- EphemeralKeyGeneration_A(ephemeralsk_.d, c0_);
- if (memcmp(c0_, ct, CRYPTO_PUBLICKEYBYTES) != 0) {
- memcpy(temp, sk, MSG_BYTES);
- }
- memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES);
- shake256(ss, CRYPTO_BYTES, temp, CRYPTO_CIPHERTEXTBYTES + MSG_BYTES);
-
- return 0;
-}
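
SIKE_P434_r2_crypto_kem_dec above follows the re-encryption check of a
Fujisaki-Okamoto-style transform: decrypt, re-derive the encapsulation randomness with
shake256, re-encrypt, and on mismatch hash the secret value s (the first MSG_BYTES of
sk) in place of the decrypted message, so malformed ciphertexts are rejected implicitly
rather than through an error path. A toy, self-contained C illustration of that branch
structure (toy_hash is an FNV-1a stand-in for SHAKE256, and the 4-byte buffers stand in
for the real key and ciphertext sizes):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Illustration-only hash; the code above uses SHAKE256. */
static void toy_hash(const uint8_t *in, size_t len, uint8_t out[4])
{
    uint32_t h = 2166136261u;                       /* FNV-1a */
    for (size_t i = 0; i < len; i++) { h ^= in[i]; h *= 16777619u; }
    memcpy(out, &h, sizeof(h));
}

int main(void)
{
    uint8_t m[4]     = {1, 2, 3, 4};                /* decrypted message m' */
    uint8_t s[4]     = {9, 9, 9, 9};                /* random value s stored in sk */
    uint8_t ct[4]    = {7, 7, 7, 7};                /* received ciphertext */
    uint8_t ct_re[4] = {7, 7, 7, 7};                /* re-encryption of m' */
    uint8_t buf[8], ss[4];

    int ok = (memcmp(ct_re, ct, sizeof(ct)) == 0);  /* the code above also compares with memcmp */
    memcpy(buf, ok ? m : s, sizeof(m));             /* implicit rejection: substitute s on mismatch */
    memcpy(buf + sizeof(m), ct, sizeof(ct));
    toy_hash(buf, sizeof(buf), ss);                 /* ss = H(m'||ct) or ss = H(s||ct) */
    printf("ss[0] = %02x (%s)\n", ss[0], ok ? "accepted" : "implicitly rejected");
    return 0;
}
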
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sikep434r2_fp_x64_asm.S b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sikep434r2_fp_x64_asm.S
deleted file mode 100644
index 831fc1b7fb..0000000000
--- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sikep434r2_fp_x64_asm.S
+++ /dev/null
@@ -1,962 +0,0 @@
-//*******************************************************************************************
-// SIDH: an efficient supersingular isogeny cryptography library
-//
-// Abstract: field arithmetic in x64 assembly for P434 on Linux
-//*******************************************************************************************
-
-.intel_syntax noprefix
-
-/* Requires bmi2 instruction set for mulx. adx instructions are optional, but preferred. */
-
-// Registers that are used for parameter passing:
-#define reg_p1 rdi
-#define reg_p2 rsi
-#define reg_p3 rdx
-
-// Define addition instructions
-#ifdef S2N_ADX
-
-#define ADD1 adox
-#define ADC1 adox
-#define ADD2 adcx
-#define ADC2 adcx
-
-#else // S2N_ADX
-
-#define ADD1 add
-#define ADC1 adc
-#define ADD2 add
-#define ADC2 adc
-
-#endif // S2N_ADX
-
-// The constants below (asm_p434, asm_p434p1, and asm_p434x2) are duplicated from
-// P434.c, and correspond to the arrays p434, p434p1, and p434x2. The values are
-// identical; they are just represented here as standard (base 10) ints, instead
-// of hex. If, for any reason, the constants are changed in one file, they should be
-// updated in the other file as well.
-
-.text
-.align 32
-.type asm_p434, @object
-.size asm_p434, 56
-asm_p434:
- .quad -1
- .quad -1
- .quad -1
- .quad -161717841442111489
- .quad 8918917783347572387
- .quad 7853257225132122198
- .quad 620258357900100
-.align 32
-.type asm_p434p1, @object
-.size asm_p434p1, 56
-asm_p434p1:
- .quad 0
- .quad 0
- .quad 0
- .quad -161717841442111488
- .quad 8918917783347572387
- .quad 7853257225132122198
- .quad 620258357900100
-.align 32
-.type asm_p434x2, @object
-.size asm_p434x2, 56
-asm_p434x2:
- .quad -2
- .quad -1
- .quad -1
- .quad -323435682884222977
- .quad -608908507014406841
- .quad -2740229623445307220
- .quad 1240516715800200
-
-//***********************************************************************
-// Field addition
-// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
-//***********************************************************************
-.global fpadd434_asm
-fpadd434_asm:
- push r12
- push r13
- push r14
- push r15
- push rbx
- push rbp
-
- xor rax, rax
- mov r8, [reg_p1]
- mov r9, [reg_p1+8]
- mov r10, [reg_p1+16]
- mov r11, [reg_p1+24]
- mov r12, [reg_p1+32]
- mov r13, [reg_p1+40]
- mov r14, [reg_p1+48]
- add r8, [reg_p2]
- adc r9, [reg_p2+8]
- adc r10, [reg_p2+16]
- adc r11, [reg_p2+24]
- adc r12, [reg_p2+32]
- adc r13, [reg_p2+40]
- adc r14, [reg_p2+48]
-
- mov rbx, [rip+asm_p434x2]
- sub r8, rbx
- mov rcx, [rip+asm_p434x2+8]
- sbb r9, rcx
- sbb r10, rcx
- mov rdi, [rip+asm_p434x2+24]
- sbb r11, rdi
- mov rsi, [rip+asm_p434x2+32]
- sbb r12, rsi
- mov rbp, [rip+asm_p434x2+40]
- sbb r13, rbp
- mov r15, [rip+asm_p434x2+48]
- sbb r14, r15
- sbb rax, 0
-
- and rbx, rax
- and rcx, rax
- and rdi, rax
- and rsi, rax
- and rbp, rax
- and r15, rax
-
- add r8, rbx
- adc r9, rcx
- adc r10, rcx
- adc r11, rdi
- adc r12, rsi
- adc r13, rbp
- adc r14, r15
- mov [reg_p3], r8
- mov [reg_p3+8], r9
- mov [reg_p3+16], r10
- mov [reg_p3+24], r11
- mov [reg_p3+32], r12
- mov [reg_p3+40], r13
- mov [reg_p3+48], r14
-
- pop rbp
- pop rbx
- pop r15
- pop r14
- pop r13
- pop r12
- ret
-
-//***********************************************************************
-// Field subtraction
-// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]
-//***********************************************************************
-.global fpsub434_asm
-fpsub434_asm:
- push r12
- push r13
- push r14
-
- xor rax, rax
- mov r8, [reg_p1]
- mov r9, [reg_p1+8]
- mov r10, [reg_p1+16]
- mov r11, [reg_p1+24]
- mov r12, [reg_p1+32]
- mov r13, [reg_p1+40]
- mov r14, [reg_p1+48]
- sub r8, [reg_p2]
- sbb r9, [reg_p2+8]
- sbb r10, [reg_p2+16]
- sbb r11, [reg_p2+24]
- sbb r12, [reg_p2+32]
- sbb r13, [reg_p2+40]
- sbb r14, [reg_p2+48]
- sbb rax, 0
-
- mov rcx, [rip+asm_p434x2]
- mov rdi, [rip+asm_p434x2+8]
- mov rsi, [rip+asm_p434x2+24]
- and rcx, rax
- and rdi, rax
- and rsi, rax
- add r8, rcx
- adc r9, rdi
- adc r10, rdi
- adc r11, rsi
- mov [reg_p3], r8
- mov [reg_p3+8], r9
- mov [reg_p3+16], r10
- mov [reg_p3+24], r11
- setc cl
-
- mov r8, [rip+asm_p434x2+32]
- mov rdi, [rip+asm_p434x2+40]
- mov rsi, [rip+asm_p434x2+48]
- and r8, rax
- and rdi, rax
- and rsi, rax
- bt rcx, 0
- adc r12, r8
- adc r13, rdi
- adc r14, rsi
- mov [reg_p3+32], r12
- mov [reg_p3+40], r13
- mov [reg_p3+48], r14
-
- pop r14
- pop r13
- pop r12
- ret
-
-///////////////////////////////////////////////////////////////// MACRO
-// Schoolbook integer multiplication, a full row at a time
-// Inputs: memory pointers M0 and M1
-// Outputs: memory pointer C
-// Temps: regs T0:T9
-/////////////////////////////////////////////////////////////////
-
-#ifdef S2N_ADX
-.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
- mov rdx, \M0
- mulx \T0, \T1, \M1 // T0:T1 = A0*B0
- mov \C, \T1 // C0_final
- mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1
- xor rax, rax
- adox \T0, \T2
- mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2
- adox \T1, \T3
-
- mov rdx, 8\M0
- mulx \T3, \T4, \M1 // T3:T4 = A1*B0
- adox \T2, rax
- xor rax, rax
- mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1
- adox \T4, \T0
- mov 8\C, \T4 // C1_final
- adcx \T3, \T6
- mulx \T6, \T0, 16\M1 // T6:T0 = A1*B2
- adox \T3, \T1
- adcx \T5, \T0
- adcx \T6, rax
- adox \T5, \T2
-
- mov rdx, 16\M0
- mulx \T1, \T0, \M1 // T1:T0 = A2*B0
- adox \T6, rax
- xor rax, rax
- mulx \T4, \T2, 8\M1 // T4:T2 = A2*B1
- adox \T0, \T3
- mov 16\C, \T0 // C2_final
- adcx \T1, \T5
- mulx \T0, \T3, 16\M1 // T0:T3 = A2*B2
- adcx \T4, \T6
- adcx \T0, rax
- adox \T1, \T2
- adox \T3, \T4
- adox \T0, rax
- mov 24\C, \T1 // C3_final
- mov 32\C, \T3 // C4_final
- mov 40\C, \T0 // C5_final
-.endm
-
-.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
- mov rdx, \M0
- mulx \T0, \T1, \M1 // T0:T1 = A0*B0
- mov \C, \T1 // C0_final
- mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1
- xor rax, rax
- adox \T0, \T2
- mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2
- adox \T1, \T3
- mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3
- adox \T2, \T4
-
- mov rdx, 8\M0
- mulx \T5, \T4, \M1 // T5:T4 = A1*B0
- adox \T3, rax
- xor rax, rax
- mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1
- adox \T4, \T0
- mov 8\C, \T4 // C1_final
- adcx \T5, \T7
- mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2
- adcx \T6, \T8
- adox \T5, \T1
- mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3
- adcx \T7, \T9
- adcx \T8, rax
- adox \T6, \T2
-
- mov rdx, 16\M0
- mulx \T1, \T0, \M1 // T1:T0 = A2*B0
- adox \T7, \T3
- adox \T8, rax
- xor rax, rax
- mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1
- adox \T0, \T5
- mov 16\C, \T0 // C2_final
- adcx \T1, \T3
- mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2
- adcx \T2, \T4
- adox \T1, \T6
-    mulx   \T4, \T9, 24\M1   // T4:T9 = A2*B3
- adcx \T3, \T9
- mov rdx, 24\M0
- adcx \T4, rax
-
- adox \T2, \T7
- adox \T3, \T8
- adox \T4, rax
-
- mulx \T5, \T0, \M1 // T5:T0 = A3*B0
- xor rax, rax
- mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1
- adcx \T5, \T7
- adox \T1, \T0
- mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2
- adcx \T6, \T8
- adox \T2, \T5
- mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3
- adcx \T7, \T9
- adcx \T8, rax
-
- adox \T3, \T6
- adox \T4, \T7
- adox \T8, rax
- mov 24\C, \T1 // C3_final
- mov 32\C, \T2 // C4_final
- mov 40\C, \T3 // C5_final
- mov 48\C, \T4 // C6_final
- mov 56\C, \T8 // C7_final
-.endm
-
-#else // S2N_ADX
-
-.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
- mov rdx, \M0
- mulx \T0, \T1, \M1 // T0:T1 = A0*B0
- mov \C, \T1 // C0_final
- mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1
- add \T0, \T2
- mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2
- adc \T1, \T3
-
- mov rdx, 8\M0
- mulx \T3, \T4, \M1 // T3:T4 = A1*B0
- adc \T2, 0
- mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1
- add \T4, \T0
- mov 8\C, \T4 // C1_final
- adc \T3, \T1
- adc \T5, \T2
- mulx \T0, \T1, 16\M1 // T0:T1 = A1*B2
- adc \T0, 0
-
- add \T3, \T6
- adc \T5, \T1
- adc \T0, 0
-
- mov rdx, 16\M0
- mulx \T1, \T2, \M1 // T1:T2 = A2*B0
- add \T2, \T3
- mov 16\C, \T2 // C2_final
- mulx \T4, \T6, 8\M1 // T4:T6 = A2*B1
- adc \T1, \T5
- adc \T0, \T4
-    mulx   \T2, \T3, 16\M1   // T2:T3 = A2*B2
- adc \T2, 0
- add \T1, \T6
- adc \T0, \T3
- adc \T2, 0
- mov 24\C, \T1 // C3_final
- mov 32\C, \T0 // C4_final
- mov 40\C, \T2 // C5_final
-.endm
-
-.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
- mov rdx, \M0
- mulx \T0, \T1, \M1 // T0:T1 = A0*B0
- mov \C, \T1 // C0_final
- mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1
- add \T0, \T2
- mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2
- adc \T1, \T3
- mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3
- adc \T2, \T4
- mov rdx, 8\M0
- adc \T3, 0
-
- mulx \T5, \T4, \M1 // T5:T4 = A1*B0
- mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1
- add \T5, \T7
- mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2
- adc \T6, \T8
- mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3
- adc \T7, \T9
- adc \T8, 0
-
- add \T4, \T0
- mov 8\C, \T4 // C1_final
- adc \T5, \T1
- adc \T6, \T2
- adc \T7, \T3
- mov rdx, 16\M0
- adc \T8, 0
-
- mulx \T1, \T0, \M1 // T1:T0 = A2*B0
- mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1
- add \T1, \T3
- mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2
- adc \T2, \T4
-    mulx   \T4, \T9, 24\M1   // T4:T9 = A2*B3
- adc \T3, \T9
- mov rdx, 24\M0
- adc \T4, 0
-
- add \T0, \T5
- mov 16\C, \T0 // C2_final
- adc \T1, \T6
- adc \T2, \T7
- adc \T3, \T8
- adc \T4, 0
-
- mulx \T5, \T0, \M1 // T5:T0 = A3*B0
- mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1
- add \T5, \T7
- mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2
- adc \T6, \T8
- mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3
- adc \T7, \T9
- adc \T8, 0
-
- add \T1, \T0
- mov 24\C, \T1 // C3_final
- adc \T2, \T5
- mov 32\C, \T2 // C4_final
- adc \T3, \T6
- mov 40\C, \T3 // C5_final
- adc \T4, \T7
- mov 48\C, \T4 // C6_final
- adc \T8, 0
- mov 56\C, \T8 // C7_final
-.endm
-#endif // S2N_ADX
-
-//*****************************************************************************
-// 434-bit multiplication using Karatsuba (one level), schoolbook (one level)
-//*****************************************************************************
-.global mul434_asm
-mul434_asm:
- push r12
- push r13
- push r14
- push r15
- mov rcx, reg_p3
-
- // r8-r11 <- AH + AL, rax <- mask
- xor rax, rax
- mov r8, [reg_p1]
- mov r9, [reg_p1+8]
- mov r10, [reg_p1+16]
- mov r11, [reg_p1+24]
- push rbx
- push rbp
- sub rsp, 96
- add r8, [reg_p1+32]
- adc r9, [reg_p1+40]
- adc r10, [reg_p1+48]
- adc r11, 0
- sbb rax, 0
- mov [rsp], r8
- mov [rsp+8], r9
- mov [rsp+16], r10
- mov [rsp+24], r11
-
- // r12-r15 <- BH + BL, rbx <- mask
- xor rbx, rbx
- mov r12, [reg_p2]
- mov r13, [reg_p2+8]
- mov r14, [reg_p2+16]
- mov r15, [reg_p2+24]
- add r12, [reg_p2+32]
- adc r13, [reg_p2+40]
- adc r14, [reg_p2+48]
- adc r15, 0
- sbb rbx, 0
- mov [rsp+32], r12
- mov [rsp+40], r13
- mov [rsp+48], r14
- mov [rsp+56], r15
-
- // r12-r15 <- masked (BH + BL)
- and r12, rax
- and r13, rax
- and r14, rax
- and r15, rax
-
- // r8-r11 <- masked (AH + AL)
- and r8, rbx
- and r9, rbx
- and r10, rbx
- and r11, rbx
-
- // r8-r11 <- masked (AH + AL) + masked (BH + BL)
- add r8, r12
- adc r9, r13
- adc r10, r14
- adc r11, r15
- mov [rsp+64], r8
- mov [rsp+72], r9
- mov [rsp+80], r10
- mov [rsp+88], r11
-
- // [rsp] <- (AH+AL) x (BH+BL), low part
- MUL256_SCHOOL [rsp], [rsp+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp
-
- // [rcx] <- AL x BL
- MUL256_SCHOOL [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp // Result C0-C3
-
- // [rcx+64] <- AH x BH
- MUL192_SCHOOL [reg_p1+32], [reg_p2+32], [rcx+64], r8, r9, r10, r11, r12, r13, r14
-
- // r8-r11 <- (AH+AL) x (BH+BL), final step
- mov r8, [rsp+64]
- mov r9, [rsp+72]
- mov r10, [rsp+80]
- mov r11, [rsp+88]
- mov rax, [rsp+32]
- add r8, rax
- mov rax, [rsp+40]
- adc r9, rax
- mov rax, [rsp+48]
- adc r10, rax
- mov rax, [rsp+56]
- adc r11, rax
-
- // [rsp], x3-x5 <- (AH+AL) x (BH+BL) - ALxBL
- mov r12, [rsp]
- mov r13, [rsp+8]
- mov r14, [rsp+16]
- mov r15, [rsp+24]
- sub r12, [rcx]
- sbb r13, [rcx+8]
- sbb r14, [rcx+16]
- sbb r15, [rcx+24]
- sbb r8, [rcx+32]
- sbb r9, [rcx+40]
- sbb r10, [rcx+48]
- sbb r11, [rcx+56]
-
- // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
- sub r12, [rcx+64]
- sbb r13, [rcx+72]
- sbb r14, [rcx+80]
- sbb r15, [rcx+88]
- sbb r8, [rcx+96]
- sbb r9, [rcx+104]
- sbb r10, 0
- sbb r11, 0
-
- add r12, [rcx+32]
- mov [rcx+32], r12 // Result C4-C7
- adc r13, [rcx+40]
- mov [rcx+40], r13
- adc r14, [rcx+48]
- mov [rcx+48], r14
- adc r15, [rcx+56]
- mov [rcx+56], r15
- adc r8, [rcx+64]
- mov [rcx+64], r8 // Result C8-C15
- adc r9, [rcx+72]
- mov [rcx+72], r9
- adc r10, [rcx+80]
- mov [rcx+80], r10
- adc r11, [rcx+88]
- mov [rcx+88], r11
- mov r12, [rcx+96]
- adc r12, 0
- mov [rcx+96], r12
- mov r13, [rcx+104]
- adc r13, 0
- mov [rcx+104], r13
-
- add rsp, 96
- pop rbp
- pop rbx
- pop r15
- pop r14
- pop r13
- pop r12
- ret
-
-///////////////////////////////////////////////////////////////// MACRO
-// Schoolbook integer multiplication
-// Inputs: memory pointers M0 and M1
-// Outputs: regs T0:T5
-// Temps:   regs T5:T6, plus rax and rdx
-/////////////////////////////////////////////////////////////////
-.macro MUL64x256_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5
- mov rdx, \M0
- mulx \T1, \T0, \M1 // T0 <- C0_final
- mulx \T2, \T4, 8\M1
- xor rax, rax
- mulx \T3, \T5, 16\M1
- ADD1 \T1, \T4 // T1 <- C1_final
- ADC1 \T2, \T5 // T2 <- C2_final
- mulx \T4, \T5, 24\M1
- ADC1 \T3, \T5 // T3 <- C3_final
- ADC1 \T4, rax // T4 <- C4_final
-.endm
-
-#ifdef S2N_ADX
-.macro MUL128x256_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5, T6
- mov rdx, \M0
- mulx \T1, \T0, \M1 // T0 <- C0_final
- mulx \T2, \T4, 8\M1
- xor rax, rax
- mulx \T3, \T5, 16\M1
- ADD1 \T1, \T4
- ADC1 \T2, \T5
- mulx \T4, \T5, 24\M1
- ADC1 \T3, \T5
- ADC1 \T4, rax
-
- xor rax, rax
- mov rdx, 8\M0
- mulx \T6, \T5, \M1
- ADD2 \T1, \T5 // T1 <- C1_final
- ADC2 \T2, \T6
- mulx \T5, \T6, 8\M1
- ADC2 \T3, \T5
- ADD1 \T2, \T6
- mulx \T5, \T6, 16\M1
- ADC2 \T4, \T5
- ADC1 \T3, \T6
- mulx \T5, \T6, 24\M1
- ADC2 \T5, rax
- ADC1 \T4, \T6
- ADC1 \T5, rax
-.endm
-
-#else // S2N_ADX
-
-.macro MUL128x256_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5, T6
- mov rdx, \M0
- mulx \T1, \T0, \M1 // T0 <- C0_final
- mulx \T2, \T4, 8\M1
- mulx \T3, \T5, 16\M1
- add \T1, \T4
- adc \T2, \T5
- mulx \T4, \T5, 24\M1
- adc \T3, \T5
- adc \T4, 0
-
- mov rdx, 8\M0
- mulx \T6, \T5, \M1
- add \T1, \T5 // T1 <- C1_final
- adc \T2, \T6
- mulx \T5, \T6, 8\M1
- adc \T3, \T5
- mulx \T5, rax, 16\M1
- adc \T4, \T5
- mulx \T5, rdx, 24\M1
- adc \T5, 0
- add \T2, \T6
- adc \T3, rax
- adc \T4, rdx
- adc \T5, 0
-.endm
-#endif // S2N_ADX
-
-//**************************************************************************************
-// Montgomery reduction
-// Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
-// Operation: c [reg_p2] = a [reg_p1] * R^(-1) mod p434, where R = 2^448
-// NOTE: a=c is not allowed
-//**************************************************************************************
-.global rdc434_asm
-rdc434_asm:
- push r12
- push r13
-
- // a[0-1] x p434p1_nz --> result: r8:r13
- MUL128x256_SCHOOL [reg_p1], [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13, rcx
-
- xor rcx, rcx
- add r8, [reg_p1+24]
- adc r9, [reg_p1+32]
- adc r10, [reg_p1+40]
- adc r11, [reg_p1+48]
- adc r12, [reg_p1+56]
- adc r13, [reg_p1+64]
- adc rcx, [reg_p1+72]
- mov [reg_p1+24], r8
- mov [reg_p1+32], r9
- mov [reg_p1+40], r10
- mov [reg_p1+48], r11
- mov [reg_p1+56], r12
- mov [reg_p1+64], r13
- mov [reg_p1+72], rcx
- mov r8, [reg_p1+80]
- mov r9, [reg_p1+88]
- mov r10, [reg_p1+96]
- mov r11, [reg_p1+104]
- adc r8, 0
- adc r9, 0
- adc r10, 0
- adc r11, 0
- mov [reg_p1+80], r8
- mov [reg_p1+88], r9
- mov [reg_p1+96], r10
- mov [reg_p1+104], r11
-
- // a[2-3] x p434p1_nz --> result: r8:r13
- MUL128x256_SCHOOL [reg_p1+16], [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13, rcx
-
- xor rcx, rcx
- add r8, [reg_p1+40]
- adc r9, [reg_p1+48]
- adc r10, [reg_p1+56]
- adc r11, [reg_p1+64]
- adc r12, [reg_p1+72]
- adc r13, [reg_p1+80]
- adc rcx, [reg_p1+88]
- mov [reg_p1+40], r8
- mov [reg_p1+48], r9
- mov [reg_p1+56], r10
- mov [reg_p1+64], r11
- mov [reg_p1+72], r12
- mov [reg_p1+80], r13
- mov [reg_p1+88], rcx
- mov r8, [reg_p1+96]
- mov r9, [reg_p1+104]
- adc r8, 0
- adc r9, 0
- mov [reg_p1+96], r8
- mov [reg_p1+104], r9
-
- // a[4-5] x p434p1_nz --> result: r8:r13
- MUL128x256_SCHOOL [reg_p1+32], [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13, rcx
-
- xor rcx, rcx
- add r8, [reg_p1+56]
- adc r9, [reg_p1+64]
- adc r10, [reg_p1+72]
- adc r11, [reg_p1+80]
- adc r12, [reg_p1+88]
- adc r13, [reg_p1+96]
- adc rcx, [reg_p1+104]
- mov [reg_p2], r8 // Final result c0-c1
- mov [reg_p2+8], r9
- mov [reg_p1+72], r10
- mov [reg_p1+80], r11
- mov [reg_p1+88], r12
- mov [reg_p1+96], r13
- mov [reg_p1+104], rcx
-
- // a[6-7] x p434p1_nz --> result: r8:r12
- MUL64x256_SCHOOL [reg_p1+48], [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13
-
- // Final result c2:c6
- add r8, [reg_p1+72]
- adc r9, [reg_p1+80]
- adc r10, [reg_p1+88]
- adc r11, [reg_p1+96]
- adc r12, [reg_p1+104]
- mov [reg_p2+16], r8
- mov [reg_p2+24], r9
- mov [reg_p2+32], r10
- mov [reg_p2+40], r11
- mov [reg_p2+48], r12
-
- pop r13
- pop r12
- ret
-
-//***********************************************************************
-// 434-bit multiprecision addition
-// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
-//***********************************************************************
-.global mp_add434_asm
-mp_add434_asm:
- mov r8, [reg_p1]
- mov r9, [reg_p1+8]
- mov r10, [reg_p1+16]
- mov r11, [reg_p1+24]
- add r8, [reg_p2]
- adc r9, [reg_p2+8]
- adc r10, [reg_p2+16]
- adc r11, [reg_p2+24]
- mov [reg_p3], r8
- mov [reg_p3+8], r9
- mov [reg_p3+16], r10
- mov [reg_p3+24], r11
-
- mov r8, [reg_p1+32]
- mov r9, [reg_p1+40]
- mov r10, [reg_p1+48]
- adc r8, [reg_p2+32]
- adc r9, [reg_p2+40]
- adc r10, [reg_p2+48]
- mov [reg_p3+32], r8
- mov [reg_p3+40], r9
- mov [reg_p3+48], r10
- ret
-
-//***********************************************************************
-// 2x434-bit multiprecision subtraction/addition
-// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. If c < 0, add p434*2^448
-//***********************************************************************
-.global mp_subadd434x2_asm
-mp_subadd434x2_asm:
- push r12
- push r13
- push r14
- push r15
- xor rax, rax
- mov r8, [reg_p1]
- mov r9, [reg_p1+8]
- mov r10, [reg_p1+16]
- mov r11, [reg_p1+24]
- mov r12, [reg_p1+32]
- sub r8, [reg_p2]
- sbb r9, [reg_p2+8]
- sbb r10, [reg_p2+16]
- sbb r11, [reg_p2+24]
- sbb r12, [reg_p2+32]
- mov [reg_p3], r8
- mov [reg_p3+8], r9
- mov [reg_p3+16], r10
- mov [reg_p3+24], r11
- mov [reg_p3+32], r12
-
- mov r8, [reg_p1+40]
- mov r9, [reg_p1+48]
- mov r10, [reg_p1+56]
- mov r11, [reg_p1+64]
- mov r12, [reg_p1+72]
- sbb r8, [reg_p2+40]
- sbb r9, [reg_p2+48]
- sbb r10, [reg_p2+56]
- sbb r11, [reg_p2+64]
- sbb r12, [reg_p2+72]
- mov [reg_p3+40], r8
- mov [reg_p3+48], r9
- mov [reg_p3+56], r10
-
- mov r13, [reg_p1+80]
- mov r14, [reg_p1+88]
- mov r15, [reg_p1+96]
- mov rcx, [reg_p1+104]
- sbb r13, [reg_p2+80]
- sbb r14, [reg_p2+88]
- sbb r15, [reg_p2+96]
- sbb rcx, [reg_p2+104]
- sbb rax, 0
-
- // Add p434 anded with the mask in rax
- mov r8, [rip+asm_p434]
- mov r9, [rip+asm_p434+24]
- mov r10, [rip+asm_p434+32]
- mov rdi, [rip+asm_p434+40]
- mov rsi, [rip+asm_p434+48]
- and r8, rax
- and r9, rax
- and r10, rax
- and rdi, rax
- and rsi, rax
- mov rax, [reg_p3+56]
- add rax, r8
- adc r11, r8
- adc r12, r8
- adc r13, r9
- adc r14, r10
- adc r15, rdi
- adc rcx, rsi
-
- mov [reg_p3+56], rax
- mov [reg_p3+64], r11
- mov [reg_p3+72], r12
- mov [reg_p3+80], r13
- mov [reg_p3+88], r14
- mov [reg_p3+96], r15
- mov [reg_p3+104], rcx
- pop r15
- pop r14
- pop r13
- pop r12
- ret
-
-//***********************************************************************
-// Double 2x434-bit multiprecision subtraction
-// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2]
-//***********************************************************************
-.global mp_dblsub434x2_asm
-mp_dblsub434x2_asm:
- push r12
- push r13
-
- xor rax, rax
- mov r8, [reg_p3]
- mov r9, [reg_p3+8]
- mov r10, [reg_p3+16]
- mov r11, [reg_p3+24]
- mov r12, [reg_p3+32]
- mov r13, [reg_p3+40]
- mov rcx, [reg_p3+48]
- sub r8, [reg_p1]
- sbb r9, [reg_p1+8]
- sbb r10, [reg_p1+16]
- sbb r11, [reg_p1+24]
- sbb r12, [reg_p1+32]
- sbb r13, [reg_p1+40]
- sbb rcx, [reg_p1+48]
- adc rax, 0
- sub r8, [reg_p2]
- sbb r9, [reg_p2+8]
- sbb r10, [reg_p2+16]
- sbb r11, [reg_p2+24]
- sbb r12, [reg_p2+32]
- sbb r13, [reg_p2+40]
- sbb rcx, [reg_p2+48]
- adc rax, 0
- mov [reg_p3], r8
- mov [reg_p3+8], r9
- mov [reg_p3+16], r10
- mov [reg_p3+24], r11
- mov [reg_p3+32], r12
- mov [reg_p3+40], r13
- mov [reg_p3+48], rcx
-
- mov r8, [reg_p3+56]
- mov r9, [reg_p3+64]
- mov r10, [reg_p3+72]
- mov r11, [reg_p3+80]
- mov r12, [reg_p3+88]
- mov r13, [reg_p3+96]
- mov rcx, [reg_p3+104]
- sub r8, rax
- sbb r8, [reg_p1+56]
- sbb r9, [reg_p1+64]
- sbb r10, [reg_p1+72]
- sbb r11, [reg_p1+80]
- sbb r12, [reg_p1+88]
- sbb r13, [reg_p1+96]
- sbb rcx, [reg_p1+104]
- sub r8, [reg_p2+56]
- sbb r9, [reg_p2+64]
- sbb r10, [reg_p2+72]
- sbb r11, [reg_p2+80]
- sbb r12, [reg_p2+88]
- sbb r13, [reg_p2+96]
- sbb rcx, [reg_p2+104]
- mov [reg_p3+56], r8
- mov [reg_p3+64], r9
- mov [reg_p3+72], r10
- mov [reg_p3+80], r11
- mov [reg_p3+88], r12
- mov [reg_p3+96], r13
- mov [reg_p3+104], rcx
-
- pop r13
- pop r12
- ret
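
mul434_asm above evaluates one level of Karatsuba over the schoolbook macros:
C = AL*BL + ((AH+AL)*(BH+BL) - AL*BL - AH*BH)*2^256 + AH*BH*2^512, with each operand
split into a 256-bit low half and a 192-bit high half. A minimal C sketch of the same
identity on toy 16-bit halves (the narrow types make the AH+AL / BH+BL carry masking
done by the asm unnecessary here):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
    /* a = AH*2^16 + AL and b = BH*2^16 + BL, limbs stored little-endian */
    uint16_t a[2] = {0xCDEF, 0x89AB};
    uint16_t b[2] = {0x3210, 0x7654};

    uint64_t albl = (uint64_t)a[0] * b[0];                        /* AL x BL */
    uint64_t ahbh = (uint64_t)a[1] * b[1];                        /* AH x BH */
    uint64_t mid  = (uint64_t)(a[0] + (uint32_t)a[1]) *
                             (b[0] + (uint32_t)b[1]);             /* (AH+AL) x (BH+BL) */
    mid -= albl + ahbh;                                           /* cross term AL*BH + AH*BL */

    uint64_t c   = albl + (mid << 16) + (ahbh << 32);             /* recombine the halves */
    uint64_t ref = (uint64_t)(((uint32_t)a[1] << 16) | a[0]) *
                            (((uint32_t)b[1] << 16) | b[0]);
    printf("karatsuba = %" PRIx64 ", schoolbook = %" PRIx64 " (%s)\n",
           c, ref, c == ref ? "match" : "MISMATCH");
    return 0;
}
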
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.c
new file mode 100644
index 0000000000..7ce71ae3d3
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.c
@@ -0,0 +1,146 @@
+/********************************************************************************************
+* Supersingular Isogeny Key Encapsulation Library
+*
+* Abstract: supersingular isogeny parameters and generation of functions for P434
+*********************************************************************************************/
+
+#include "sikep434r3.h"
+
+/* Encoding of field elements, elements over Z_order, elements over GF(p^2) and elliptic curve points:
+ *
+ * Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at
+ * the leftmost position (i.e., little endian format). Elements (a+b*i) over GF(p^2), where a and b are
+ * defined over GF(p), are encoded as {a, b}, with a in the least significant position. Elliptic curve
+ * points P = (x,y) are encoded as {x, y}, with x in the least significant position. Internally, the
+ * number of digits used to represent all these elements is obtained by approximating the number of bits
+ * to the immediately greater multiple of 32. For example, a 434-bit field element is represented with
+ * Ceil(434 / 64) = 7 64-bit digits or Ceil(434 / 32) = 14 32-bit digits.
+ *
+ * Curve isogeny system "SIDHp434". Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over
+ * GF(p434^2), where A=6, B=1, C=1 and p434 = 2^216*3^137-1 */
+
+const uint64_t p434[S2N_SIKE_P434_R3_NWORDS64_FIELD] = {
+ 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+ 0xFDC1767AE2FFFFFF, 0x7BC65C783158AEA3, 0x6CFC5FD681C52056,
+ 0x0002341F27177344
+};
+
+const uint64_t p434x2[S2N_SIKE_P434_R3_NWORDS64_FIELD] = {
+ 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+ 0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47, 0xD9F8BFAD038A40AC,
+ 0x0004683E4E2EE688
+};
+
+const uint64_t p434x4[S2N_SIKE_P434_R3_NWORDS64_FIELD] = {
+ 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+ 0xF705D9EB8BFFFFFF, 0xEF1971E0C562BA8F, 0xB3F17F5A07148159,
+ 0x0008D07C9C5DCD11
+};
+
+const uint64_t p434p1[S2N_SIKE_P434_R3_NWORDS64_FIELD] = {
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+ 0xFDC1767AE3000000, 0x7BC65C783158AEA3, 0x6CFC5FD681C52056,
+ 0x0002341F27177344
+};
+
+/* Alice's generator values {XPA0 + XPA1*i, XQA0 + XQA1*i, XRA0 + XRA1*i} in GF(p434^2),
+ * expressed in Montgomery representation */
+const uint64_t A_gen[6*S2N_SIKE_P434_R3_NWORDS64_FIELD] = {
+ 0x05ADF455C5C345BF, 0x91935C5CC767AC2B, 0xAFE4E879951F0257,
+ 0x70E792DC89FA27B1, 0xF797F526BB48C8CD, 0x2181DB6131AF621F,
+ 0x00000A1C08B1ECC4, /* XPA0 */
+
+ 0x74840EB87CDA7788, 0x2971AA0ECF9F9D0B, 0xCB5732BDF41715D5,
+ 0x8CD8E51F7AACFFAA, 0xA7F424730D7E419F, 0xD671EB919A179E8C,
+ 0x0000FFA26C5A924A, /* XPA1 */
+
+ 0xFEC6E64588B7273B, 0xD2A626D74CBBF1C6, 0xF8F58F07A78098C7,
+ 0xE23941F470841B03, 0x1B63EDA2045538DD, 0x735CFEB0FFD49215,
+ 0x0001C4CB77542876, /* XQA0 */
+
+ 0xADB0F733C17FFDD6, 0x6AFFBD037DA0A050, 0x680EC43DB144E02F,
+ 0x1E2E5D5FF524E374, 0xE2DDA115260E2995, 0xA6E4B552E2EDE508,
+ 0x00018ECCDDF4B53E, /* XQA1 */
+
+ 0x01BA4DB518CD6C7D, 0x2CB0251FE3CC0611, 0x259B0C6949A9121B,
+ 0x60E17AC16D2F82AD, 0x3AA41F1CE175D92D, 0x413FBE6A9B9BC4F3,
+ 0x00022A81D8D55643, /* XRA0 */
+
+ 0xB8ADBC70FC82E54A, 0xEF9CDDB0D5FADDED, 0x5820C734C80096A0,
+ 0x7799994BAA96E0E4, 0x044961599E379AF8, 0xDB2B94FBF09F27E2,
+ 0x0000B87FC716C0C6 /* XRA1 */
+};
+
+/* Bob's generator values {XPB0, XQB0, XRB0 + XRB1*i} in GF(p434^2), expressed in Montgomery representation */
+const uint64_t B_gen[6*S2N_SIKE_P434_R3_NWORDS64_FIELD] = {
+ 0x6E5497556EDD48A3, 0x2A61B501546F1C05, 0xEB919446D049887D,
+ 0x5864A4A69D450C4F, 0xB883F276A6490D2B, 0x22CC287022D5F5B9,
+ 0x0001BED4772E551F, /* XPB0 */
+
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, /* XPB1 */
+
+ 0xFAE2A3F93D8B6B8E, 0x494871F51700FE1C, 0xEF1A94228413C27C,
+ 0x498FF4A4AF60BD62, 0xB00AD2A708267E8A, 0xF4328294E017837F,
+ 0x000034080181D8AE, /* XQB0 */
+
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, /* XQB1 */
+
+ 0x283B34FAFEFDC8E4, 0x9208F44977C3E647, 0x7DEAE962816F4E9A,
+ 0x68A2BA8AA262EC9D, 0x8176F112EA43F45B, 0x02106D022634F504,
+ 0x00007E8A50F02E37, /* XRB0 */
+
+ 0xB378B7C1DA22CCB1, 0x6D089C99AD1D9230, 0xEBE15711813E2369,
+ 0x2B35A68239D48A53, 0x445F6FD138407C93, 0xBEF93B29A3F6B54B,
+ 0x000173FA910377D3 /* XRB1 */
+};
+
+/* Montgomery constant Montgomery_R2 = (2^448)^2 mod p434 */
+const uint64_t Montgomery_R2[S2N_SIKE_P434_R3_NWORDS64_FIELD] = {
+ 0x28E55B65DCD69B30, 0xACEC7367768798C2, 0xAB27973F8311688D,
+ 0x175CC6AF8D6C7C0B, 0xABCD92BF2DDE347E, 0x69E16A61C7686D9A,
+ 0x000025A89BCDD12A
+};
+
+/* Value one in Montgomery representation */
+const uint64_t Montgomery_one[S2N_SIKE_P434_R3_NWORDS64_FIELD] = {
+ 0x000000000000742C, 0x0000000000000000, 0x0000000000000000,
+ 0xB90FF404FC000000, 0xD801A4FB559FACD4, 0xE93254545F77410C,
+ 0x0000ECEEA7BD2EDA
+};
+
+/* Fixed parameters for isogeny tree computation */
+const unsigned int strat_Alice[S2N_SIKE_P434_R3_MAX_ALICE-1] = {
+ 48, 28, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 13, 7, 4,
+ 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 4, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 21, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2,
+ 1, 1, 1, 1, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1
+};
+
+const unsigned int strat_Bob[S2N_SIKE_P434_R3_MAX_BOB-1] = {
+ 66, 33, 17, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1,
+ 2, 1, 1, 16, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 32,
+ 16, 8, 4, 3, 1, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4,
+ 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1
+};
+
+/* Returns true if the machine is big endian */
+bool is_big_endian(void)
+{
+ uint16_t i = 1;
+ uint8_t *ptr = (uint8_t *)&i;
+ return !(*ptr);
+}
+
+uint32_t bswap32(uint32_t x)
+{
+ uint32_t i = (x >> 16) | (x << 16);
+ return ((i & UINT32_C(0xff00ff00)) >> 8) | ((i & UINT32_C(0x00ff00ff)) << 8);
+}
+
+uint64_t bswap64(uint64_t x)
+{
+ return bswap32(x >> 32) | (((uint64_t)bswap32(x)) << 32);
+}
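
A small harness for the endianness helpers above, assuming it is compiled and linked
together with sikep434r3.c (the harness itself is not part of s2n, and the include path
is an assumption):

#include <assert.h>
#include <stdio.h>
#include "sikep434r3.h"   /* brings in the namespaced prototypes */

int main(void)
{
    assert(bswap32(0x11223344u) == 0x44332211u);
    assert(bswap64(UINT64_C(0x1122334455667788)) == UINT64_C(0x8877665544332211));
    assert(bswap64(bswap64(42)) == 42);     /* byte swapping is an involution */
    printf("machine is %s endian\n", is_big_endian() ? "big" : "little");
    return 0;
}
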
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.h
new file mode 100644
index 0000000000..5b797b1d7f
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.h
@@ -0,0 +1,181 @@
+/********************************************************************************************
+* Supersingular Isogeny Key Encapsulation Library
+*
+* Abstract: supersingular isogeny parameters, generation of functions for P434;
+* configuration and platform-dependent macros
+*********************************************************************************************/
+
+#pragma once
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+/* All sikep434r3 functions and global variables in the pq-crypto/sike_r3 directory
+ * should be defined using this namespace macro to avoid symbol collisions. For example,
+ * in foo.h, declare a function as follows:
+ *
+ * #define foo_function S2N_SIKE_P434_R3_NAMESPACE(foo_function)
+ * int foo_function(int foo_argument); */
+#define S2N_SIKE_P434_R3_NAMESPACE(s) s2n_sike_p434_r3_##s
+
+/* Endian-related functionality */
+/* Returns true if the machine is big endian */
+#define is_big_endian S2N_SIKE_P434_R3_NAMESPACE(is_big_endian)
+bool is_big_endian(void);
+
+#define bswap32 S2N_SIKE_P434_R3_NAMESPACE(bswap32)
+uint32_t bswap32(uint32_t x);
+
+#define bswap64 S2N_SIKE_P434_R3_NAMESPACE(bswap64)
+uint64_t bswap64(uint64_t x);
+
+/* Arch specific definitions */
+#define digit_t S2N_SIKE_P434_R3_NAMESPACE(digit_t)
+#define hdigit_t S2N_SIKE_P434_R3_NAMESPACE(hdigit_t)
+#if defined(_AMD64_) || defined(__x86_64) || defined(__x86_64__) || defined(__aarch64__) || defined(_S390X_) || defined(_ARM64_) || defined(__powerpc64__) || (defined(__riscv) && (__riscv_xlen == 64))
+ #define S2N_SIKE_P434_R3_NWORDS_FIELD 7 /* Number of words of a 434-bit field element */
+ #define S2N_SIKE_P434_R3_ZERO_WORDS 3 /* Number of "0" digits in the least significant part of p434 + 1 */
+ #define S2N_SIKE_P434_R3_RADIX 64
+ #define S2N_SIKE_P434_R3_LOG2RADIX 6
+ #define S2N_SIKE_P434_R3_BSWAP_DIGIT(i) bswap64((i))
+ typedef uint64_t digit_t;
+ typedef uint32_t hdigit_t;
+#elif defined(_X86_) || defined(_ARM_) || defined(__arm__) || defined(__i386__)
+ #define S2N_SIKE_P434_R3_NWORDS_FIELD 14 /* Number of words of a 434-bit field element */
+ #define S2N_SIKE_P434_R3_ZERO_WORDS 6 /* Number of "0" digits in the least significant part of p434 + 1 */
+ #define S2N_SIKE_P434_R3_RADIX 32
+ #define S2N_SIKE_P434_R3_LOG2RADIX 5
+ #define S2N_SIKE_P434_R3_BSWAP_DIGIT(i) bswap32((i))
+ typedef uint32_t digit_t;
+ typedef uint16_t hdigit_t;
+#else
+ #error -- "Unsupported ARCHITECTURE"
+#endif
+
+/* Basic constants */
+#define S2N_SIKE_P434_R3_NBITS_FIELD 434
+#define S2N_SIKE_P434_R3_MAXBITS_FIELD 448
+/* Number of 64-bit words of a 434-bit field element */
+#define S2N_SIKE_P434_R3_NWORDS64_FIELD ((S2N_SIKE_P434_R3_NBITS_FIELD+63)/64)
+#define S2N_SIKE_P434_R3_NBITS_ORDER 256
+/* Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp. */
+#define S2N_SIKE_P434_R3_NWORDS_ORDER ((S2N_SIKE_P434_R3_NBITS_ORDER+S2N_SIKE_P434_R3_RADIX-1)/S2N_SIKE_P434_R3_RADIX)
+#define S2N_SIKE_P434_R3_ALICE 0
+#define S2N_SIKE_P434_R3_BOB 1
+#define S2N_SIKE_P434_R3_OALICE_BITS 216
+#define S2N_SIKE_P434_R3_OBOB_BITS 218
+#define S2N_SIKE_P434_R3_MASK_ALICE 0xFF
+#define S2N_SIKE_P434_R3_MASK_BOB 0x01
+
+/* Fixed parameters for isogeny tree computation */
+#define S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE 7
+#define S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB 8
+#define S2N_SIKE_P434_R3_MAX_ALICE 108
+#define S2N_SIKE_P434_R3_MAX_BOB 137
+#define S2N_SIKE_P434_R3_MSG_BYTES 16
+#define S2N_SIKE_P434_R3_SECRETKEY_A_BYTES ((S2N_SIKE_P434_R3_OALICE_BITS + 7) / 8)
+#define S2N_SIKE_P434_R3_SECRETKEY_B_BYTES ((S2N_SIKE_P434_R3_OBOB_BITS - 1 + 7) / 8)
+#define S2N_SIKE_P434_R3_FP2_ENCODED_BYTES (2 * ((S2N_SIKE_P434_R3_NBITS_FIELD + 7) / 8))
+
+/* SIDH's basic element definitions and point representations */
+/* Datatype for representing 434-bit field elements (448-bit max.) */
+#define felm_t S2N_SIKE_P434_R3_NAMESPACE(felm_t)
+typedef digit_t felm_t[S2N_SIKE_P434_R3_NWORDS_FIELD];
+
+/* Datatype for representing double-precision 2x434-bit field elements (2x448-bit max.) */
+#define dfelm_t S2N_SIKE_P434_R3_NAMESPACE(dfelm_t)
+typedef digit_t dfelm_t[2*S2N_SIKE_P434_R3_NWORDS_FIELD];
+
+/* Datatype for representing quadratic extension field elements GF(p434^2) */
+#define f2elm_t S2N_SIKE_P434_R3_NAMESPACE(f2elm_t)
+#define felm_s S2N_SIKE_P434_R3_NAMESPACE(felm_s)
+typedef struct felm_s {
+ felm_t e[2];
+} f2elm_t;
+
+/* Point representation in projective XZ Montgomery coordinates. */
+#define point_proj S2N_SIKE_P434_R3_NAMESPACE(point_proj)
+typedef struct { f2elm_t X; f2elm_t Z; } point_proj;
+#define point_proj_t S2N_SIKE_P434_R3_NAMESPACE(point_proj_t)
+typedef point_proj point_proj_t[1];
+
+/* Macro to avoid compiler warnings when detecting unreferenced parameters */
+#define S2N_SIKE_P434_R3_UNREFERENCED_PARAMETER(PAR) ((void)(PAR))
+
+/********************** Constant-time unsigned comparisons ***********************/
+/* The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise */
+
+/* Is x != 0? */
+static __inline unsigned int is_digit_nonzero_ct(const digit_t x)
+{
+ return (unsigned int)((x | (0-x)) >> (S2N_SIKE_P434_R3_RADIX-1));
+}
+
+/* Is x = 0? */
+static __inline unsigned int is_digit_zero_ct(const digit_t x)
+{
+ return (unsigned int)(1 ^ is_digit_nonzero_ct(x));
+}
+
+/* Is x < y? */
+static __inline unsigned int is_digit_lessthan_ct(const digit_t x, const digit_t y)
+{
+ return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (S2N_SIKE_P434_R3_RADIX-1));
+}
+
+/* Definitions for generic C implementation */
+
+typedef uint64_t uint128_t[2];
+
+/* Digit multiplication */
+#define S2N_SIKE_P434_R3_MUL(multiplier, multiplicand, hi, lo) \
+ digit_x_digit((multiplier), (multiplicand), &(lo));
+
+/* Digit addition with carry */
+#define S2N_SIKE_P434_R3_ADDC(carryIn, addend1, addend2, carryOut, sumOut) \
+ { digit_t tempReg = (addend1) + (digit_t)(carryIn); \
+ (sumOut) = (addend2) + tempReg; \
+ (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); }
+
+/* Digit subtraction with borrow */
+#define S2N_SIKE_P434_R3_SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \
+ { digit_t tempReg = (minuend) - (subtrahend); \
+ unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg))); \
+ (differenceOut) = tempReg - (digit_t)(borrowIn); \
+ (borrowOut) = borrowReg; }
+
+/* Shift right with flexible datatype */
+#define S2N_SIKE_P434_R3_SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \
+ (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << ((DigitSize) - (shift)));
+
+/* Fixed parameters for computation */
+#define p434 S2N_SIKE_P434_R3_NAMESPACE(p434)
+extern const uint64_t p434[S2N_SIKE_P434_R3_NWORDS64_FIELD];
+
+#define p434x2 S2N_SIKE_P434_R3_NAMESPACE(p434x2)
+extern const uint64_t p434x2[S2N_SIKE_P434_R3_NWORDS64_FIELD];
+
+#define p434x4 S2N_SIKE_P434_R3_NAMESPACE(p434x4)
+extern const uint64_t p434x4[S2N_SIKE_P434_R3_NWORDS64_FIELD];
+
+#define p434p1 S2N_SIKE_P434_R3_NAMESPACE(p434p1)
+extern const uint64_t p434p1[S2N_SIKE_P434_R3_NWORDS64_FIELD];
+
+#define A_gen S2N_SIKE_P434_R3_NAMESPACE(A_gen)
+extern const uint64_t A_gen[6*S2N_SIKE_P434_R3_NWORDS64_FIELD];
+
+#define B_gen S2N_SIKE_P434_R3_NAMESPACE(B_gen)
+extern const uint64_t B_gen[6*S2N_SIKE_P434_R3_NWORDS64_FIELD];
+
+#define Montgomery_R2 S2N_SIKE_P434_R3_NAMESPACE(Montgomery_R2)
+extern const uint64_t Montgomery_R2[S2N_SIKE_P434_R3_NWORDS64_FIELD];
+
+#define Montgomery_one S2N_SIKE_P434_R3_NAMESPACE(Montgomery_one)
+extern const uint64_t Montgomery_one[S2N_SIKE_P434_R3_NWORDS64_FIELD];
+
+#define strat_Alice S2N_SIKE_P434_R3_NAMESPACE(strat_Alice)
+extern const unsigned int strat_Alice[S2N_SIKE_P434_R3_MAX_ALICE-1];
+
+#define strat_Bob S2N_SIKE_P434_R3_NAMESPACE(strat_Bob)
+extern const unsigned int strat_Bob[S2N_SIKE_P434_R3_MAX_BOB-1];
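
The constant-time helpers declared above avoid data-dependent branches: for w-bit
digits, (x ^ ((x ^ y) | ((x - y) ^ y))) has its top bit set exactly when the
subtraction x - y borrows, i.e. when x < y. A minimal harness exercising them (not s2n
code; the include path is an assumption):

#include <assert.h>
#include <stdio.h>
#include "sikep434r3.h"

int main(void)
{
    assert(is_digit_nonzero_ct(0) == 0 && is_digit_nonzero_ct(123) == 1);
    assert(is_digit_zero_ct(0) == 1 && is_digit_zero_ct((digit_t)-1) == 0);
    assert(is_digit_lessthan_ct(3, 7) == 1);
    assert(is_digit_lessthan_ct(7, 3) == 0);
    assert(is_digit_lessthan_ct(7, 7) == 0);    /* strictly less-than */
    puts("constant-time comparisons OK");
    return 0;
}
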
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_api.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_api.h
new file mode 100644
index 0000000000..cf3c4feb85
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_api.h
@@ -0,0 +1,78 @@
+/********************************************************************************************
+* Supersingular Isogeny Key Encapsulation Library
+*
+* Abstract: API header file for P434
+*********************************************************************************************/
+
+#pragma once
+
+#include "sikep434r3.h"
+
+/*********************** Key encapsulation mechanism API ***********************/
+/* Encoding of keys for KEM-based isogeny system "SIKEp434" (wire format):
+ *
+ * Elements over GF(p434) are encoded in 55 octets in little endian format (i.e., the least
+ * significant octet is located in the lowest memory address). Elements (a+b*i) over GF(p434^2),
+ * where a and b are defined over GF(p434), are encoded as {a, b}, with a in the lowest memory portion.
+ *
+ * Private keys sk consist of the concatenation of a 16-byte random value, a value in the range
+ * [0, 2^217-1] and the public key pk. In the SIKE API, private keys are encoded in 374 octets in
+ * little endian format. Public keys pk consist of 3 elements in GF(p434^2). In the SIKE API, pk
+ * is encoded in 330 octets. Ciphertexts ct consist of the concatenation of a public key value
+ * and a 16-byte value. In the SIKE API, ct is encoded in 330 + 16 = 346 octets. Shared keys ss
+ * consist of a value of 16 octets. */
+
+/*********************** Ephemeral key exchange API ***********************/
+
+/* SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. It is NOT secure to use
+ * it with static keys. See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith,
+ * C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016. Extended version available at:
+ * http://eprint.iacr.org/2016/859 */
+
+/* Generation of Bob's secret key
+ * Outputs random value in [0, 2^Floor(Log(2,3^137)) - 1] to be used as Bob's private key */
+#define random_mod_order_B S2N_SIKE_P434_R3_NAMESPACE(random_mod_order_B)
+int random_mod_order_B(unsigned char* random_digits);
+
+/* Alice's ephemeral public key generation
+ * Input: a private key PrivateKeyA in the range [0, 2^216 - 1], stored in 27 bytes.
+ * Output: the public key PublicKeyA consisting of 3 GF(p434^2) elements encoded in 330 bytes. */
+#define EphemeralKeyGeneration_A S2N_SIKE_P434_R3_NAMESPACE(EphemeralKeyGeneration_A)
+int EphemeralKeyGeneration_A(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA);
+
+/* Bob's ephemeral key-pair generation
+ * It produces a private key PrivateKeyB and computes the public key PublicKeyB.
+ * The private key is an integer in the range [0, 2^Floor(Log(2,3^137)) - 1], stored in 28 bytes.
+ * The public key consists of 3 GF(p434^2) elements encoded in 330 bytes. */
+#define EphemeralKeyGeneration_B S2N_SIKE_P434_R3_NAMESPACE(EphemeralKeyGeneration_B)
+int EphemeralKeyGeneration_B(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB);
+
+/* Alice's ephemeral shared secret computation
+ * It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB
+ * Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^216 - 1], stored in 27 bytes.
+ * Bob's PublicKeyB consists of 3 GF(p434^2) elements encoded in 330 bytes.
+ * Output: a shared secret SharedSecretA that consists of one element in GF(p434^2) encoded in 110 bytes. */
+#define EphemeralSecretAgreement_A S2N_SIKE_P434_R3_NAMESPACE(EphemeralSecretAgreement_A)
+int EphemeralSecretAgreement_A(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB, unsigned char* SharedSecretA);
+
+/* Bob's ephemeral shared secret computation
+ * It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA
+ * Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^137)) - 1], stored in 28 bytes.
+ * Alice's PublicKeyA consists of 3 GF(p434^2) elements encoded in 330 bytes.
+ * Output: a shared secret SharedSecretB that consists of one element in GF(p434^2) encoded in 110 bytes. */
+#define EphemeralSecretAgreement_B S2N_SIKE_P434_R3_NAMESPACE(EphemeralSecretAgreement_B)
+int EphemeralSecretAgreement_B(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA, unsigned char* SharedSecretB);
+
+/* Encoding of keys for KEX-based isogeny system "SIDHp434" (wire format):
+ *
+ * Elements over GF(p434) are encoded in 55 octets in little endian format (i.e., the
+ * least significant octet is located in the lowest memory address). Elements (a+b*i)
+ * over GF(p434^2), where a and b are defined over GF(p434), are encoded as {a, b}, with
+ * a in the lowest memory portion.
+ *
+ * Private keys PrivateKeyA and PrivateKeyB can have values in the range [0, 2^216-1] and
+ * [0, 2^Floor(Log(2,3^137)) - 1], resp. In the SIDH API, Alice's and Bob's private keys
+ * are encoded in 27 and 28 octets, resp., in little endian format. Public keys PublicKeyA
+ * and PublicKeyB consist of 3 elements in GF(p434^2). In the SIDH API, they are encoded in
+ * 330 octets. Shared keys SharedSecretA and SharedSecretB consist of one element in GF(p434^2).
+ * In the SIDH API, they are encoded in 110 octets. */
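
The octet counts quoted in the two encoding notes above follow from the macros in
sikep434r3.h; a sketch of the arithmetic as compile-time checks (the CRYPTO_* names are
illustrative here, not macros defined by this header, and the include path is an
assumption):

#include "sikep434r3.h"

#define CRYPTO_PUBLICKEYBYTES  (3 * S2N_SIKE_P434_R3_FP2_ENCODED_BYTES)             /* 3*110     = 330 */
#define CRYPTO_SECRETKEYBYTES  (S2N_SIKE_P434_R3_MSG_BYTES + \
                                S2N_SIKE_P434_R3_SECRETKEY_B_BYTES + \
                                CRYPTO_PUBLICKEYBYTES)                              /* 16+28+330 = 374 */
#define CRYPTO_CIPHERTEXTBYTES (CRYPTO_PUBLICKEYBYTES + S2N_SIKE_P434_R3_MSG_BYTES) /* 330+16    = 346 */

_Static_assert(S2N_SIKE_P434_R3_FP2_ENCODED_BYTES == 110, "GF(p434^2) element: 110 octets");
_Static_assert(CRYPTO_PUBLICKEYBYTES  == 330, "pk: 330 octets");
_Static_assert(CRYPTO_SECRETKEYBYTES  == 374, "sk: 374 octets");
_Static_assert(CRYPTO_CIPHERTEXTBYTES == 346, "ct: 346 octets");
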
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.c
new file mode 100644
index 0000000000..e5ae4e7c7e
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.c
@@ -0,0 +1,348 @@
+/********************************************************************************************
+* Supersingular Isogeny Key Encapsulation Library
+*
+* Abstract: elliptic curve and isogeny functions
+*********************************************************************************************/
+
+#include "sikep434r3.h"
+#include "sikep434r3_fpx.h"
+#include "sikep434r3_ec_isogeny.h"
+
+/* Doubling of a Montgomery point in projective coordinates (X:Z).
+ * Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C.
+ * Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2). */
+void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24)
+{
+ f2elm_t _t0, _t1;
+ f2elm_t *t0=&_t0, *t1=&_t1;
+
+ mp2_sub_p2(&P->X, &P->Z, t0); /* t0 = X1-Z1 */
+ mp2_add(&P->X, &P->Z, t1); /* t1 = X1+Z1 */
+ fp2sqr_mont(t0, t0); /* t0 = (X1-Z1)^2 */
+ fp2sqr_mont(t1, t1); /* t1 = (X1+Z1)^2 */
+ fp2mul_mont(C24, t0, &Q->Z); /* Z2 = C24*(X1-Z1)^2 */
+ fp2mul_mont(t1, &Q->Z, &Q->X); /* X2 = C24*(X1-Z1)^2*(X1+Z1)^2 */
+ mp2_sub_p2(t1, t0, t1); /* t1 = (X1+Z1)^2-(X1-Z1)^2 */
+ fp2mul_mont(A24plus, t1, t0); /* t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] */
+ mp2_add(&Q->Z, t0, &Q->Z); /* Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2 */
+ fp2mul_mont(&Q->Z, t1, &Q->Z); /* Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2] */
+}
+
+/* Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings.
+ * Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C.
+ * Output: projective Montgomery x-coordinates Q <- (2^e)*P. */
+void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24, const int e)
+{
+ int i;
+
+ copy_words((const digit_t*)P, (digit_t*)Q, 2*2*S2N_SIKE_P434_R3_NWORDS_FIELD);
+
+ for (i = 0; i < e; i++) {
+ xDBL(Q, Q, A24plus, C24);
+ }
+}
+
+/* Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4.
+ * Input: projective point of order four P = (X4:Z4).
+ * Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients
+ * that are used to evaluate the isogeny at a point in eval_4_isog(). */
+void get_4_isog(const point_proj_t P, f2elm_t *A24plus, f2elm_t *C24, f2elm_t *coeff)
+{
+ mp2_sub_p2(&P->X, &P->Z, &coeff[1]); /* coeff[1] = X4-Z4 */
+ mp2_add(&P->X, &P->Z, &coeff[2]); /* coeff[2] = X4+Z4 */
+ fp2sqr_mont(&P->Z, &coeff[0]); /* coeff[0] = Z4^2 */
+ mp2_add(&coeff[0], &coeff[0], &coeff[0]); /* coeff[0] = 2*Z4^2 */
+ fp2sqr_mont(&coeff[0], C24); /* C24 = 4*Z4^4 */
+ mp2_add(&coeff[0], &coeff[0], &coeff[0]); /* coeff[0] = 4*Z4^2 */
+ fp2sqr_mont(&P->X, A24plus); /* A24plus = X4^2 */
+ mp2_add(A24plus, A24plus, A24plus); /* A24plus = 2*X4^2 */
+ fp2sqr_mont(A24plus, A24plus); /* A24plus = 4*X4^4 */
+}
+
+/* Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined
+ * by the 3 coefficients in coeff (computed in the function get_4_isog()).
+ * Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z).
+ * Output: the projective point P = phi(P) = (X:Z) in the codomain. */
+void eval_4_isog(point_proj_t P, f2elm_t *coeff)
+{
+ f2elm_t _t0, _t1;
+ f2elm_t *t0=&_t0, *t1=&_t1;
+
+ mp2_add(&P->X, &P->Z, t0); /* t0 = X+Z */
+ mp2_sub_p2(&P->X, &P->Z, t1); /* t1 = X-Z */
+ fp2mul_mont(t0, &coeff[1], &P->X); /* X = (X+Z)*coeff[1] */
+ fp2mul_mont(t1, &coeff[2], &P->Z); /* Z = (X-Z)*coeff[2] */
+ fp2mul_mont(t0, t1, t0); /* t0 = (X+Z)*(X-Z) */
+ fp2mul_mont(&coeff[0], t0, t0); /* t0 = coeff[0]*(X+Z)*(X-Z) */
+ mp2_add(&P->X, &P->Z, t1); /* t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1] */
+ mp2_sub_p2(&P->X, &P->Z, &P->Z); /* Z = (X-Z)*coeff[2] - (X+Z)*coeff[1] */
+ fp2sqr_mont(t1, t1); /* t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 */
+ fp2sqr_mont(&P->Z, &P->Z); /* Z = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 */
+ mp2_add(t1, t0, &P->X); /* X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 */
+ mp2_sub_p2(&P->Z, t0, t0); /* t0 = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z) */
+ fp2mul_mont(&P->X, t1, &P->X); /* Xfinal */
+ fp2mul_mont(&P->Z, t0, &P->Z); /* Zfinal */
+}
+
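+/* Usage sketch (the variable names R, S and e are assumptions, not taken from this file):
+ * a 2^e-isogeny walk repeatedly doubles a kernel generator down to a point of exact
+ * order 4, extracts the 4-isogeny, and pushes the running point through it:
+ *
+ *     f2elm_t coeff[3];
+ *     xDBLe(R, S, &A24plus, &C24, e - 2);    // S <- [2^(e-2)]R has exact order 4
+ *     get_4_isog(S, &A24plus, &C24, coeff);  // constants of the image curve
+ *     eval_4_isog(R, coeff);                 // carry R to the image curve
+ */
+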
+/* Tripling of a Montgomery point in projective coordinates (X:Z).
+ * Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
+ * Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3). */
+void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus)
+{
+ f2elm_t _t0, _t1, _t2, _t3, _t4, _t5, _t6;
+ f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3, *t4=&_t4, *t5=&_t5, *t6=&_t6;
+
+ mp2_sub_p2(&P->X, &P->Z, t0); /* t0 = X-Z */
+ fp2sqr_mont(t0, t2); /* t2 = (X-Z)^2 */
+ mp2_add(&P->X, &P->Z, t1); /* t1 = X+Z */
+ fp2sqr_mont(t1, t3); /* t3 = (X+Z)^2 */
+ mp2_add(&P->X, &P->X, t4); /* t4 = 2*X */
+ mp2_add(&P->Z, &P->Z, t0); /* t0 = 2*Z */
+ fp2sqr_mont(t4, t1); /* t1 = 4*X^2 */
+ mp2_sub_p2(t1, t3, t1); /* t1 = 4*X^2 - (X+Z)^2 */
+ mp2_sub_p2(t1, t2, t1); /* t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2 */
+ fp2mul_mont(A24plus, t3, t5); /* t5 = A24plus*(X+Z)^2 */
+ fp2mul_mont(t3, t5, t3); /* t3 = A24plus*(X+Z)^4 */
+ fp2mul_mont(A24minus, t2, t6); /* t6 = A24minus*(X-Z)^2 */
+ fp2mul_mont(t2, t6, t2); /* t2 = A24minus*(X-Z)^4 */
+ mp2_sub_p2(t2, t3, t3); /* t3 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4 */
+ mp2_sub_p2(t5, t6, t2); /* t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2 */
+ fp2mul_mont(t1, t2, t1); /* t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] */
+ fp2add(t3, t1, t2); /* t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^4 - A24plus*(X+Z)^4 */
+ fp2sqr_mont(t2, t2); /* t2 = t2^2 */
+ fp2mul_mont(t4, t2, &Q->X); /* X3 = 2*X*t2 */
+ fp2sub(t3, t1, t1); /* t1 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] */
+ fp2sqr_mont(t1, t1); /* t1 = t1^2 */
+ fp2mul_mont(t0, t1, &Q->Z); /* Z3 = 2*Z*t1 */
+}
+
+/* Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings.
+ * Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
+ * Output: projective Montgomery x-coordinates Q <- (3^e)*P. */
+void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus, const int e)
+{
+ int i;
+
+ copy_words((const digit_t*)P, (digit_t*)Q, 2*2*S2N_SIKE_P434_R3_NWORDS_FIELD);
+
+ for (i = 0; i < e; i++) {
+ xTPL(Q, Q, A24minus, A24plus);
+ }
+}
+
+/* Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3.
+ * Input: projective point of order three P = (X3:Z3).
+ * Output: the 3-isogenous Montgomery curve with projective coefficients A24minus = A-2C and A24plus = A+2C,
+ * and the 2 coefficients that are used to evaluate the isogeny at a point in eval_3_isog(). */
+void get_3_isog(const point_proj_t P, f2elm_t *A24minus, f2elm_t *A24plus, f2elm_t *coeff)
+{
+ f2elm_t _t0, _t1, _t2, _t3, _t4;
+ f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3, *t4=&_t4;
+
+ mp2_sub_p2(&P->X, &P->Z, &coeff[0]); /* coeff0 = X-Z */
+ fp2sqr_mont(&coeff[0], t0); /* t0 = (X-Z)^2 */
+ mp2_add(&P->X, &P->Z, &coeff[1]); /* coeff1 = X+Z */
+ fp2sqr_mont(&coeff[1], t1); /* t1 = (X+Z)^2 */
+ mp2_add(&P->X, &P->X, t3); /* t3 = 2*X */
+ fp2sqr_mont(t3, t3); /* t3 = 4*X^2 */
+ fp2sub(t3, t0, t2); /* t2 = 4*X^2 - (X-Z)^2 */
+ fp2sub(t3, t1, t3); /* t3 = 4*X^2 - (X+Z)^2 */
+ mp2_add(t0, t3, t4); /* t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2 */
+ mp2_add(t4, t4, t4); /* t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2) */
+ mp2_add(t1, t4, t4); /* t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 */
+ fp2mul_mont(t2, t4, A24minus); /* A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] */
+ mp2_add(t1, t2, t4); /* t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2 */
+ mp2_add(t4, t4, t4); /* t4 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2) */
+ mp2_add(t0, t4, t4); /* t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2 */
+ fp2mul_mont(t3, t4, A24plus); /* A24plus = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] */
+}
+
+/* Evaluates the 3-isogeny at the projective point Q = (X:Z), given the 2 coefficients
+ * in coeff that define the isogeny (computed in the function get_3_isog()).
+ * Inputs: the coefficients defining the isogeny, and the projective point Q = (X:Z).
+ * Output: the projective point Q <- phi(Q) = (X3:Z3) in the codomain. */
+void eval_3_isog(point_proj_t Q, const f2elm_t *coeff)
+{
+ f2elm_t _t0, _t1, _t2;
+ f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2;
+
+ mp2_add(&Q->X, &Q->Z, t0); /* t0 = X+Z */
+ mp2_sub_p2(&Q->X, &Q->Z, t1); /* t1 = X-Z */
+ fp2mul_mont(&coeff[0], t0, t0); /* t0 = coeff0*(X+Z) */
+ fp2mul_mont(&coeff[1], t1, t1); /* t1 = coeff1*(X-Z) */
+ mp2_add(t0, t1, t2); /* t2 = coeff0*(X+Z) + coeff1*(X-Z) */
+ mp2_sub_p2(t1, t0, t0); /* t0 = coeff1*(X-Z) - coeff0*(X+Z) */
+ fp2sqr_mont(t2, t2); /* t2 = [coeff0*(X+Z) + coeff1*(X-Z)]^2 */
+ fp2sqr_mont(t0, t0); /* t0 = [coeff1*(X-Z) - coeff0*(X+Z)]^2 */
+ fp2mul_mont(&Q->X, t2, &Q->X); /* X3final = X*[coeff0*(X+Z) + coeff1*(X-Z)]^2 */
+ fp2mul_mont(&Q->Z, t0, &Q->Z); /* Z3final = Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2 */
+}
+
+/* 3-way simultaneous inversion.
+ * Input: z1, z2, z3
+ * Output: 1/z1, 1/z2, 1/z3 (overwriting the inputs). */
+void inv_3_way(f2elm_t *z1, f2elm_t *z2, f2elm_t *z3)
+{
+ f2elm_t _t0, _t1, _t2, _t3;
+ f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3;
+
+ fp2mul_mont(z1, z2, t0); /* t0 = z1*z2 */
+ fp2mul_mont(z3, t0, t1); /* t1 = z1*z2*z3 */
+ fp2inv_mont(t1); /* t1 = 1/(z1*z2*z3) */
+ fp2mul_mont(z3, t1, t2); /* t2 = 1/(z1*z2) */
+ fp2mul_mont(t2, z2, t3); /* t3 = 1/z1 */
+ fp2mul_mont(t2, z1, z2); /* z2 = 1/z2 */
+ fp2mul_mont(t0, t1, z3); /* z3 = 1/z3 */
+ fp2copy(t3, z1); /* z1 = 1/z1 */
+}
+
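+/* This is Montgomery's simultaneous-inversion trick: one field inversion plus a few
+ * multiplications replaces three inversions. From t1 = 1/(z1*z2*z3), multiplying by z3
+ * gives t2 = 1/(z1*z2); multiplying t2 by z2 or z1 isolates 1/z1 or 1/z2; and
+ * 1/z3 = (z1*z2)*t1 reuses the cached product t0. */
+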
+/* Given the x-coordinates of P, Q, and R, returns the value A corresponding to the
+ * Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A.
+ * Input: the x-coordinates xP, xQ, and xR of the points P, Q and R.
+ * Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x. */
+void get_A(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xR, f2elm_t *A)
+{
+ f2elm_t _t0, _t1, one = {0};
+ f2elm_t *t0=&_t0, *t1=&_t1;
+
+    fpcopy((const digit_t*)&Montgomery_one, one.e[0]);
+ fp2add(xP, xQ, t1); /* t1 = xP+xQ */
+ fp2mul_mont(xP, xQ, t0); /* t0 = xP*xQ */
+ fp2mul_mont(xR, t1, A); /* A = xR*t1 */
+ fp2add(t0, A, A); /* A = A+t0 */
+ fp2mul_mont(t0, xR, t0); /* t0 = t0*xR */
+ fp2sub(A, &one, A); /* A = A-1 */
+ fp2add(t0, t0, t0); /* t0 = t0+t0 */
+ fp2add(t1, xR, t1); /* t1 = t1+xR */
+ fp2add(t0, t0, t0); /* t0 = t0+t0 */
+ fp2sqr_mont(A, A); /* A = A^2 */
+ fp2inv_mont(t0); /* t0 = 1/t0 */
+ fp2mul_mont(A, t0, A); /* A = A*t0 */
+ fp2sub(A, t1, A); /* Afinal = A-t1 */
+}
+
+/* Computes the j-invariant of a Montgomery curve with projective constant.
+ * Input: A,C in GF(p^2).
+ * Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery curve
+ * B*y^2=x^3+(A/C)*x^2+x or (equivalently) j-invariant of B'*y^2=C*x^3+A*x^2+C*x. */
+void j_inv(const f2elm_t *A, const f2elm_t *C, f2elm_t *jinv)
+{
+ f2elm_t _t0, _t1;
+ f2elm_t *t0=&_t0, *t1=&_t1;
+
+ fp2sqr_mont(A, jinv); /* jinv = A^2 */
+ fp2sqr_mont(C, t1); /* t1 = C^2 */
+ fp2add(t1, t1, t0); /* t0 = t1+t1 */
+ fp2sub(jinv, t0, t0); /* t0 = jinv-t0 */
+ fp2sub(t0, t1, t0); /* t0 = t0-t1 */
+ fp2sub(t0, t1, jinv); /* jinv = t0-t1 */
+ fp2sqr_mont(t1, t1); /* t1 = t1^2 */
+ fp2mul_mont(jinv, t1, jinv); /* jinv = jinv*t1 */
+ fp2add(t0, t0, t0); /* t0 = t0+t0 */
+ fp2add(t0, t0, t0); /* t0 = t0+t0 */
+ fp2sqr_mont(t0, t1); /* t1 = t0^2 */
+ fp2mul_mont(t0, t1, t0); /* t0 = t0*t1 */
+ fp2add(t0, t0, t0); /* t0 = t0+t0 */
+ fp2add(t0, t0, t0); /* t0 = t0+t0 */
+ fp2inv_mont(jinv); /* jinv = 1/jinv */
+ fp2mul_mont(jinv, t0, jinv); /* jinv = t0*jinv */
+}
+
+/* Simultaneous doubling and differential addition.
+ * Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ,
+ * affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4.
+ * Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P,
+ * and Q <- P+Q = (XQP:ZQP) such that x(Q+P)=XQP/ZQP. */
+static void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t *xPQ, const f2elm_t *A24)
+{
+ f2elm_t _t0, _t1, _t2;
+ f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2;
+
+ mp2_add(&P->X, &P->Z, t0); /* t0 = XP+ZP */
+ mp2_sub_p2(&P->X, &P->Z, t1); /* t1 = XP-ZP */
+ fp2sqr_mont(t0, &P->X); /* XP = (XP+ZP)^2 */
+ mp2_sub_p2(&Q->X, &Q->Z, t2); /* t2 = XQ-ZQ */
+ mp2_add(&Q->X, &Q->Z, &Q->X); /* XQ = XQ+ZQ */
+ fp2mul_mont(t0, t2, t0); /* t0 = (XP+ZP)*(XQ-ZQ) */
+ fp2sqr_mont(t1, &P->Z); /* ZP = (XP-ZP)^2 */
+ fp2mul_mont(t1, &Q->X, t1); /* t1 = (XP-ZP)*(XQ+ZQ) */
+ mp2_sub_p2(&P->X, &P->Z, t2); /* t2 = (XP+ZP)^2-(XP-ZP)^2 */
+ fp2mul_mont(&P->X, &P->Z, &P->X); /* XP = (XP+ZP)^2*(XP-ZP)^2 */
+ fp2mul_mont(A24, t2, &Q->X); /* XQ = A24*[(XP+ZP)^2-(XP-ZP)^2] */
+ mp2_sub_p2(t0, t1, &Q->Z); /* ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ) */
+ mp2_add(&Q->X, &P->Z, &P->Z); /* ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2 */
+ mp2_add(t0, t1, &Q->X); /* XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ) */
+ fp2mul_mont(&P->Z, t2, &P->Z); /* ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2] */
+ fp2sqr_mont(&Q->Z, &Q->Z); /* ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 */
+ fp2sqr_mont(&Q->X, &Q->X); /* XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2 */
+ fp2mul_mont(&Q->Z, xPQ, &Q->Z); /* ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 */
+}
+
+/* Constant-time point swap.
+ * If option = 0 then P <- P and Q <- Q; if option = 0xFF...FF then P <- Q and Q <- P.
+ * The XOR-and-mask form avoids any branch on the secret value of option. */
+static void swap_points(point_proj_t P, point_proj_t Q, const digit_t option)
+{
+ unsigned int i;
+
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ digit_t temp = option & (P->X.e[0][i] ^ Q->X.e[0][i]);
+ P->X.e[0][i] = temp ^ P->X.e[0][i];
+ Q->X.e[0][i] = temp ^ Q->X.e[0][i];
+ temp = option & (P->X.e[1][i] ^ Q->X.e[1][i]);
+ P->X.e[1][i] = temp ^ P->X.e[1][i];
+ Q->X.e[1][i] = temp ^ Q->X.e[1][i];
+ temp = option & (P->Z.e[0][i] ^ Q->Z.e[0][i]);
+ P->Z.e[0][i] = temp ^ P->Z.e[0][i];
+ Q->Z.e[0][i] = temp ^ Q->Z.e[0][i];
+ temp = option & (P->Z.e[1][i] ^ Q->Z.e[1][i]);
+ P->Z.e[1][i] = temp ^ P->Z.e[1][i];
+ Q->Z.e[1][i] = temp ^ Q->Z.e[1][i];
+ }
+}
+
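+/* The 3-point Montgomery ladder.
+ * Input: affine x-coordinates xP, xQ and xPQ = x(P-Q), the scalar m (a private key),
+ * and the Montgomery curve constant A.
+ * Output: the projective point R = (X:Z) such that x(P + m*Q) = X/Z. */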
+void LADDER3PT(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xPQ, const digit_t* m,
+ const unsigned int AliceOrBob, point_proj_t R, const f2elm_t *A)
+{
+ point_proj_t R0 = {0}, R2 = {0};
+ f2elm_t _A24 = {0};
+ f2elm_t *A24 = &_A24;
+ digit_t mask;
+ int i, nbits, swap, prevbit = 0;
+
+ if (AliceOrBob == S2N_SIKE_P434_R3_ALICE) {
+ nbits = S2N_SIKE_P434_R3_OALICE_BITS;
+ } else {
+ nbits = S2N_SIKE_P434_R3_OBOB_BITS - 1;
+ }
+
+ /* Initializing constant */
+ fpcopy((const digit_t*)&Montgomery_one, A24->e[0]);
+ mp2_add(A24, A24, A24);
+ mp2_add(A, A24, A24);
+ fp2div2(A24, A24);
+ fp2div2(A24, A24); /* A24 = (A+2)/4 */
+
+ /* Initializing points */
+ fp2copy(xQ, &R0->X);
+ fpcopy((const digit_t*)&Montgomery_one, (digit_t*)&R0->Z);
+ fp2copy(xPQ, &R2->X);
+ fpcopy((const digit_t*)&Montgomery_one, (digit_t*)&R2->Z);
+ fp2copy(xP, &R->X);
+ fpcopy((const digit_t*)&Montgomery_one, (digit_t*)&R->Z);
+ fpzero((digit_t*)(R->Z.e)[1]);
+
+ /* Main loop */
+ for (i = 0; i < nbits; i++) {
+ int bit = (m[i >> S2N_SIKE_P434_R3_LOG2RADIX] >> (i & (S2N_SIKE_P434_R3_RADIX-1))) & 1;
+ swap = bit ^ prevbit;
+ prevbit = bit;
+ mask = 0 - (digit_t)swap;
+
+ swap_points(R, R2, mask);
+ xDBLADD(R0, R2, &R->X, A24);
+ fp2mul_mont(&R2->X, &R->Z, &R2->X);
+ }
+ swap = 0 ^ prevbit;
+ mask = 0 - (digit_t)swap;
+ swap_points(R, R2, mask);
+}
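+
+/* Note on the scalar-bit extraction above, assuming 64-bit digits
+ * (S2N_SIKE_P434_R3_LOG2RADIX = 6, S2N_SIKE_P434_R3_RADIX = 64): for i = 70 the loop
+ * reads bit 70 & 63 = 6 of digit m[70 >> 6] = m[1]. The mask 0 - swap is all-ones
+ * exactly when the current bit differs from the previous one, which is what lets
+ * swap_points() act as a constant-time conditional swap. */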
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.h
new file mode 100644
index 0000000000..44245ec726
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.h
@@ -0,0 +1,46 @@
+/********************************************************************************************
+* Supersingular Isogeny Key Encapsulation Library
+*
+* Abstract: elliptic curve and isogeny functions
+*********************************************************************************************/
+
+#pragma once
+
+#include "sikep434r3.h"
+
+#define xDBL S2N_SIKE_P434_R3_NAMESPACE(xDBL)
+void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24);
+
+#define xDBLe S2N_SIKE_P434_R3_NAMESPACE(xDBLe)
+void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24, const int e);
+
+#define get_4_isog S2N_SIKE_P434_R3_NAMESPACE(get_4_isog)
+void get_4_isog(const point_proj_t P, f2elm_t *A24plus, f2elm_t *C24, f2elm_t *coeff);
+
+#define eval_4_isog S2N_SIKE_P434_R3_NAMESPACE(eval_4_isog)
+void eval_4_isog(point_proj_t P, f2elm_t* coeff);
+
+#define xTPL S2N_SIKE_P434_R3_NAMESPACE(xTPL)
+void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus);
+
+#define xTPLe S2N_SIKE_P434_R3_NAMESPACE(xTPLe)
+void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus, const int e);
+
+#define get_3_isog S2N_SIKE_P434_R3_NAMESPACE(get_3_isog)
+void get_3_isog(const point_proj_t P, f2elm_t *A24minus, f2elm_t *A24plus, f2elm_t *coeff);
+
+#define eval_3_isog S2N_SIKE_P434_R3_NAMESPACE(eval_3_isog)
+void eval_3_isog(point_proj_t Q, const f2elm_t *coeff);
+
+#define inv_3_way S2N_SIKE_P434_R3_NAMESPACE(inv_3_way)
+void inv_3_way(f2elm_t *z1, f2elm_t *z2, f2elm_t *z3);
+
+#define get_A S2N_SIKE_P434_R3_NAMESPACE(get_A)
+void get_A(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xR, f2elm_t *A);
+
+#define j_inv S2N_SIKE_P434_R3_NAMESPACE(j_inv)
+void j_inv(const f2elm_t *A, const f2elm_t *C, f2elm_t *jinv);
+
+#define LADDER3PT S2N_SIKE_P434_R3_NAMESPACE(LADDER3PT)
+void LADDER3PT(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xPQ, const digit_t *m,
+ const unsigned int AliceOrBob, point_proj_t R, const f2elm_t *A);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.c
new file mode 100644
index 0000000000..413cb2b8e4
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.c
@@ -0,0 +1,417 @@
+/********************************************************************************************
+* SHA3-derived function SHAKE
+*
+* Based on the public domain implementation in crypto_hash/keccakc512/simple/
+* from http://bench.cr.yp.to/supercop.html by Ronny Van Keer
+* and the public domain "TweetFips202" implementation from https://twitter.com/tweetfips202
+* by Gilles Van Assche, Daniel J. Bernstein, and Peter Schwabe
+*
+* SHAKE is specified in NIST FIPS PUB 202:
+* https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+*
+*********************************************************************************************/
+
+#include <stdint.h>
+#include <stddef.h>
+#include "sikep434r3.h"
+#include "sikep434r3_fips202.h"
+
+#define NROUNDS 24
+#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset))))
+
+/*************************************************
+ * Name: load64
+ *
+ * Description: Load 8 bytes into uint64_t in little-endian order
+ *
+ * Arguments: - const uint8_t *x: pointer to input byte array
+ *
+ * Returns the loaded 64-bit unsigned integer
+ **************************************************/
+static uint64_t load64(const uint8_t *x) {
+ uint64_t r = 0;
+ for (size_t i = 0; i < 8; ++i) {
+ r |= (uint64_t)x[i] << 8 * i;
+ }
+
+ return r;
+}
+
+/*************************************************
+ * Name: store64
+ *
+ * Description: Store a 64-bit integer to a byte array in little-endian order
+ *
+ * Arguments: - uint8_t *x: pointer to the output byte array
+ * - uint64_t u: input 64-bit unsigned integer
+ **************************************************/
+static void store64(uint8_t *x, uint64_t u) {
+ for (size_t i = 0; i < 8; ++i) {
+ x[i] = (uint8_t) (u >> 8 * i);
+ }
+}
+
+static const uint64_t KeccakF_RoundConstants[NROUNDS] = {
+ (uint64_t)0x0000000000000001ULL,
+ (uint64_t)0x0000000000008082ULL,
+ (uint64_t)0x800000000000808aULL,
+ (uint64_t)0x8000000080008000ULL,
+ (uint64_t)0x000000000000808bULL,
+ (uint64_t)0x0000000080000001ULL,
+ (uint64_t)0x8000000080008081ULL,
+ (uint64_t)0x8000000000008009ULL,
+ (uint64_t)0x000000000000008aULL,
+ (uint64_t)0x0000000000000088ULL,
+ (uint64_t)0x0000000080008009ULL,
+ (uint64_t)0x000000008000000aULL,
+ (uint64_t)0x000000008000808bULL,
+ (uint64_t)0x800000000000008bULL,
+ (uint64_t)0x8000000000008089ULL,
+ (uint64_t)0x8000000000008003ULL,
+ (uint64_t)0x8000000000008002ULL,
+ (uint64_t)0x8000000000000080ULL,
+ (uint64_t)0x000000000000800aULL,
+ (uint64_t)0x800000008000000aULL,
+ (uint64_t)0x8000000080008081ULL,
+ (uint64_t)0x8000000000008080ULL,
+ (uint64_t)0x0000000080000001ULL,
+ (uint64_t)0x8000000080008008ULL,
+};
+
+static void KeccakF1600_StatePermute(uint64_t * state)
+{
+ int round;
+ uint64_t Aba, Abe, Abi, Abo, Abu;
+ uint64_t Aga, Age, Agi, Ago, Agu;
+ uint64_t Aka, Ake, Aki, Ako, Aku;
+ uint64_t Ama, Ame, Ami, Amo, Amu;
+ uint64_t Asa, Ase, Asi, Aso, Asu;
+
+ /* copyFromState(A, state) */
+ Aba = state[ 0];
+ Abe = state[ 1];
+ Abi = state[ 2];
+ Abo = state[ 3];
+ Abu = state[ 4];
+ Aga = state[ 5];
+ Age = state[ 6];
+ Agi = state[ 7];
+ Ago = state[ 8];
+ Agu = state[ 9];
+ Aka = state[10];
+ Ake = state[11];
+ Aki = state[12];
+ Ako = state[13];
+ Aku = state[14];
+ Ama = state[15];
+ Ame = state[16];
+ Ami = state[17];
+ Amo = state[18];
+ Amu = state[19];
+ Asa = state[20];
+ Ase = state[21];
+ Asi = state[22];
+ Aso = state[23];
+ Asu = state[24];
+
+    for (round = 0; round < NROUNDS; round += 2) {
+ uint64_t BCa, BCe, BCi, BCo, BCu;
+ uint64_t Da, De, Di, Do, Du;
+ uint64_t Eba, Ebe, Ebi, Ebo, Ebu;
+ uint64_t Ega, Ege, Egi, Ego, Egu;
+ uint64_t Eka, Eke, Eki, Eko, Eku;
+ uint64_t Ema, Eme, Emi, Emo, Emu;
+ uint64_t Esa, Ese, Esi, Eso, Esu;
+
+ /* prepareTheta */
+ BCa = Aba^Aga^Aka^Ama^Asa;
+ BCe = Abe^Age^Ake^Ame^Ase;
+ BCi = Abi^Agi^Aki^Ami^Asi;
+ BCo = Abo^Ago^Ako^Amo^Aso;
+ BCu = Abu^Agu^Aku^Amu^Asu;
+
+ /* thetaRhoPiChiIotaPrepareTheta(round , A, E) */
+ Da = BCu^ROL(BCe, 1);
+ De = BCa^ROL(BCi, 1);
+ Di = BCe^ROL(BCo, 1);
+ Do = BCi^ROL(BCu, 1);
+ Du = BCo^ROL(BCa, 1);
+
+ Aba ^= Da;
+ BCa = Aba;
+ Age ^= De;
+ BCe = ROL(Age, 44);
+ Aki ^= Di;
+ BCi = ROL(Aki, 43);
+ Amo ^= Do;
+ BCo = ROL(Amo, 21);
+ Asu ^= Du;
+ BCu = ROL(Asu, 14);
+ Eba = BCa ^((~BCe)& BCi );
+ Eba ^= (uint64_t)KeccakF_RoundConstants[round];
+ Ebe = BCe ^((~BCi)& BCo );
+ Ebi = BCi ^((~BCo)& BCu );
+ Ebo = BCo ^((~BCu)& BCa );
+ Ebu = BCu ^((~BCa)& BCe );
+
+ Abo ^= Do;
+ BCa = ROL(Abo, 28);
+ Agu ^= Du;
+ BCe = ROL(Agu, 20);
+ Aka ^= Da;
+ BCi = ROL(Aka, 3);
+ Ame ^= De;
+ BCo = ROL(Ame, 45);
+ Asi ^= Di;
+ BCu = ROL(Asi, 61);
+ Ega = BCa ^((~BCe)& BCi );
+ Ege = BCe ^((~BCi)& BCo );
+ Egi = BCi ^((~BCo)& BCu );
+ Ego = BCo ^((~BCu)& BCa );
+ Egu = BCu ^((~BCa)& BCe );
+
+ Abe ^= De;
+ BCa = ROL(Abe, 1);
+ Agi ^= Di;
+ BCe = ROL(Agi, 6);
+ Ako ^= Do;
+ BCi = ROL(Ako, 25);
+ Amu ^= Du;
+ BCo = ROL(Amu, 8);
+ Asa ^= Da;
+ BCu = ROL(Asa, 18);
+ Eka = BCa ^((~BCe)& BCi );
+ Eke = BCe ^((~BCi)& BCo );
+ Eki = BCi ^((~BCo)& BCu );
+ Eko = BCo ^((~BCu)& BCa );
+ Eku = BCu ^((~BCa)& BCe );
+
+ Abu ^= Du;
+ BCa = ROL(Abu, 27);
+ Aga ^= Da;
+ BCe = ROL(Aga, 36);
+ Ake ^= De;
+ BCi = ROL(Ake, 10);
+ Ami ^= Di;
+ BCo = ROL(Ami, 15);
+ Aso ^= Do;
+ BCu = ROL(Aso, 56);
+ Ema = BCa ^((~BCe)& BCi );
+ Eme = BCe ^((~BCi)& BCo );
+ Emi = BCi ^((~BCo)& BCu );
+ Emo = BCo ^((~BCu)& BCa );
+ Emu = BCu ^((~BCa)& BCe );
+
+ Abi ^= Di;
+ BCa = ROL(Abi, 62);
+ Ago ^= Do;
+ BCe = ROL(Ago, 55);
+ Aku ^= Du;
+ BCi = ROL(Aku, 39);
+ Ama ^= Da;
+ BCo = ROL(Ama, 41);
+ Ase ^= De;
+ BCu = ROL(Ase, 2);
+ Esa = BCa ^((~BCe)& BCi );
+ Ese = BCe ^((~BCi)& BCo );
+ Esi = BCi ^((~BCo)& BCu );
+ Eso = BCo ^((~BCu)& BCa );
+ Esu = BCu ^((~BCa)& BCe );
+
+ /* prepareTheta */
+ BCa = Eba^Ega^Eka^Ema^Esa;
+ BCe = Ebe^Ege^Eke^Eme^Ese;
+ BCi = Ebi^Egi^Eki^Emi^Esi;
+ BCo = Ebo^Ego^Eko^Emo^Eso;
+ BCu = Ebu^Egu^Eku^Emu^Esu;
+
+ /* thetaRhoPiChiIotaPrepareTheta(round+1, E, A) */
+ Da = BCu^ROL(BCe, 1);
+ De = BCa^ROL(BCi, 1);
+ Di = BCe^ROL(BCo, 1);
+ Do = BCi^ROL(BCu, 1);
+ Du = BCo^ROL(BCa, 1);
+
+ Eba ^= Da;
+ BCa = Eba;
+ Ege ^= De;
+ BCe = ROL(Ege, 44);
+ Eki ^= Di;
+ BCi = ROL(Eki, 43);
+ Emo ^= Do;
+ BCo = ROL(Emo, 21);
+ Esu ^= Du;
+ BCu = ROL(Esu, 14);
+ Aba = BCa ^((~BCe)& BCi );
+ Aba ^= (uint64_t)KeccakF_RoundConstants[round+1];
+ Abe = BCe ^((~BCi)& BCo );
+ Abi = BCi ^((~BCo)& BCu );
+ Abo = BCo ^((~BCu)& BCa );
+ Abu = BCu ^((~BCa)& BCe );
+
+ Ebo ^= Do;
+ BCa = ROL(Ebo, 28);
+ Egu ^= Du;
+ BCe = ROL(Egu, 20);
+ Eka ^= Da;
+ BCi = ROL(Eka, 3);
+ Eme ^= De;
+ BCo = ROL(Eme, 45);
+ Esi ^= Di;
+ BCu = ROL(Esi, 61);
+ Aga = BCa ^((~BCe)& BCi );
+ Age = BCe ^((~BCi)& BCo );
+ Agi = BCi ^((~BCo)& BCu );
+ Ago = BCo ^((~BCu)& BCa );
+ Agu = BCu ^((~BCa)& BCe );
+
+ Ebe ^= De;
+ BCa = ROL(Ebe, 1);
+ Egi ^= Di;
+ BCe = ROL(Egi, 6);
+ Eko ^= Do;
+ BCi = ROL(Eko, 25);
+ Emu ^= Du;
+ BCo = ROL(Emu, 8);
+ Esa ^= Da;
+ BCu = ROL(Esa, 18);
+ Aka = BCa ^((~BCe)& BCi );
+ Ake = BCe ^((~BCi)& BCo );
+ Aki = BCi ^((~BCo)& BCu );
+ Ako = BCo ^((~BCu)& BCa );
+ Aku = BCu ^((~BCa)& BCe );
+
+ Ebu ^= Du;
+ BCa = ROL(Ebu, 27);
+ Ega ^= Da;
+ BCe = ROL(Ega, 36);
+ Eke ^= De;
+ BCi = ROL(Eke, 10);
+ Emi ^= Di;
+ BCo = ROL(Emi, 15);
+ Eso ^= Do;
+ BCu = ROL(Eso, 56);
+ Ama = BCa ^((~BCe)& BCi );
+ Ame = BCe ^((~BCi)& BCo );
+ Ami = BCi ^((~BCo)& BCu );
+ Amo = BCo ^((~BCu)& BCa );
+ Amu = BCu ^((~BCa)& BCe );
+
+ Ebi ^= Di;
+ BCa = ROL(Ebi, 62);
+ Ego ^= Do;
+ BCe = ROL(Ego, 55);
+ Eku ^= Du;
+ BCi = ROL(Eku, 39);
+ Ema ^= Da;
+ BCo = ROL(Ema, 41);
+ Ese ^= De;
+ BCu = ROL(Ese, 2);
+ Asa = BCa ^((~BCe)& BCi );
+ Ase = BCe ^((~BCi)& BCo );
+ Asi = BCi ^((~BCo)& BCu );
+ Aso = BCo ^((~BCu)& BCa );
+ Asu = BCu ^((~BCa)& BCe );
+ }
+
+ /* copyToState(state, A) */
+ state[ 0] = Aba;
+ state[ 1] = Abe;
+ state[ 2] = Abi;
+ state[ 3] = Abo;
+ state[ 4] = Abu;
+ state[ 5] = Aga;
+ state[ 6] = Age;
+ state[ 7] = Agi;
+ state[ 8] = Ago;
+ state[ 9] = Agu;
+ state[10] = Aka;
+ state[11] = Ake;
+ state[12] = Aki;
+ state[13] = Ako;
+ state[14] = Aku;
+ state[15] = Ama;
+ state[16] = Ame;
+ state[17] = Ami;
+ state[18] = Amo;
+ state[19] = Amu;
+ state[20] = Asa;
+ state[21] = Ase;
+ state[22] = Asi;
+ state[23] = Aso;
+ state[24] = Asu;
+}
+
+static void keccak_absorb(uint64_t *s, unsigned int r, const unsigned char *m, unsigned long long int mlen,
+ unsigned char p)
+{
+ unsigned long long i;
+ unsigned char t[200];
+
+ while (mlen >= r) {
+        for (i = 0; i < r / 8; ++i) {
+            s[i] ^= load64(m + 8 * i);
+        }
+
+ KeccakF1600_StatePermute(s);
+ mlen -= r;
+ m += r;
+ }
+
+ for (i = 0; i < r; ++i) {
+ t[i] = 0;
+ }
+ for (i = 0; i < mlen; ++i) {
+ t[i] = m[i];
+ }
+
+ t[i] = p;
+ t[r - 1] |= 128;
+
+ for (i = 0; i < r / 8; ++i) {
+ s[i] ^= load64(t + 8 * i);
+ }
+}
+
+static void keccak_squeezeblocks(unsigned char *h, unsigned long long int nblocks, uint64_t *s, unsigned int r)
+{
+ unsigned int i;
+
+    while (nblocks > 0) {
+ KeccakF1600_StatePermute(s);
+ for (i = 0; i < (r>>3); i++) {
+ store64(h+8*i, s[i]);
+ }
+
+ h += r;
+ nblocks--;
+ }
+}
+
+void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen)
+{
+ uint64_t s[25];
+ unsigned char t[SHAKE256_RATE];
+ unsigned long long nblocks = outlen / SHAKE256_RATE;
+ size_t i;
+
+ for (i = 0; i < 25; ++i) {
+ s[i] = 0;
+ }
+
+ /* Absorb input */
+ keccak_absorb(s, SHAKE256_RATE, input, inlen, 0x1F);
+
+ /* Squeeze output */
+ keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE);
+
+ output += nblocks * SHAKE256_RATE;
+ outlen -= nblocks * SHAKE256_RATE;
+
+ if (outlen) {
+ keccak_squeezeblocks(t, 1, s, SHAKE256_RATE);
+
+ for (i = 0; i < outlen; i++) {
+ output[i] = t[i];
+ }
+ }
+}
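+
+/* Usage sketch (buffer names are illustrative, not part of the API): deriving 32 output
+ * bytes from a 16-byte input.
+ *
+ *     unsigned char seed[16] = {0};
+ *     unsigned char out[32];
+ *     shake256(out, sizeof(out), seed, sizeof(seed));  // 32 bytes of SHAKE256(seed)
+ */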
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.h
new file mode 100644
index 0000000000..9dd237a491
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.h
@@ -0,0 +1,23 @@
+/********************************************************************************************
+* SHA3-derived function SHAKE
+*
+* Based on the public domain implementation in crypto_hash/keccakc512/simple/
+* from http://bench.cr.yp.to/supercop.html by Ronny Van Keer
+* and the public domain "TweetFips202" implementation from https://twitter.com/tweetfips202
+* by Gilles Van Assche, Daniel J. Bernstein, and Peter Schwabe
+*
+* SHAKE is specified in NIST FIPS PUB 202:
+* https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+*
+*********************************************************************************************/
+
+#pragma once
+
+#include <stdint.h>
+#include "sikep434r3.h"
+
+#define SHAKE128_RATE 168
+#define SHAKE256_RATE 136
+
+#define shake256 S2N_SIKE_P434_R3_NAMESPACE(shake256)
+void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.c
new file mode 100644
index 0000000000..867ac0f6c1
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.c
@@ -0,0 +1,297 @@
+/********************************************************************************************
+* Supersingular Isogeny Key Encapsulation Library
+*
+* Abstract: modular arithmetic for P434
+*********************************************************************************************/
+
+#include "sikep434r3.h"
+#include "pq-crypto/s2n_pq.h"
+#include "sikep434r3_fp.h"
+#include "sikep434r3_fpx.h"
+#include "sikep434r3_fp_x64_asm.h"
+
+/* Multiprecision subtraction with correction with 2*p, c = a-b+2p. */
+void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c)
+{
+#if defined(S2N_SIKE_P434_R3_ASM)
+ if (s2n_sikep434r3_asm_is_enabled()) {
+ mp_sub434_p2_asm(a, b, c);
+ return;
+ }
+#endif
+
+ unsigned int i, borrow = 0;
+
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ S2N_SIKE_P434_R3_SUBC(borrow, a[i], b[i], borrow, c[i]);
+ }
+
+ borrow = 0;
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ S2N_SIKE_P434_R3_ADDC(borrow, c[i], ((const digit_t*)p434x2)[i], borrow, c[i]);
+ }
+}
+
+/* Multiprecision subtraction with correction with 4*p, c = a-b+4p. */
+void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c)
+{
+#if defined(S2N_SIKE_P434_R3_ASM)
+ if (s2n_sikep434r3_asm_is_enabled()) {
+ mp_sub434_p4_asm(a, b, c);
+ return;
+ }
+#endif
+
+ unsigned int i, borrow = 0;
+
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ S2N_SIKE_P434_R3_SUBC(borrow, a[i], b[i], borrow, c[i]);
+ }
+
+ borrow = 0;
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ S2N_SIKE_P434_R3_ADDC(borrow, c[i], ((const digit_t*)p434x4)[i], borrow, c[i]);
+ }
+}
+
+/* Modular addition, c = a+b mod p434.
+ * Inputs: a, b in [0, 2*p434-1]
+ * Output: c in [0, 2*p434-1] */
+void fpadd434(const digit_t* a, const digit_t* b, digit_t* c)
+{
+#if defined(S2N_SIKE_P434_R3_ASM)
+ if (s2n_sikep434r3_asm_is_enabled()) {
+ fpadd434_asm(a, b, c);
+ return;
+ }
+#endif
+ unsigned int i, carry = 0;
+ digit_t mask;
+
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ S2N_SIKE_P434_R3_ADDC(carry, a[i], b[i], carry, c[i]);
+ }
+
+ carry = 0;
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ S2N_SIKE_P434_R3_SUBC(carry, c[i], ((const digit_t*)p434x2)[i], carry, c[i]);
+ }
+ mask = 0 - (digit_t)carry;
+
+ carry = 0;
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ S2N_SIKE_P434_R3_ADDC(carry, c[i], ((const digit_t*)p434x2)[i] & mask, carry, c[i]);
+ }
+}
+
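+/* The correction above is branch-free: the trial subtraction of 2*p434 either succeeds
+ * or underflows, and mask = 0 - carry is all-ones exactly on underflow, so the final
+ * loop adds 2*p434 back only in that case. The same masking idiom recurs throughout
+ * this file to keep the field arithmetic constant-time. */
+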
+/* Modular subtraction, c = a-b mod p434.
+ * Inputs: a, b in [0, 2*p434-1]
+ * Output: c in [0, 2*p434-1] */
+void fpsub434(const digit_t* a, const digit_t* b, digit_t* c)
+{
+#if defined(S2N_SIKE_P434_R3_ASM)
+ if (s2n_sikep434r3_asm_is_enabled()) {
+ fpsub434_asm(a, b, c);
+ return;
+ }
+#endif
+
+ unsigned int i, borrow = 0;
+ digit_t mask;
+
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ S2N_SIKE_P434_R3_SUBC(borrow, a[i], b[i], borrow, c[i]);
+ }
+ mask = 0 - (digit_t)borrow;
+
+ borrow = 0;
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ S2N_SIKE_P434_R3_ADDC(borrow, c[i], ((const digit_t*)p434x2)[i] & mask, borrow, c[i]);
+ }
+}
+
+/* Modular negation, a = -a mod p434.
+ * Input/output: a in [0, 2*p434-1] */
+void fpneg434(digit_t* a)
+{
+ unsigned int i, borrow = 0;
+
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ S2N_SIKE_P434_R3_SUBC(borrow, ((const digit_t*)p434x2)[i], a[i], borrow, a[i]);
+ }
+}
+
+/* Modular division by two, c = a/2 mod p434.
+ * Input : a in [0, 2*p434-1]
+ * Output: c in [0, 2*p434-1] */
+void fpdiv2_434(const digit_t* a, digit_t* c)
+{
+ unsigned int i, carry = 0;
+ digit_t mask;
+
+ mask = 0 - (digit_t)(a[0] & 1); /* If a is odd compute a+p434 */
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ S2N_SIKE_P434_R3_ADDC(carry, a[i], ((const digit_t*)p434)[i] & mask, carry, c[i]);
+ }
+
+ mp_shiftr1(c, S2N_SIKE_P434_R3_NWORDS_FIELD);
+}
+
+/* Modular correction to reduce field element a in [0, 2*p434-1] to [0, p434-1]. */
+void fpcorrection434(digit_t* a)
+{
+ unsigned int i, borrow = 0;
+ digit_t mask;
+
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ S2N_SIKE_P434_R3_SUBC(borrow, a[i], ((const digit_t*)p434)[i], borrow, a[i]);
+ }
+ mask = 0 - (digit_t)borrow;
+
+ borrow = 0;
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ S2N_SIKE_P434_R3_ADDC(borrow, a[i], ((const digit_t*)p434)[i] & mask, borrow, a[i]);
+ }
+}
+
+/* Digit multiplication, digit * digit -> 2-digit result */
+void digit_x_digit(const digit_t a, const digit_t b, digit_t* c)
+{
+ register digit_t al, ah, bl, bh, temp;
+ digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry;
+ digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4);
+
+ al = a & mask_low; /* Low part */
+ ah = a >> (sizeof(digit_t) * 4); /* High part */
+ bl = b & mask_low;
+ bh = b >> (sizeof(digit_t) * 4);
+
+ albl = al*bl;
+ albh = al*bh;
+ ahbl = ah*bl;
+ ahbh = ah*bh;
+ c[0] = albl & mask_low; /* C00 */
+
+ res1 = albl >> (sizeof(digit_t) * 4);
+ res2 = ahbl & mask_low;
+ res3 = albh & mask_low;
+ temp = res1 + res2 + res3;
+ carry = temp >> (sizeof(digit_t) * 4);
+ c[0] ^= temp << (sizeof(digit_t) * 4); /* C01 */
+
+ res1 = ahbl >> (sizeof(digit_t) * 4);
+ res2 = albh >> (sizeof(digit_t) * 4);
+ res3 = ahbh & mask_low;
+ temp = res1 + res2 + res3 + carry;
+ c[1] = temp & mask_low; /* C10 */
+ carry = temp & mask_high;
+ c[1] ^= (ahbh & mask_high) + carry; /* C11 */
+}
+
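+/* Worked example with hypothetical 8-bit digits (for brevity; digit_t is wider here):
+ * for a = 0xAB and b = 0xCD the half-digits are al = 0xB, ah = 0xA, bl = 0xD, bh = 0xC,
+ * giving albl = 0x8F, albh = 0x84, ahbl = 0x82 and ahbh = 0x78. Recombining the partial
+ * products with their carries yields c[0] = 0xEF and c[1] = 0x88, i.e. 0xAB*0xCD = 0x88EF. */
+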
+/* Multiprecision Comba multiplication, c = a*b, where a and b are nwords digits long. */
+void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords)
+{
+#if defined(S2N_SIKE_P434_R3_ASM)
+ if (s2n_sikep434r3_asm_is_enabled()) {
+ S2N_SIKE_P434_R3_UNREFERENCED_PARAMETER(nwords);
+ mul434_asm(a, b, c);
+ return;
+ }
+#endif
+
+ unsigned int i, j;
+ digit_t t = 0, u = 0, v = 0, UV[2];
+ unsigned int carry;
+
+ for (i = 0; i < nwords; i++) {
+ for (j = 0; j <= i; j++) {
+ S2N_SIKE_P434_R3_MUL(a[j], b[i-j], UV+1, UV[0]);
+ S2N_SIKE_P434_R3_ADDC(0, UV[0], v, carry, v);
+ S2N_SIKE_P434_R3_ADDC(carry, UV[1], u, carry, u);
+ t += carry;
+ }
+ c[i] = v;
+ v = u;
+ u = t;
+ t = 0;
+ }
+
+ for (i = nwords; i < 2*nwords-1; i++) {
+ for (j = i-nwords+1; j < nwords; j++) {
+ S2N_SIKE_P434_R3_MUL(a[j], b[i-j], UV+1, UV[0]);
+ S2N_SIKE_P434_R3_ADDC(0, UV[0], v, carry, v);
+ S2N_SIKE_P434_R3_ADDC(carry, UV[1], u, carry, u);
+ t += carry;
+ }
+ c[i] = v;
+ v = u;
+ u = t;
+ t = 0;
+ }
+ c[2*nwords-1] = v;
+}
+
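+/* mp_mul accumulates products column by column (Comba scheduling): the running triple
+ * (t, u, v) holds one output digit plus two digits of pending carry, so each partial
+ * product a[j]*b[i-j] is added exactly once and only v is written to memory per column,
+ * unlike the row-by-row schoolbook method. */
+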
+/* Efficient Montgomery reduction using comba and exploiting the special form of the prime p434.
+ * mc = ma*R^-1 mod p434x2, where R = 2^448.
+ * If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1].
+ * ma is assumed to be in Montgomery representation. */
+void rdc_mont(digit_t* ma, digit_t* mc)
+{
+#if defined(S2N_SIKE_P434_R3_ASM)
+ if (s2n_sikep434r3_asm_is_enabled()) {
+ rdc434_asm(ma, mc);
+ return;
+ }
+#endif
+
+ unsigned int i, j, carry, count = S2N_SIKE_P434_R3_ZERO_WORDS;
+ digit_t UV[2], t = 0, u = 0, v = 0;
+
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ mc[i] = 0;
+ }
+
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ for (j = 0; j < i; j++) {
+ if (j < (i-S2N_SIKE_P434_R3_ZERO_WORDS+1)) {
+ S2N_SIKE_P434_R3_MUL(mc[j], ((const digit_t*)p434p1)[i-j], UV+1, UV[0]);
+ S2N_SIKE_P434_R3_ADDC(0, UV[0], v, carry, v);
+ S2N_SIKE_P434_R3_ADDC(carry, UV[1], u, carry, u);
+ t += carry;
+ }
+ }
+ S2N_SIKE_P434_R3_ADDC(0, v, ma[i], carry, v);
+ S2N_SIKE_P434_R3_ADDC(carry, u, 0, carry, u);
+ t += carry;
+ mc[i] = v;
+ v = u;
+ u = t;
+ t = 0;
+ }
+
+ for (i = S2N_SIKE_P434_R3_NWORDS_FIELD; i < 2*S2N_SIKE_P434_R3_NWORDS_FIELD-1; i++) {
+ if (count > 0) {
+ count -= 1;
+ }
+ for (j = i-S2N_SIKE_P434_R3_NWORDS_FIELD+1; j < S2N_SIKE_P434_R3_NWORDS_FIELD; j++) {
+ if (j < (S2N_SIKE_P434_R3_NWORDS_FIELD-count)) {
+ S2N_SIKE_P434_R3_MUL(mc[j], ((const digit_t*)p434p1)[i-j], UV+1, UV[0]);
+ S2N_SIKE_P434_R3_ADDC(0, UV[0], v, carry, v);
+ S2N_SIKE_P434_R3_ADDC(carry, UV[1], u, carry, u);
+ t += carry;
+ }
+ }
+ S2N_SIKE_P434_R3_ADDC(0, v, ma[i], carry, v);
+ S2N_SIKE_P434_R3_ADDC(carry, u, 0, carry, u);
+ t += carry;
+ mc[i-S2N_SIKE_P434_R3_NWORDS_FIELD] = v;
+ v = u;
+ u = t;
+ t = 0;
+ }
+
+ /* `carry` isn't read after this, but it's still a necessary argument to the macro */
+ /* cppcheck-suppress unreadVariable */
+ S2N_SIKE_P434_R3_ADDC(0, v, ma[2*S2N_SIKE_P434_R3_NWORDS_FIELD-1], carry, v);
+ mc[S2N_SIKE_P434_R3_NWORDS_FIELD-1] = v;
+}
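+
+/* Note on the index guards in the two loops above: p434 = 2^216*3^137 - 1, so
+ * p434 + 1 = 2^216*3^137 has its low 216 bits, i.e. its three low 64-bit words
+ * (S2N_SIKE_P434_R3_ZERO_WORDS), equal to zero. Multiplications by those words of p434p1
+ * contribute nothing, and skipping them is what makes this reduction cheaper than a
+ * generic Montgomery reduction. */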
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.h
new file mode 100644
index 0000000000..7844ba0457
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.h
@@ -0,0 +1,39 @@
+/********************************************************************************************
+* Supersingular Isogeny Key Encapsulation Library
+*
+* Abstract: modular arithmetic for P434
+*********************************************************************************************/
+
+#pragma once
+
+#include "sikep434r3.h"
+
+#define mp_sub434_p2 S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p2)
+void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c);
+
+#define mp_sub434_p4 S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p4)
+void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c);
+
+#define fpadd434 S2N_SIKE_P434_R3_NAMESPACE(fpadd434)
+void fpadd434(const digit_t* a, const digit_t* b, digit_t* c);
+
+#define fpsub434 S2N_SIKE_P434_R3_NAMESPACE(fpsub434)
+void fpsub434(const digit_t* a, const digit_t* b, digit_t* c);
+
+#define fpneg434 S2N_SIKE_P434_R3_NAMESPACE(fpneg434)
+void fpneg434(digit_t* a);
+
+#define fpdiv2_434 S2N_SIKE_P434_R3_NAMESPACE(fpdiv2_434)
+void fpdiv2_434(const digit_t* a, digit_t* c);
+
+#define fpcorrection434 S2N_SIKE_P434_R3_NAMESPACE(fpcorrection434)
+void fpcorrection434(digit_t* a);
+
+#define digit_x_digit S2N_SIKE_P434_R3_NAMESPACE(digit_x_digit)
+void digit_x_digit(const digit_t a, const digit_t b, digit_t* c);
+
+#define mp_mul S2N_SIKE_P434_R3_NAMESPACE(mp_mul)
+void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords);
+
+#define rdc_mont S2N_SIKE_P434_R3_NAMESPACE(rdc_mont)
+void rdc_mont(digit_t* ma, digit_t* mc);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.S b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.S
new file mode 100644
index 0000000000..1814a8b25a
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.S
@@ -0,0 +1,1054 @@
+//*******************************************************************************************
+// Supersingular Isogeny Key Encapsulation Library
+//
+// Abstract: field arithmetic in x64 assembly for P434 on Linux
+//*******************************************************************************************
+
+/* Requires bmi2 instruction set for mulx. adx instructions are optional, but preferred. */
+.intel_syntax noprefix
+
+#define S2N_SIKE_P434_R3_NAMESPACE(s) s2n_sike_p434_r3_##s
+
+// Registers that are used for parameter passing:
+#define reg_p1 rdi
+#define reg_p2 rsi
+#define reg_p3 rdx
+
+// Define addition instructions
+#ifdef S2N_ADX
+
+#define ADD1 adox
+#define ADC1 adox
+#define ADD2 adcx
+#define ADC2 adcx
+
+#else
+
+#define ADD1 add
+#define ADC1 adc
+#define ADD2 add
+#define ADC2 adc
+
+#endif
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+.text
+
+#define asm_p434 S2N_SIKE_P434_R3_NAMESPACE(asm_p434)
+.align 32
+.type asm_p434, @object
+.size asm_p434, 56
+asm_p434:
+.quad -1
+.quad -1
+.quad -1
+.quad -161717841442111489
+.quad 8918917783347572387
+.quad 7853257225132122198
+.quad 620258357900100
+
+
+#define asm_p434x2 S2N_SIKE_P434_R3_NAMESPACE(asm_p434x2)
+.align 32
+.type asm_p434x2, @object
+.size asm_p434x2, 56
+asm_p434x2:
+.quad -2
+.quad -1
+.quad -1
+.quad -323435682884222977
+.quad -608908507014406841
+.quad -2740229623445307220
+.quad 1240516715800200
+
+
+#define asm_p434x4 S2N_SIKE_P434_R3_NAMESPACE(asm_p434x4)
+.align 32
+.type asm_p434x4, @object
+.size asm_p434x4, 56
+asm_p434x4:
+.quad -4
+.quad -1
+.quad -1
+.quad -646871365768445953
+.quad -1217817014028813681
+.quad -5480459246890614439
+.quad 2481033431600401
+
+
+#define asm_p434p1 S2N_SIKE_P434_R3_NAMESPACE(asm_p434p1)
+.align 32
+.type asm_p434p1, @object
+.size asm_p434p1, 56
+asm_p434p1:
+.quad 0
+.quad 0
+.quad 0
+.quad -161717841442111488
+.quad 8918917783347572387
+.quad 7853257225132122198
+.quad 620258357900100
+
+//***********************************************************************
+// Field addition
+// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
+//***********************************************************************
+#define fpadd434_asm S2N_SIKE_P434_R3_NAMESPACE(fpadd434_asm)
+.global fpadd434_asm
+fpadd434_asm:
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbx
+ push rbp
+
+ xor rax, rax
+ mov r8, [reg_p1]
+ mov r9, [reg_p1+8]
+ mov r10, [reg_p1+16]
+ mov r11, [reg_p1+24]
+ mov r12, [reg_p1+32]
+ mov r13, [reg_p1+40]
+ mov r14, [reg_p1+48]
+ add r8, [reg_p2]
+ adc r9, [reg_p2+8]
+ adc r10, [reg_p2+16]
+ adc r11, [reg_p2+24]
+ adc r12, [reg_p2+32]
+ adc r13, [reg_p2+40]
+ adc r14, [reg_p2+48]
+
+ mov rbx, [rip+asm_p434x2]
+ sub r8, rbx
+ mov rcx, [rip+asm_p434x2+8]
+ sbb r9, rcx
+ sbb r10, rcx
+ mov rdi, [rip+asm_p434x2+24]
+ sbb r11, rdi
+ mov rsi, [rip+asm_p434x2+32]
+ sbb r12, rsi
+ mov rbp, [rip+asm_p434x2+40]
+ sbb r13, rbp
+ mov r15, [rip+asm_p434x2+48]
+ sbb r14, r15
+ sbb rax, 0
+
+ and rbx, rax
+ and rcx, rax
+ and rdi, rax
+ and rsi, rax
+ and rbp, rax
+ and r15, rax
+
+ add r8, rbx
+ adc r9, rcx
+ adc r10, rcx
+ adc r11, rdi
+ adc r12, rsi
+ adc r13, rbp
+ adc r14, r15
+ mov [reg_p3], r8
+ mov [reg_p3+8], r9
+ mov [reg_p3+16], r10
+ mov [reg_p3+24], r11
+ mov [reg_p3+32], r12
+ mov [reg_p3+40], r13
+ mov [reg_p3+48], r14
+
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+
+//***********************************************************************
+// Field subtraction
+// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]
+//***********************************************************************
+#define fpsub434_asm S2N_SIKE_P434_R3_NAMESPACE(fpsub434_asm)
+.global fpsub434_asm
+fpsub434_asm:
+ push r12
+ push r13
+ push r14
+
+ xor rax, rax
+ mov r8, [reg_p1]
+ mov r9, [reg_p1+8]
+ mov r10, [reg_p1+16]
+ mov r11, [reg_p1+24]
+ mov r12, [reg_p1+32]
+ mov r13, [reg_p1+40]
+ mov r14, [reg_p1+48]
+ sub r8, [reg_p2]
+ sbb r9, [reg_p2+8]
+ sbb r10, [reg_p2+16]
+ sbb r11, [reg_p2+24]
+ sbb r12, [reg_p2+32]
+ sbb r13, [reg_p2+40]
+ sbb r14, [reg_p2+48]
+ sbb rax, 0
+
+ mov rcx, [rip+asm_p434x2]
+ mov rdi, [rip+asm_p434x2+8]
+ mov rsi, [rip+asm_p434x2+24]
+ and rcx, rax
+ and rdi, rax
+ and rsi, rax
+ add r8, rcx
+ adc r9, rdi
+ adc r10, rdi
+ adc r11, rsi
+ mov [reg_p3], r8
+ mov [reg_p3+8], r9
+ mov [reg_p3+16], r10
+ mov [reg_p3+24], r11
+ setc cl
+
+ mov r8, [rip+asm_p434x2+32]
+ mov rdi, [rip+asm_p434x2+40]
+ mov rsi, [rip+asm_p434x2+48]
+ and r8, rax
+ and rdi, rax
+ and rsi, rax
+ bt rcx, 0
+ adc r12, r8
+ adc r13, rdi
+ adc r14, rsi
+ mov [reg_p3+32], r12
+ mov [reg_p3+40], r13
+ mov [reg_p3+48], r14
+
+ pop r14
+ pop r13
+ pop r12
+ ret
+
+///////////////////////////////////////////////////////////////// MACRO
+.macro SUB434_PX P0
+ push r12
+ push r13
+
+ mov r8, [reg_p1]
+ mov r9, [reg_p1+8]
+ mov r10, [reg_p1+16]
+ mov r11, [reg_p1+24]
+ mov r12, [reg_p1+32]
+ mov r13, [reg_p1+40]
+ mov rcx, [reg_p1+48]
+ sub r8, [reg_p2]
+ sbb r9, [reg_p2+8]
+ sbb r10, [reg_p2+16]
+ sbb r11, [reg_p2+24]
+ sbb r12, [reg_p2+32]
+ sbb r13, [reg_p2+40]
+ sbb rcx, [reg_p2+48]
+
+ mov rax, [rip+\P0]
+ mov rdi, [rip+\P0+8]
+ mov rsi, [rip+\P0+24]
+ add r8, rax
+ mov rax, [rip+\P0+32]
+ adc r9, rdi
+ adc r10, rdi
+ adc r11, rsi
+ mov rdi, [rip+\P0+40]
+ mov rsi, [rip+\P0+48]
+ adc r12, rax
+ adc r13, rdi
+ adc rcx, rsi
+ mov [reg_p3], r8
+ mov [reg_p3+8], r9
+ mov [reg_p3+16], r10
+ mov [reg_p3+24], r11
+ mov [reg_p3+32], r12
+ mov [reg_p3+40], r13
+ mov [reg_p3+48], rcx
+
+ pop r13
+ pop r12
+.endm
+
+//***********************************************************************
+// Multiprecision subtraction with correction with 2*p434
+// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p434
+//***********************************************************************
+#define mp_sub434_p2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p2_asm)
+.global mp_sub434_p2_asm
+mp_sub434_p2_asm:
+ SUB434_PX asm_p434x2
+ ret
+
+//***********************************************************************
+// Multiprecision subtraction with correction with 4*p434
+// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p434
+//***********************************************************************
+#define mp_sub434_p4_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p4_asm)
+.global mp_sub434_p4_asm
+mp_sub434_p4_asm:
+ SUB434_PX asm_p434x4
+ ret
+
+///////////////////////////////////////////////////////////////// MACRO
+// Schoolbook integer multiplication
+// Inputs: memory pointers M0 and M1
+// Outputs: memory pointer C and regs T1, T3, rax
+// Temps: regs T0:T6
+/////////////////////////////////////////////////////////////////
+#ifdef S2N_ADX
+
+.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
+ mov rdx, \M0
+ mulx \T0, \T1, \M1 // T0:T1 = A0*B0
+ mov \C, \T1 // C0_final
+ mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1
+ xor rax, rax
+ adox \T0, \T2
+ mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2
+ adox \T1, \T3
+
+ mov rdx, 8\M0
+ mulx \T3, \T4, \M1 // T3:T4 = A1*B0
+ adox \T2, rax
+ xor rax, rax
+ mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1
+ adox \T4, \T0
+ mov 8\C, \T4 // C1_final
+ adcx \T3, \T6
+ mulx \T6, \T0, 16\M1 // T6:T0 = A1*B2
+ adox \T3, \T1
+ adcx \T5, \T0
+ adcx \T6, rax
+ adox \T5, \T2
+
+ mov rdx, 16\M0
+ mulx \T1, \T0, \M1 // T1:T0 = A2*B0
+ adox \T6, rax
+ xor rax, rax
+ mulx \T4, \T2, 8\M1 // T4:T2 = A2*B1
+ adox \T0, \T3
+ mov 16\C, \T0 // C2_final
+ adcx \T1, \T5
+ mulx \T0, \T3, 16\M1 // T0:T3 = A2*B2
+ adcx \T4, \T6
+ adcx \T0, rax
+ adox \T1, \T2
+ adox \T3, \T4
+ adox rax, \T0
+.endm
+
+///////////////////////////////////////////////////////////////// MACRO
+// Schoolbook integer multiplication
+// Inputs: memory pointers M0 and M1
+// Outputs: memory pointer C
+// Temps: regs T0:T9
+/////////////////////////////////////////////////////////////////
+.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+ mov rdx, \M0
+ mulx \T0, \T1, \M1 // T0:T1 = A0*B0
+ mov \C, \T1 // C0_final
+ mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1
+ xor rax, rax
+ adox \T0, \T2
+ mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2
+ adox \T1, \T3
+ mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3
+ adox \T2, \T4
+
+ mov rdx, 8\M0
+ mulx \T5, \T4, \M1 // T5:T4 = A1*B0
+ adox \T3, rax
+ xor rax, rax
+ mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1
+ adox \T4, \T0
+ mov 8\C, \T4 // C1_final
+ adcx \T5, \T7
+ mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2
+ adcx \T6, \T8
+ adox \T5, \T1
+ mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3
+ adcx \T7, \T9
+ adcx \T8, rax
+ adox \T6, \T2
+
+ mov rdx, 16\M0
+ mulx \T1, \T0, \M1 // T1:T0 = A2*B0
+ adox \T7, \T3
+ adox \T8, rax
+ xor rax, rax
+ mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1
+ adox \T0, \T5
+ mov 16\C, \T0 // C2_final
+ adcx \T1, \T3
+ mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2
+ adcx \T2, \T4
+ adox \T1, \T6
+    mulx   \T4, \T9, 24\M1   // T4:T9 = A2*B3
+ adcx \T3, \T9
+ mov rdx, 24\M0
+ adcx \T4, rax
+
+ adox \T2, \T7
+ adox \T3, \T8
+ adox \T4, rax
+
+ mulx \T5, \T0, \M1 // T5:T0 = A3*B0
+ xor rax, rax
+ mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1
+ adcx \T5, \T7
+ adox \T1, \T0
+ mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2
+ adcx \T6, \T8
+ adox \T2, \T5
+ mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3
+ adcx \T7, \T9
+ adcx \T8, rax
+
+ adox \T3, \T6
+ adox \T4, \T7
+ adox \T8, rax
+ mov 24\C, \T1 // C3_final
+ mov 32\C, \T2 // C4_final
+ mov 40\C, \T3 // C5_final
+ mov 48\C, \T4 // C6_final
+ mov 56\C, \T8 // C7_final
+.endm
+
+#else // S2N_ADX
+
+.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
+ mov rdx, \M0
+ mulx \T0, \T1, \M1 // T0:T1 = A0*B0
+ mov \C, \T1 // C0_final
+ mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1
+ add \T0, \T2
+ mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2
+ adc \T1, \T3
+
+ mov rdx, 8\M0
+ mulx \T3, \T4, \M1 // T3:T4 = A1*B0
+ adc \T2, 0
+ mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1
+ add \T4, \T0
+ mov 8\C, \T4 // C1_final
+ adc \T3, \T1
+ adc \T5, \T2
+ mulx \T2, \T1, 16\M1 // T2:T1 = A1*B2
+ adc \T2, 0
+
+ add \T3, \T6
+ adc \T5, \T1
+ adc \T2, 0
+
+ mov rdx, 16\M0
+ mulx \T1, \T0, \M1 // T1:T0 = A2*B0
+ add \T0, \T3
+ mov 16\C, \T0 // C2_final
+ mulx \T4, \T6, 8\M1 // T4:T6 = A2*B1
+ adc \T1, \T5
+ adc \T2, \T4
+ mulx rax, \T3, 16\M1 // rax:T3 = A2*B2
+ adc rax, 0
+ add \T1, \T6
+ adc \T3, \T2
+ adc rax, 0
+.endm
+
+.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+ mov rdx, \M0
+ mulx \T0, \T1, \M1 // T0:T1 = A0*B0
+ mov \C, \T1 // C0_final
+ mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1
+ add \T0, \T2
+ mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2
+ adc \T1, \T3
+ mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3
+ adc \T2, \T4
+ mov rdx, 8\M0
+ adc \T3, 0
+
+ mulx \T5, \T4, \M1 // T5:T4 = A1*B0
+ mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1
+ add \T5, \T7
+ mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2
+ adc \T6, \T8
+ mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3
+ adc \T7, \T9
+ adc \T8, 0
+
+ add \T4, \T0
+ mov 8\C, \T4 // C1_final
+ adc \T5, \T1
+ adc \T6, \T2
+ adc \T7, \T3
+ mov rdx, 16\M0
+ adc \T8, 0
+
+ mulx \T1, \T0, \M1 // T1:T0 = A2*B0
+ mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1
+ add \T1, \T3
+ mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2
+ adc \T2, \T4
+    mulx   \T4, \T9, 24\M1    // T4:T9 = A2*B3
+ adc \T3, \T9
+ mov rdx, 24\M0
+ adc \T4, 0
+
+ add \T0, \T5
+ mov 16\C, \T0 // C2_final
+ adc \T1, \T6
+ adc \T2, \T7
+ adc \T3, \T8
+ adc \T4, 0
+
+ mulx \T5, \T0, \M1 // T5:T0 = A3*B0
+ mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1
+ add \T5, \T7
+ mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2
+ adc \T6, \T8
+ mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3
+ adc \T7, \T9
+ adc \T8, 0
+
+ add \T1, \T0
+ mov 24\C, \T1 // C3_final
+ adc \T2, \T5
+ mov 32\C, \T2 // C4_final
+ adc \T3, \T6
+ mov 40\C, \T3 // C5_final
+ adc \T4, \T7
+ mov 48\C, \T4 // C6_final
+ adc \T8, 0
+ mov 56\C, \T8 // C7_final
+.endm
+
+#endif // S2N_ADX
+
+//*****************************************************************************
+// 434-bit multiplication using Karatsuba (one level), schoolbook (one level)
+//*****************************************************************************
+#define mul434_asm S2N_SIKE_P434_R3_NAMESPACE(mul434_asm)
+.global mul434_asm
+mul434_asm:
+ push r12
+ push r13
+ push r14
+ push r15
+ mov rcx, reg_p3
+
+ // r8-r11 <- AH + AL, rax <- mask
+ xor rax, rax
+ mov r8, [reg_p1]
+ mov r9, [reg_p1+8]
+ mov r10, [reg_p1+16]
+ mov r11, [reg_p1+24]
+ push rbx
+ push rbp
+ sub rsp, 96
+ add r8, [reg_p1+32]
+ adc r9, [reg_p1+40]
+ adc r10, [reg_p1+48]
+ adc r11, 0
+ sbb rax, 0
+ mov [rsp], r8
+ mov [rsp+8], r9
+ mov [rsp+16], r10
+ mov [rsp+24], r11
+
+ // r12-r15 <- BH + BL, rbx <- mask
+ xor rbx, rbx
+ mov r12, [reg_p2]
+ mov r13, [reg_p2+8]
+ mov r14, [reg_p2+16]
+ mov r15, [reg_p2+24]
+ add r12, [reg_p2+32]
+ adc r13, [reg_p2+40]
+ adc r14, [reg_p2+48]
+ adc r15, 0
+ sbb rbx, 0
+ mov [rsp+32], r12
+ mov [rsp+40], r13
+ mov [rsp+48], r14
+ mov [rsp+56], r15
+
+ // r12-r15 <- masked (BH + BL)
+ and r12, rax
+ and r13, rax
+ and r14, rax
+ and r15, rax
+
+ // r8-r11 <- masked (AH + AL)
+ and r8, rbx
+ and r9, rbx
+ and r10, rbx
+ and r11, rbx
+
+ // r8-r11 <- masked (AH + AL) + masked (AH + AL)
+ add r8, r12
+ adc r9, r13
+ adc r10, r14
+ adc r11, r15
+ mov [rsp+64], r8
+ mov [rsp+72], r9
+ mov [rsp+80], r10
+ mov [rsp+88], r11
+
+ // [rsp] <- (AH+AL) x (BH+BL), low part
+ MUL256_SCHOOL [rsp], [rsp+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp
+
+ // [rcx] <- AL x BL
+ MUL256_SCHOOL [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp // Result C0-C3
+
+ // [rcx+64], rbx, rbp, rax <- AH x BH
+ MUL192_SCHOOL [reg_p1+32], [reg_p2+32], [rcx+64], r8, rbx, r10, rbp, r12, r13, r14
+
+ // r8-r11 <- (AH+AL) x (BH+BL), final step
+ mov r8, [rsp+64]
+ mov r9, [rsp+72]
+ mov r10, [rsp+80]
+ mov r11, [rsp+88]
+ mov rdx, [rsp+32]
+ add r8, rdx
+ mov rdx, [rsp+40]
+ adc r9, rdx
+ mov rdx, [rsp+48]
+ adc r10, rdx
+ mov rdx, [rsp+56]
+ adc r11, rdx
+
+ // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL
+ mov r12, [rsp]
+ mov r13, [rsp+8]
+ mov r14, [rsp+16]
+ mov r15, [rsp+24]
+ sub r12, [rcx]
+ sbb r13, [rcx+8]
+ sbb r14, [rcx+16]
+ sbb r15, [rcx+24]
+ sbb r8, [rcx+32]
+ sbb r9, [rcx+40]
+ sbb r10, [rcx+48]
+ sbb r11, [rcx+56]
+
+ // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
+ sub r12, [rcx+64]
+ sbb r13, [rcx+72]
+ sbb r14, [rcx+80]
+ sbb r15, rbx
+ sbb r8, rbp
+ sbb r9, rax
+ sbb r10, 0
+ sbb r11, 0
+
+ add r12, [rcx+32]
+ mov [rcx+32], r12 // Result C4-C7
+ adc r13, [rcx+40]
+ mov [rcx+40], r13
+ adc r14, [rcx+48]
+ mov [rcx+48], r14
+ adc r15, [rcx+56]
+ mov [rcx+56], r15
+ adc r8, [rcx+64]
+ mov [rcx+64], r8 // Result C8-C15
+ adc r9, [rcx+72]
+ mov [rcx+72], r9
+ adc r10, [rcx+80]
+ mov [rcx+80], r10
+ adc r11, rbx
+ mov [rcx+88], r11
+ adc rbp, 0
+ mov [rcx+96], rbp
+ adc rax, 0
+ mov [rcx+104], rax
+
+ add rsp, 96
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+
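+// Note on mul434_asm above: with A = AH*2^256 + AL and B = BH*2^256 + BL, one-level
+// Karatsuba uses
+//   A*B = AH*BH*2^512 + ((AH+AL)*(BH+BL) - AH*BH - AL*BL)*2^256 + AL*BL,
+// which is the subtraction/addition sequence computed above. The masks in rax and rbx
+// fold in the conditional extra terms that arise when the half-sums AH+AL and BH+BL
+// overflow 256 bits, keeping the computation constant-time.
+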
+///////////////////////////////////////////////////////////////// MACRO
+// Schoolbook integer multiplication
+// Inputs: reg I0 and memory pointer M1
+// Outputs: regs T0:T4
+// Temps: regs T0:T5
+/////////////////////////////////////////////////////////////////
+.macro MUL64x256_SCHOOL I0, M1, T0, T1, T2, T3, T4, T5
+ mulx \T2, \T4, 8\M1
+ xor rax, rax
+ mulx \T3, \T5, 16\M1
+ ADD1 \T1, \T4 // T1 <- C1_final
+ ADC1 \T2, \T5 // T2 <- C2_final
+ mulx \T4, \T5, 24\M1
+ ADC1 \T3, \T5 // T3 <- C3_final
+ ADC1 \T4, rax // T4 <- C4_final
+.endm
+
+///////////////////////////////////////////////////////////////// MACRO
+// Schoolbook integer multiplication
+// Inputs: regs I0 and I1, and memory pointer M1
+// Outputs: regs T0:T5
+// Temps: regs T0:T5
+/////////////////////////////////////////////////////////////////
+#ifdef S2N_ADX
+
+.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5
+ mulx \T2, \T4, 8\M1
+ xor rax, rax
+ mulx \T3, \T5, 16\M1
+ ADD1 \T1, \T4
+ ADC1 \T2, \T5
+ mulx \T4, \T5, 24\M1
+ ADC1 \T3, \T5
+ ADC1 \T4, rax
+
+ xor rax, rax
+ mov rdx, \I1
+ mulx \I1, \T5, \M1
+ ADD2 \T1, \T5 // T1 <- C1_final
+ ADC2 \T2, \I1
+ mulx \T5, \I1, 8\M1
+ ADC2 \T3, \T5
+ ADD1 \T2, \I1
+ mulx \T5, \I1, 16\M1
+ ADC2 \T4, \T5
+ ADC1 \T3, \I1
+ mulx \T5, \I1, 24\M1
+ ADC2 \T5, rax
+ ADC1 \T4, \I1
+ ADC1 \T5, rax
+.endm
+
+#else // S2N_ADX
+
+.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5
+ mulx \T2, \T4, 8\M1
+ mulx \T3, \T5, 16\M1
+ add \T1, \T4
+ adc \T2, \T5
+ mulx \T4, \T5, 24\M1
+ adc \T3, \T5
+ adc \T4, 0
+
+ mov rdx, \I1
+ mulx \I1, \T5, \M1
+ add \T1, \T5 // T1 <- C1_final
+ adc \T2, \I1
+ mulx \T5, \I1, 8\M1
+ adc \T3, \T5
+ mulx \T5, rax, 16\M1
+ adc \T4, \T5
+ mulx \T5, rdx, 24\M1
+ adc \T5, 0
+ add \T2, \I1
+ adc \T3, rax
+ adc \T4, rdx
+ adc \T5, 0
+.endm
+
+#endif // S2N_ADX
+
+//**************************************************************************************
+// Montgomery reduction
+// Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
+// Operation: c [reg_p2] = a [reg_p1] * R^-1 mod p434, where R = 2^448
+//**************************************************************************************
+#define rdc434_asm S2N_SIKE_P434_R3_NAMESPACE(rdc434_asm)
+.global rdc434_asm
+rdc434_asm:
+ push r14
+
+ // a[0-1] x p434p1_nz --> result: r8:r13
+ mov rdx, [reg_p1]
+ mov r14, [reg_p1+8]
+ mulx r9, r8, [rip+asm_p434p1+24] // result r8
+ push r12
+ push r13
+ push r15
+ push rbp
+ push rbx
+ MUL128x256_SCHOOL rdx, r14, [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13
+
+ mov rdx, [reg_p1+16]
+ mov rcx, [reg_p1+72]
+ add r8, [reg_p1+24]
+ adc r9, [reg_p1+32]
+ adc r10, [reg_p1+40]
+ adc r11, [reg_p1+48]
+ adc r12, [reg_p1+56]
+ adc r13, [reg_p1+64]
+ adc rcx, 0
+ mulx rbp, rbx, [rip+asm_p434p1+24] // result rbx
+ mov [reg_p2], r9
+ mov [reg_p2+8], r10
+ mov [reg_p2+16], r11
+ mov [reg_p2+24], r12
+ mov [reg_p2+32], r13
+ mov r9, [reg_p1+80]
+ mov r10, [reg_p1+88]
+ mov r11, [reg_p1+96]
+ mov rdi, [reg_p1+104]
+ adc r9, 0
+ adc r10, 0
+ adc r11, 0
+ adc rdi, 0
+
+ // a[2-3] x p434p1_nz --> result: rbx, rbp, r12:r15
+ MUL128x256_SCHOOL rdx, r8, [rip+asm_p434p1+24], rbx, rbp, r12, r13, r14, r15
+
+ mov rdx, [reg_p2]
+ add rbx, [reg_p2+8]
+ adc rbp, [reg_p2+16]
+ adc r12, [reg_p2+24]
+ adc r13, [reg_p2+32]
+ adc r14, rcx
+ mov rcx, 0
+ adc r15, r9
+ adc rcx, r10
+ mulx r9, r8, [rip+asm_p434p1+24] // result r8
+ mov [reg_p2], rbp
+ mov [reg_p2+8], r12
+ mov [reg_p2+16], r13
+ adc r11, 0
+ adc rdi, 0
+
+ // a[4-5] x p434p1_nz --> result: r8:r13
+ MUL128x256_SCHOOL rdx, rbx, [rip+asm_p434p1+24], r8, r9, r10, rbp, r12, r13
+
+ mov rdx, [reg_p2]
+ add r8, [reg_p2+8]
+ adc r9, [reg_p2+16]
+ adc r10, r14
+ adc rbp, r15
+ adc r12, rcx
+ adc r13, r11
+ adc rdi, 0
+ mulx r15, r14, [rip+asm_p434p1+24] // result r14
+ mov [reg_p2], r8 // Final result c0-c1
+ mov [reg_p2+8], r9
+
+ // a[6-7] x p434p1_nz --> result: r14:r15, r8:r9, r11
+ MUL64x256_SCHOOL rdx, [rip+asm_p434p1+24], r14, r15, r8, r9, r11, rcx
+
+ // Final result c2:c6
+ add r14, r10
+ adc r15, rbp
+ pop rbx
+ pop rbp
+ adc r8, r12
+ adc r9, r13
+ adc r11, rdi
+ mov [reg_p2+16], r14
+ mov [reg_p2+24], r15
+ pop r15
+ pop r13
+ mov [reg_p2+32], r8
+ mov [reg_p2+40], r9
+ mov [reg_p2+48], r11
+
+ pop r12
+ pop r14
+ ret
+
+//***********************************************************************
+// 434-bit multiprecision addition
+// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
+//***********************************************************************
+#define mp_add434_asm S2N_SIKE_P434_R3_NAMESPACE(mp_add434_asm)
+.global mp_add434_asm
+mp_add434_asm:
+ mov r8, [reg_p1]
+ mov r9, [reg_p1+8]
+ mov r10, [reg_p1+16]
+ mov r11, [reg_p1+24]
+ add r8, [reg_p2]
+ adc r9, [reg_p2+8]
+ adc r10, [reg_p2+16]
+ adc r11, [reg_p2+24]
+ mov [reg_p3], r8
+ mov [reg_p3+8], r9
+ mov [reg_p3+16], r10
+ mov [reg_p3+24], r11
+
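+ // mov does not modify CF, so the carry out of the adc chain above
+ // propagates directly into the adc instructions below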
+ mov r8, [reg_p1+32]
+ mov r9, [reg_p1+40]
+ mov r10, [reg_p1+48]
+ adc r8, [reg_p2+32]
+ adc r9, [reg_p2+40]
+ adc r10, [reg_p2+48]
+ mov [reg_p3+32], r8
+ mov [reg_p3+40], r9
+ mov [reg_p3+48], r10
+ ret
+
+//***************************************************************************
+// 2x434-bit multiprecision subtraction/addition
+// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. If c < 0, add p434*2^448
+//***************************************************************************
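+// A portable sketch of the same operation (cf. the C fallback mp_subaddfast in
+// sikep434r3_fpx.c): mask = 0 - mp_sub(a, b, c, 14); then add p434 & mask into
+// the upper half c[7..13].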
+#define mp_subadd434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_subadd434x2_asm)
+.global mp_subadd434x2_asm
+mp_subadd434x2_asm:
+ push r12
+ push r13
+ push r14
+ push r15
+ xor rax, rax
+ mov r8, [reg_p1]
+ mov r9, [reg_p1+8]
+ mov r10, [reg_p1+16]
+ mov r11, [reg_p1+24]
+ mov r12, [reg_p1+32]
+ sub r8, [reg_p2]
+ sbb r9, [reg_p2+8]
+ sbb r10, [reg_p2+16]
+ sbb r11, [reg_p2+24]
+ sbb r12, [reg_p2+32]
+ mov [reg_p3], r8
+ mov [reg_p3+8], r9
+ mov [reg_p3+16], r10
+ mov [reg_p3+24], r11
+ mov [reg_p3+32], r12
+
+ mov r8, [reg_p1+40]
+ mov r9, [reg_p1+48]
+ mov r10, [reg_p1+56]
+ mov r11, [reg_p1+64]
+ mov r12, [reg_p1+72]
+ sbb r8, [reg_p2+40]
+ sbb r9, [reg_p2+48]
+ sbb r10, [reg_p2+56]
+ sbb r11, [reg_p2+64]
+ sbb r12, [reg_p2+72]
+ mov [reg_p3+40], r8
+ mov [reg_p3+48], r9
+ mov [reg_p3+56], r10
+
+ mov r13, [reg_p1+80]
+ mov r14, [reg_p1+88]
+ mov r15, [reg_p1+96]
+ mov rcx, [reg_p1+104]
+ sbb r13, [reg_p2+80]
+ sbb r14, [reg_p2+88]
+ sbb r15, [reg_p2+96]
+ sbb rcx, [reg_p2+104]
+ sbb rax, 0
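+ // rax = 0 - borrow: all-ones mask if the subtraction underflowed (a < b), zero otherwise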
+
+ // Add p434 anded with the mask in rax
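+ // (the three lowest words of p434 are all 0xFFFFFFFFFFFFFFFF, so r8 is reused for words 0-2 below)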
+ mov r8, [rip+asm_p434]
+ mov r9, [rip+asm_p434+24]
+ mov r10, [rip+asm_p434+32]
+ mov rdi, [rip+asm_p434+40]
+ mov rsi, [rip+asm_p434+48]
+ and r8, rax
+ and r9, rax
+ and r10, rax
+ and rdi, rax
+ and rsi, rax
+ mov rax, [reg_p3+56]
+ add rax, r8
+ adc r11, r8
+ adc r12, r8
+ adc r13, r9
+ adc r14, r10
+ adc r15, rdi
+ adc rcx, rsi
+
+ mov [reg_p3+56], rax
+ mov [reg_p3+64], r11
+ mov [reg_p3+72], r12
+ mov [reg_p3+80], r13
+ mov [reg_p3+88], r14
+ mov [reg_p3+96], r15
+ mov [reg_p3+104], rcx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+
+//***********************************************************************
+// Double 2x434-bit multiprecision subtraction
+// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2]
+//***********************************************************************
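+// The portable fallback (mp_dblsubfast in sikep434r3_fpx.c) is simply two 14-word mp_sub
+// calls; here the borrow out of each lower-half sbb chain is saved in al/cl and
+// re-injected with bt for the upper half.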
+#define mp_dblsub434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_dblsub434x2_asm)
+.global mp_dblsub434x2_asm
+mp_dblsub434x2_asm:
+ push r12
+ push r13
+ push r14
+
+ mov r8, [reg_p3]
+ mov r9, [reg_p3+8]
+ mov r10, [reg_p3+16]
+ mov r11, [reg_p3+24]
+ mov r12, [reg_p3+32]
+ mov r13, [reg_p3+40]
+ mov r14, [reg_p3+48]
+ sub r8, [reg_p1]
+ sbb r9, [reg_p1+8]
+ sbb r10, [reg_p1+16]
+ sbb r11, [reg_p1+24]
+ sbb r12, [reg_p1+32]
+ sbb r13, [reg_p1+40]
+ sbb r14, [reg_p1+48]
+ setc al
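+ // al = borrow of the first chain; restored below with bt before the upper half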
+ sub r8, [reg_p2]
+ sbb r9, [reg_p2+8]
+ sbb r10, [reg_p2+16]
+ sbb r11, [reg_p2+24]
+ sbb r12, [reg_p2+32]
+ sbb r13, [reg_p2+40]
+ sbb r14, [reg_p2+48]
+ setc cl
+ mov [reg_p3], r8
+ mov [reg_p3+8], r9
+ mov [reg_p3+16], r10
+ mov [reg_p3+24], r11
+ mov [reg_p3+32], r12
+ mov [reg_p3+40], r13
+ mov [reg_p3+48], r14
+
+ mov r8, [reg_p3+56]
+ mov r9, [reg_p3+64]
+ mov r10, [reg_p3+72]
+ mov r11, [reg_p3+80]
+ mov r12, [reg_p3+88]
+ mov r13, [reg_p3+96]
+ mov r14, [reg_p3+104]
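+ // bt copies the saved borrow (bit 0 of rax/rcx) back into CF so each sbb chain resumes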
+ bt rax, 0
+ sbb r8, [reg_p1+56]
+ sbb r9, [reg_p1+64]
+ sbb r10, [reg_p1+72]
+ sbb r11, [reg_p1+80]
+ sbb r12, [reg_p1+88]
+ sbb r13, [reg_p1+96]
+ sbb r14, [reg_p1+104]
+ bt rcx, 0
+ sbb r8, [reg_p2+56]
+ sbb r9, [reg_p2+64]
+ sbb r10, [reg_p2+72]
+ sbb r11, [reg_p2+80]
+ sbb r12, [reg_p2+88]
+ sbb r13, [reg_p2+96]
+ sbb r14, [reg_p2+104]
+ mov [reg_p3+56], r8
+ mov [reg_p3+64], r9
+ mov [reg_p3+72], r10
+ mov [reg_p3+80], r11
+ mov [reg_p3+88], r12
+ mov [reg_p3+96], r13
+ mov [reg_p3+104], r14
+
+ pop r14
+ pop r13
+ pop r12
+ ret
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.h
new file mode 100644
index 0000000000..1753e25fb4
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.h
@@ -0,0 +1,38 @@
+/********************************************************************************************
+* Supersingular Isogeny Key Encapsulation Library
+*
+* Abstract: x86_64 assembly optimized modular arithmetic for P434
+*********************************************************************************************/
+
+#pragma once
+
+#if defined(S2N_SIKE_P434_R3_ASM)
+
+#define fpadd434_asm S2N_SIKE_P434_R3_NAMESPACE(fpadd434_asm)
+void fpadd434_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+#define fpsub434_asm S2N_SIKE_P434_R3_NAMESPACE(fpsub434_asm)
+void fpsub434_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+#define mul434_asm S2N_SIKE_P434_R3_NAMESPACE(mul434_asm)
+void mul434_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+#define rdc434_asm S2N_SIKE_P434_R3_NAMESPACE(rdc434_asm)
+void rdc434_asm(digit_t* ma, digit_t* mc);
+
+#define mp_add434_asm S2N_SIKE_P434_R3_NAMESPACE(mp_add434_asm)
+void mp_add434_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+#define mp_subadd434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_subadd434x2_asm)
+void mp_subadd434x2_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+#define mp_dblsub434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_dblsub434x2_asm)
+void mp_dblsub434x2_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+#define mp_sub434_p2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p2_asm)
+void mp_sub434_p2_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+#define mp_sub434_p4_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p4_asm)
+void mp_sub434_p4_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+#endif
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.c
new file mode 100644
index 0000000000..40c61144e4
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.c
@@ -0,0 +1,478 @@
+/********************************************************************************************
+* Supersingular Isogeny Key Encapsulation Library
+*
+* Abstract: core functions over GF(p) and GF(p^2)
+*********************************************************************************************/
+
+#include <string.h>
+#include "sikep434r3.h"
+#include "sikep434r3_fp.h"
+#include "sikep434r3_fpx.h"
+#include "pq-crypto/s2n_pq.h"
+#include "sikep434r3_fp_x64_asm.h"
+
+static void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc);
+static void to_mont(const felm_t a, felm_t mc);
+static void from_mont(const felm_t ma, felm_t c);
+static void fpsqr_mont(const felm_t ma, felm_t mc);
+static unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords);
+static void fpinv_chain_mont(felm_t a);
+static void fpinv_mont(felm_t a);
+static void to_fp2mont(const f2elm_t *a, f2elm_t *mc);
+static void from_fp2mont(const f2elm_t *ma, f2elm_t *c);
+
+/* Encoding digits to bytes according to endianness */
+__inline static void encode_to_bytes(const digit_t* x, unsigned char* enc, int nbytes)
+{
+ if (is_big_endian()) {
+ int ndigits = nbytes / sizeof(digit_t);
+ int rem = nbytes % sizeof(digit_t);
+
+ for (int i = 0; i < ndigits; i++) {
+ digit_t temp = S2N_SIKE_P434_R3_BSWAP_DIGIT(x[i]);
+ memcpy(enc + (i * sizeof(digit_t)), (unsigned char *)&temp, sizeof(digit_t));
+ }
+
+ if (rem) {
+ digit_t ld = S2N_SIKE_P434_R3_BSWAP_DIGIT(x[ndigits]);
+ memcpy(enc + ndigits * sizeof(digit_t), (unsigned char *) &ld, rem);
+ }
+ } else {
+ memcpy(enc, (const unsigned char *) x, nbytes);
+ }
+}
+
+/* Conversion of GF(p^2) element from Montgomery to standard representation,
+ * and encoding by removing leading 0 bytes */
+void fp2_encode(const f2elm_t *x, unsigned char *enc)
+{
+ f2elm_t t;
+
+ from_fp2mont(x, &t);
+ encode_to_bytes(t.e[0], enc, S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2);
+ encode_to_bytes(t.e[1], enc + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2, S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2);
+}
+
+/* Parse a byte sequence back into a GF(p^2) element, and convert to Montgomery representation */
+void fp2_decode(const unsigned char *x, f2elm_t *dec)
+{
+ decode_to_digits(x, dec->e[0], S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2, S2N_SIKE_P434_R3_NWORDS_FIELD);
+ decode_to_digits(x + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2, dec->e[1], S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2, S2N_SIKE_P434_R3_NWORDS_FIELD);
+ to_fp2mont(dec, dec);
+}
+
+/* Montgomery multiplication, mc = ma*mb*R^(-1) mod p. */
+static void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc)
+{
+ dfelm_t temp = {0};
+
+ mp_mul(ma, mb, temp, S2N_SIKE_P434_R3_NWORDS_FIELD);
+ rdc_mont(temp, mc);
+}
+
+/* Conversion to Montgomery representation,
+ * mc = a*R^2*R^(-1) mod p = a*R mod p, where a in [0, p-1].
+ * The Montgomery constant R^2 mod p is the global value "Montgomery_R2". */
+static void to_mont(const felm_t a, felm_t mc)
+{
+ fpmul_mont(a, (const digit_t*)&Montgomery_R2, mc);
+}
+
+/* Conversion from Montgomery representation to standard representation,
+ * c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1]. */
+static void from_mont(const felm_t ma, felm_t c)
+{
+ digit_t one[S2N_SIKE_P434_R3_NWORDS_FIELD] = {0};
+
+ one[0] = 1;
+ fpmul_mont(ma, one, c);
+ fpcorrection434(c);
+}
+
+/* Copy wordsize digits, c = a, where lng(a) = nwords. */
+void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords)
+{
+ unsigned int i;
+
+ for (i = 0; i < nwords; i++) {
+ c[i] = a[i];
+ }
+}
+
+/* Montgomery squaring, mc = ma^2*R^(-1) mod p. */
+static void fpsqr_mont(const felm_t ma, felm_t mc)
+{
+ dfelm_t temp = {0};
+
+ mp_mul(ma, ma, temp, S2N_SIKE_P434_R3_NWORDS_FIELD);
+ rdc_mont(temp, mc);
+}
+
+/* Copy a GF(p^2) element, c = a. */
+void fp2copy(const f2elm_t *a, f2elm_t *c)
+{
+ fpcopy(a->e[0], c->e[0]);
+ fpcopy(a->e[1], c->e[1]);
+}
+
+/* GF(p^2) division by two, c = a/2 in GF(p^2). */
+void fp2div2(const f2elm_t *a, f2elm_t *c)
+{
+ fpdiv2_434(a->e[0], c->e[0]);
+ fpdiv2_434(a->e[1], c->e[1]);
+}
+
+/* Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit. */
+unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords)
+{
+ unsigned int i, carry = 0;
+
+ for (i = 0; i < nwords; i++) {
+ S2N_SIKE_P434_R3_ADDC(carry, a[i], b[i], carry, c[i]);
+ }
+
+ return carry;
+}
+
+/* GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2).
+ * Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1]
+ * Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] */
+void fp2sqr_mont(const f2elm_t *a, f2elm_t *c)
+{
+ felm_t t1, t2, t3;
+
+ mp_addfast(a->e[0], a->e[1], t1); /* t1 = a0+a1 */
+ mp_sub434_p4(a->e[0], a->e[1], t2); /* t2 = a0-a1 */
+ mp_addfast(a->e[0], a->e[0], t3); /* t3 = 2a0 */
+ fpmul_mont(t1, t2, c->e[0]); /* c0 = (a0+a1)(a0-a1) */
+ fpmul_mont(t3, a->e[1], c->e[1]); /* c1 = 2a0*a1 */
+}
+
+/* Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit. */
+static unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords)
+{
+ unsigned int i, borrow = 0;
+
+ for (i = 0; i < nwords; i++) {
+ S2N_SIKE_P434_R3_SUBC(borrow, a[i], b[i], borrow, c[i]);
+ }
+
+ return borrow;
+}
+
+/* Multiprecision subtraction followed by addition with p*2^S2N_SIKE_P434_R3_MAXBITS_FIELD,
+ * c = a-b+(p*2^S2N_SIKE_P434_R3_MAXBITS_FIELD) if a-b < 0, otherwise c=a-b. */
+__inline static void mp_subaddfast(const digit_t* a, const digit_t* b, digit_t* c)
+{
+#if defined(S2N_SIKE_P434_R3_ASM)
+ if (s2n_sikep434r3_asm_is_enabled()) {
+ mp_subadd434x2_asm(a, b, c);
+ return;
+ }
+#endif
+
+ felm_t t1;
+
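+ /* mask = all-ones if a-b produced a final borrow (i.e. a < b), else zero */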
+ digit_t mask = 0 - (digit_t)mp_sub(a, b, c, 2*S2N_SIKE_P434_R3_NWORDS_FIELD);
+ for (int i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ t1[i] = ((const digit_t *) p434)[i] & mask;
+ }
+ mp_addfast((digit_t*)&c[S2N_SIKE_P434_R3_NWORDS_FIELD], t1, (digit_t*)&c[S2N_SIKE_P434_R3_NWORDS_FIELD]);
+}
+
+/* Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*S2N_SIKE_P434_R3_NWORDS_FIELD. */
+__inline static void mp_dblsubfast(const digit_t* a, const digit_t* b, digit_t* c)
+{
+#if defined(S2N_SIKE_P434_R3_ASM)
+ if (s2n_sikep434r3_asm_is_enabled()) {
+ mp_dblsub434x2_asm(a, b, c);
+ return;
+ }
+#endif
+
+ mp_sub(c, a, c, 2*S2N_SIKE_P434_R3_NWORDS_FIELD);
+ mp_sub(c, b, c, 2*S2N_SIKE_P434_R3_NWORDS_FIELD);
+}
+
+/* GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2).
+ * Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1]
+ * Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] */
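+/* One level of Karatsuba: three S2N_SIKE_P434_R3_NWORDS_FIELD-word multiplications instead of four. */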
+void fp2mul_mont(const f2elm_t *a, const f2elm_t *b, f2elm_t *c)
+{
+ felm_t t1, t2;
+ dfelm_t tt1, tt2, tt3;
+
+ mp_addfast(a->e[0], a->e[1], t1); /* t1 = a0+a1 */
+ mp_addfast(b->e[0], b->e[1], t2); /* t2 = b0+b1 */
+ mp_mul(a->e[0], b->e[0], tt1, S2N_SIKE_P434_R3_NWORDS_FIELD); /* tt1 = a0*b0 */
+ mp_mul(a->e[1], b->e[1], tt2, S2N_SIKE_P434_R3_NWORDS_FIELD); /* tt2 = a1*b1 */
+ mp_mul(t1, t2, tt3, S2N_SIKE_P434_R3_NWORDS_FIELD); /* tt3 = (a0+a1)*(b0+b1) */
+ mp_dblsubfast(tt1, tt2, tt3); /* tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 */
+ mp_subaddfast(tt1, tt2, tt1); /* tt1 = a0*b0 - a1*b1 + p*2^S2N_SIKE_P434_R3_MAXBITS_FIELD if a0*b0 - a1*b1 < 0, else tt1 = a0*b0 - a1*b1 */
+ rdc_mont(tt3, c->e[1]); /* c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 */
+ rdc_mont(tt1, c->e[0]); /* c[0] = a0*b0 - a1*b1 */
+}
+
+/* Chain to compute a^(p-3)/4 using Montgomery arithmetic. */
+static void fpinv_chain_mont(felm_t a)
+{
+ unsigned int i, j;
+ felm_t t[31], tt;
+
+ /* Precomputed table */
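+ /* t[i] = a^(2*i+3): the odd powers a^3, a^5, ..., a^63 used as multiplication windows below */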
+ fpsqr_mont(a, tt);
+ fpmul_mont(a, tt, t[0]);
+ for (i = 0; i <= 29; i++) {
+ fpmul_mont(t[i], tt, t[i + 1]);
+ }
+
+ fpcopy(a, tt);
+ for (i = 0; i < 7; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[5], tt, tt);
+ for (i = 0; i < 10; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[14], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[3], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[23], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[13], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[24], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[7], tt, tt);
+ for (i = 0; i < 8; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[12], tt, tt);
+ for (i = 0; i < 8; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[30], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[1], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[30], tt, tt);
+ for (i = 0; i < 7; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[21], tt, tt);
+ for (i = 0; i < 9; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[2], tt, tt);
+ for (i = 0; i < 9; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[19], tt, tt);
+ for (i = 0; i < 9; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[1], tt, tt);
+ for (i = 0; i < 7; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[24], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[26], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[16], tt, tt);
+ for (i = 0; i < 7; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[10], tt, tt);
+ for (i = 0; i < 7; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[6], tt, tt);
+ for (i = 0; i < 7; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[0], tt, tt);
+ for (i = 0; i < 9; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[20], tt, tt);
+ for (i = 0; i < 8; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[9], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[25], tt, tt);
+ for (i = 0; i < 9; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[30], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[26], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(a, tt, tt);
+ for (i = 0; i < 7; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[28], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[6], tt, tt);
+ for (i = 0; i < 6; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[10], tt, tt);
+ for (i = 0; i < 9; i++) {
+ fpsqr_mont(tt, tt);
+ }
+ fpmul_mont(t[22], tt, tt);
+ for (j = 0; j < 35; j++) {
+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+ fpmul_mont(t[30], tt, tt);
+ }
+ fpcopy(tt, a);
+}
+
+/* Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p. */
+static void fpinv_mont(felm_t a)
+{
+ felm_t tt;
+
+ fpcopy(a, tt);
+ fpinv_chain_mont(tt);
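+ /* tt = a^((p-3)/4); two squarings give a^(p-3), and the final multiply gives
+ * a^(p-2) = a^(-1) mod p by Fermat's little theorem (all in Montgomery form) */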
+ fpsqr_mont(tt, tt);
+ fpsqr_mont(tt, tt);
+ fpmul_mont(a, tt, a);
+}
+
+/* GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2). */
+void fp2inv_mont(f2elm_t *a)
+{
+ f2elm_t t1;
+
+ fpsqr_mont(a->e[0], t1.e[0]); /* t10 = a0^2 */
+ fpsqr_mont(a->e[1], t1.e[1]); /* t11 = a1^2 */
+ fpadd434(t1.e[0], t1.e[1], t1.e[0]); /* t10 = a0^2+a1^2 */
+ fpinv_mont(t1.e[0]); /* t10 = (a0^2+a1^2)^-1 */
+ fpneg434(a->e[1]); /* a = a0-i*a1 */
+ fpmul_mont(a->e[0], t1.e[0], a->e[0]);
+ fpmul_mont(a->e[1], t1.e[0], a->e[1]); /* a = (a0-i*a1)*(a0^2+a1^2)^-1 */
+}
+
+/* Conversion of a GF(p^2) element to Montgomery representation,
+ * mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2). */
+static void to_fp2mont(const f2elm_t *a, f2elm_t *mc)
+{
+ to_mont(a->e[0], mc->e[0]);
+ to_mont(a->e[1], mc->e[1]);
+}
+
+/* Conversion of a GF(p^2) element from Montgomery representation to standard representation,
+ * c_i = ma_i*R^(-1) = a_i in GF(p^2). */
+static void from_fp2mont(const f2elm_t *ma, f2elm_t *c)
+{
+ from_mont(ma->e[0], c->e[0]);
+ from_mont(ma->e[1], c->e[1]);
+}
+
+/* Multiprecision right shift by one. */
+void mp_shiftr1(digit_t* x, const unsigned int nwords)
+{
+ unsigned int i;
+
+ for (i = 0; i < nwords-1; i++) {
+ S2N_SIKE_P434_R3_SHIFTR(x[i+1], x[i], 1, x[i], S2N_SIKE_P434_R3_RADIX);
+ }
+ x[nwords-1] >>= 1;
+}
+
+void decode_to_digits(const unsigned char* x, digit_t* dec, int nbytes, int ndigits)
+{
+ dec[ndigits - 1] = 0; /* zero the top digit; nbytes may not fill it completely */
+ memcpy((unsigned char*)dec, x, nbytes);
+
+ if (is_big_endian()) {
+ for (int i = 0; i < ndigits; i++) {
+ dec[i] = S2N_SIKE_P434_R3_BSWAP_DIGIT(dec[i]);
+ }
+ }
+}
+
+void fpcopy(const felm_t a, felm_t c)
+{
+ unsigned int i;
+
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ c[i] = a[i];
+ }
+}
+
+void fpzero(felm_t a)
+{
+ unsigned int i;
+
+ for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) {
+ a[i] = 0;
+ }
+}
+
+void fp2add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c)
+{
+ fpadd434(a->e[0], b->e[0], c->e[0]);
+ fpadd434(a->e[1], b->e[1], c->e[1]);
+}
+
+void fp2sub(const f2elm_t *a, const f2elm_t *b, f2elm_t *c)
+{
+ fpsub434(a->e[0], b->e[0], c->e[0]);
+ fpsub434(a->e[1], b->e[1], c->e[1]);
+}
+
+void mp_addfast(const digit_t* a, const digit_t* b, digit_t* c)
+{
+#if defined(S2N_SIKE_P434_R3_ASM)
+ if (s2n_sikep434r3_asm_is_enabled()) {
+ mp_add434_asm(a, b, c);
+ return;
+ }
+#endif
+
+ mp_add(a, b, c, S2N_SIKE_P434_R3_NWORDS_FIELD);
+}
+
+void mp2_add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c)
+{
+ mp_addfast(a->e[0], b->e[0], c->e[0]);
+ mp_addfast(a->e[1], b->e[1], c->e[1]);
+}
+
+void mp2_sub_p2(const f2elm_t *a, const f2elm_t *b, f2elm_t *c)
+{
+ mp_sub434_p2(a->e[0], b->e[0], c->e[0]);
+ mp_sub434_p2(a->e[1], b->e[1], c->e[1]);
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.h
new file mode 100644
index 0000000000..bce1849ce1
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.h
@@ -0,0 +1,65 @@
+/********************************************************************************************
+* Supersingular Isogeny Key Encapsulation Library
+*
+* Abstract: core functions over GF(p) and GF(p^2)
+*********************************************************************************************/
+
+#pragma once
+
+#include <string.h>
+#include "sikep434r3.h"
+#include "sikep434r3_fp.h"
+
+#define fp2_encode S2N_SIKE_P434_R3_NAMESPACE(fp2_encode)
+void fp2_encode(const f2elm_t *x, unsigned char *enc);
+
+#define fp2_decode S2N_SIKE_P434_R3_NAMESPACE(fp2_decode)
+void fp2_decode(const unsigned char *x, f2elm_t *dec);
+
+#define copy_words S2N_SIKE_P434_R3_NAMESPACE(copy_words)
+void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords);
+
+#define fp2copy S2N_SIKE_P434_R3_NAMESPACE(fp2copy)
+void fp2copy(const f2elm_t *a, f2elm_t *c);
+
+#define fp2div2 S2N_SIKE_P434_R3_NAMESPACE(fp2div2)
+void fp2div2(const f2elm_t *a, f2elm_t *c);
+
+#define mp_add S2N_SIKE_P434_R3_NAMESPACE(mp_add)
+unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords);
+
+#define fp2sqr_mont S2N_SIKE_P434_R3_NAMESPACE(fp2sqr_mont)
+void fp2sqr_mont(const f2elm_t *a, f2elm_t *c);
+
+#define fp2mul_mont S2N_SIKE_P434_R3_NAMESPACE(fp2mul_mont)
+void fp2mul_mont(const f2elm_t *a, const f2elm_t *b, f2elm_t *c);
+
+#define fp2inv_mont S2N_SIKE_P434_R3_NAMESPACE(fp2inv_mont)
+void fp2inv_mont(f2elm_t *a);
+
+#define mp_shiftr1 S2N_SIKE_P434_R3_NAMESPACE(mp_shiftr1)
+void mp_shiftr1(digit_t* x, const unsigned int nwords);
+
+#define decode_to_digits S2N_SIKE_P434_R3_NAMESPACE(decode_to_digits)
+void decode_to_digits(const unsigned char* x, digit_t* dec, int nbytes, int ndigits);
+
+#define fpcopy S2N_SIKE_P434_R3_NAMESPACE(fpcopy)
+void fpcopy(const felm_t a, felm_t c);
+
+#define fpzero S2N_SIKE_P434_R3_NAMESPACE(fpzero)
+void fpzero(felm_t a);
+
+#define fp2add S2N_SIKE_P434_R3_NAMESPACE(fp2add)
+void fp2add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c);
+
+#define fp2sub S2N_SIKE_P434_R3_NAMESPACE(fp2sub)
+void fp2sub(const f2elm_t *a, const f2elm_t *b, f2elm_t *c);
+
+#define mp_addfast S2N_SIKE_P434_R3_NAMESPACE(mp_addfast)
+void mp_addfast(const digit_t* a, const digit_t* b, digit_t* c);
+
+#define mp2_add S2N_SIKE_P434_R3_NAMESPACE(mp2_add)
+void mp2_add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c);
+
+#define mp2_sub_p2 S2N_SIKE_P434_R3_NAMESPACE(mp2_sub_p2)
+void mp2_sub_p2(const f2elm_t *a, const f2elm_t *b, f2elm_t *c);
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_kem.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_kem.c
new file mode 100644
index 0000000000..b32add7723
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_kem.c
@@ -0,0 +1,112 @@
+/********************************************************************************************
+* Supersingular Isogeny Key Encapsulation Library
+*
+* Abstract: supersingular isogeny key encapsulation (SIKE) protocol
+*********************************************************************************************/
+
+#include <string.h>
+#include "sikep434r3.h"
+#include "sikep434r3_fips202.h"
+#include "utils/s2n_safety.h"
+#include "tls/s2n_kem.h"
+#include "pq-crypto/s2n_pq.h"
+#include "pq-crypto/s2n_pq_random.h"
+#include "sikep434r3_fpx.h"
+#include "sikep434r3_api.h"
+
+/* SIKE's key generation
+ * Outputs: secret key sk (S2N_SIKE_P434_R3_SECRET_KEY_BYTES = S2N_SIKE_P434_R3_MSG_BYTES + S2N_SIKE_P434_R3_SECRETKEY_B_BYTES + S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES bytes)
+ * public key pk (S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES bytes) */
+int s2n_sike_p434_r3_crypto_kem_keypair(unsigned char *pk, unsigned char *sk)
+{
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+
+ /* Generate lower portion of secret key sk <- s||SK */
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(sk, S2N_SIKE_P434_R3_MSG_BYTES));
+ POSIX_GUARD(random_mod_order_B(sk + S2N_SIKE_P434_R3_MSG_BYTES));
+
+ /* Generate public key pk */
+ EphemeralKeyGeneration_B(sk + S2N_SIKE_P434_R3_MSG_BYTES, pk);
+
+ /* Append public key pk to secret key sk */
+ memcpy(&sk[S2N_SIKE_P434_R3_MSG_BYTES + S2N_SIKE_P434_R3_SECRETKEY_B_BYTES], pk, S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES);
+
+ return S2N_SUCCESS;
+}
+
+/* SIKE's encapsulation
+ * Input: public key pk (S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES bytes)
+ * Outputs: shared secret ss (S2N_SIKE_P434_R3_SHARED_SECRET_BYTES bytes)
+ * ciphertext message ct (S2N_SIKE_P434_R3_CIPHERTEXT_BYTES = S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES + S2N_SIKE_P434_R3_MSG_BYTES bytes) */
+int s2n_sike_p434_r3_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk)
+{
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+
+ unsigned char ephemeralsk[S2N_SIKE_P434_R3_SECRETKEY_A_BYTES];
+ unsigned char jinvariant[S2N_SIKE_P434_R3_FP2_ENCODED_BYTES];
+ unsigned char h[S2N_SIKE_P434_R3_MSG_BYTES];
+ unsigned char temp[S2N_SIKE_P434_R3_CIPHERTEXT_BYTES+S2N_SIKE_P434_R3_MSG_BYTES];
+
+ /* Generate ephemeralsk <- G(m||pk) mod oA */
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(temp, S2N_SIKE_P434_R3_MSG_BYTES));
+ memcpy(&temp[S2N_SIKE_P434_R3_MSG_BYTES], pk, S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES);
+ shake256(ephemeralsk, S2N_SIKE_P434_R3_SECRETKEY_A_BYTES, temp, S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES+S2N_SIKE_P434_R3_MSG_BYTES);
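+ /* oA = 2^eA, so reduction mod oA amounts to masking off the top bits */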
+ ephemeralsk[S2N_SIKE_P434_R3_SECRETKEY_A_BYTES - 1] &= S2N_SIKE_P434_R3_MASK_ALICE;
+
+ /* Encrypt */
+ EphemeralKeyGeneration_A(ephemeralsk, ct);
+ EphemeralSecretAgreement_A(ephemeralsk, pk, jinvariant);
+ shake256(h, S2N_SIKE_P434_R3_MSG_BYTES, jinvariant, S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+ for (int i = 0; i < S2N_SIKE_P434_R3_MSG_BYTES; i++) {
+ ct[i + S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES] = temp[i] ^ h[i];
+ }
+
+ /* Generate shared secret ss <- H(m||ct) */
+ memcpy(&temp[S2N_SIKE_P434_R3_MSG_BYTES], ct, S2N_SIKE_P434_R3_CIPHERTEXT_BYTES);
+ shake256(ss, S2N_SIKE_P434_R3_SHARED_SECRET_BYTES, temp, S2N_SIKE_P434_R3_CIPHERTEXT_BYTES+S2N_SIKE_P434_R3_MSG_BYTES);
+
+ return S2N_SUCCESS;
+}
+
+/* SIKE's decapsulation
+ * Input: secret key sk (S2N_SIKE_P434_R3_SECRET_KEY_BYTES = S2N_SIKE_P434_R3_MSG_BYTES + S2N_SIKE_P434_R3_SECRETKEY_B_BYTES + S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES bytes)
+ * ciphertext message ct (S2N_SIKE_P434_R3_CIPHERTEXT_BYTES = S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES + S2N_SIKE_P434_R3_MSG_BYTES bytes)
+ * Outputs: shared secret ss (S2N_SIKE_P434_R3_SHARED_SECRET_BYTES bytes) */
+int s2n_sike_p434_r3_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk)
+{
+ POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED);
+
+ unsigned char ephemeralsk_[S2N_SIKE_P434_R3_SECRETKEY_A_BYTES];
+ unsigned char jinvariant_[S2N_SIKE_P434_R3_FP2_ENCODED_BYTES];
+ unsigned char h_[S2N_SIKE_P434_R3_MSG_BYTES];
+ unsigned char c0_[S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES];
+ unsigned char temp[S2N_SIKE_P434_R3_CIPHERTEXT_BYTES+S2N_SIKE_P434_R3_MSG_BYTES];
+
+ /* Decrypt */
+ EphemeralSecretAgreement_B(sk + S2N_SIKE_P434_R3_MSG_BYTES, ct, jinvariant_);
+ shake256(h_, S2N_SIKE_P434_R3_MSG_BYTES, jinvariant_, S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+ for (int i = 0; i < S2N_SIKE_P434_R3_MSG_BYTES; i++) {
+ temp[i] = ct[i + S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES] ^ h_[i];
+ }
+
+ /* Generate ephemeralsk_ <- G(m||pk) mod oA */
+ memcpy(&temp[S2N_SIKE_P434_R3_MSG_BYTES], &sk[S2N_SIKE_P434_R3_MSG_BYTES + S2N_SIKE_P434_R3_SECRETKEY_B_BYTES], S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES);
+ shake256(ephemeralsk_, S2N_SIKE_P434_R3_SECRETKEY_A_BYTES, temp, S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES+S2N_SIKE_P434_R3_MSG_BYTES);
+ ephemeralsk_[S2N_SIKE_P434_R3_SECRETKEY_A_BYTES - 1] &= S2N_SIKE_P434_R3_MASK_ALICE;
+
+ /* Generate shared secret ss <- H(m||ct), or output ss <- H(s||ct) in case of ct verification failure */
+ EphemeralKeyGeneration_A(ephemeralsk_, c0_);
+
+ /* Verify ciphertext.
+ * If c0_ and ct are NOT equal, decaps failed and we overwrite the shared secret
+ * with pseudorandom noise (ss = H(s||ct)) by performing the copy (dont_copy = false).
+ *
+ * If c0_ and ct are equal, then decaps succeeded and we skip the overwrite and output
+ * the actual shared secret: ss = H(m||ct) (dont_copy = true). */
+ bool dont_copy = s2n_constant_time_equals(c0_, ct, S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES);
+ POSIX_GUARD(s2n_constant_time_copy_or_dont(temp, sk, S2N_SIKE_P434_R3_MSG_BYTES, dont_copy));
+ memcpy(&temp[S2N_SIKE_P434_R3_MSG_BYTES], ct, S2N_SIKE_P434_R3_CIPHERTEXT_BYTES);
+ shake256(ss, S2N_SIKE_P434_R3_SHARED_SECRET_BYTES, temp, S2N_SIKE_P434_R3_CIPHERTEXT_BYTES+S2N_SIKE_P434_R3_MSG_BYTES);
+
+ return S2N_SUCCESS;
+}
diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_sidh.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_sidh.c
new file mode 100644
index 0000000000..f570e27e32
--- /dev/null
+++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_sidh.c
@@ -0,0 +1,310 @@
+/********************************************************************************************
+* Supersingular Isogeny Key Encapsulation Library
+*
+* Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH)
+*********************************************************************************************/
+
+#include "sikep434r3.h"
+#include "pq-crypto/s2n_pq_random.h"
+#include "utils/s2n_safety.h"
+#include "sikep434r3_fpx.h"
+#include "sikep434r3_ec_isogeny.h"
+#include "sikep434r3_api.h"
+
+/* Initialization of basis points */
+static void init_basis(const digit_t *gen, f2elm_t *XP, f2elm_t *XQ, f2elm_t *XR)
+{
+ fpcopy(gen, XP->e[0]);
+ fpcopy(gen + S2N_SIKE_P434_R3_NWORDS_FIELD, XP->e[1]);
+ fpcopy(gen + 2*S2N_SIKE_P434_R3_NWORDS_FIELD, XQ->e[0]);
+ fpcopy(gen + 3*S2N_SIKE_P434_R3_NWORDS_FIELD, XQ->e[1]);
+ fpcopy(gen + 4*S2N_SIKE_P434_R3_NWORDS_FIELD, XR->e[0]);
+ fpcopy(gen + 5*S2N_SIKE_P434_R3_NWORDS_FIELD, XR->e[1]);
+}
+
+/* Generation of Bob's secret key
+ * Outputs random value in [0, 2^Floor(Log(2, oB)) - 1] */
+int random_mod_order_B(unsigned char* random_digits)
+{
+ POSIX_GUARD_RESULT(s2n_get_random_bytes(random_digits, S2N_SIKE_P434_R3_SECRETKEY_B_BYTES));
+ random_digits[S2N_SIKE_P434_R3_SECRETKEY_B_BYTES-1] &= S2N_SIKE_P434_R3_MASK_BOB; /* Masking last byte */
+
+ return 0;
+}
+
+/* Alice's ephemeral public key generation
+ * Input: a private key PrivateKeyA in the range [0, 2^eA - 1].
+ * Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded
+ * by removing leading 0 bytes. */
+int EphemeralKeyGeneration_A(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA)
+{
+ point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE];
+ f2elm_t _XPA, _XQA, _XRA, coeff[3], _A24plus = {0}, _C24 = {0}, _A = {0};
+ f2elm_t *XPA=&_XPA, *XQA=&_XQA, *XRA=&_XRA, *A24plus=&_A24plus, *C24=&_C24, *A=&_A;
+ unsigned int i, row, m, tree_index = 0, pts_index[S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE], npts = 0, ii = 0;
+ digit_t SecretKeyA[S2N_SIKE_P434_R3_NWORDS_ORDER] = {0};
+
+ /* Initialize basis points */
+ init_basis((const digit_t*)A_gen, XPA, XQA, XRA);
+ init_basis((const digit_t*)B_gen, &phiP->X, &phiQ->X, &phiR->X);
+ fpcopy((const digit_t*)&Montgomery_one, (phiP->Z.e)[0]);
+ fpcopy((const digit_t*)&Montgomery_one, (phiQ->Z.e)[0]);
+ fpcopy((const digit_t*)&Montgomery_one, (phiR->Z.e)[0]);
+
+ /* Initialize constants: A24plus = A+2C, C24 = 4C, where A=6, C=1 */
+ fpcopy((const digit_t*)&Montgomery_one, A24plus->e[0]); /* A24plus = 1 */
+ mp2_add(A24plus, A24plus, A24plus); /* A24plus = 2 */
+ mp2_add(A24plus, A24plus, C24); /* C24 = 4 = 4C */
+ mp2_add(A24plus, C24, A); /* A = 6 */
+ mp2_add(C24, C24, A24plus); /* A24plus = 8 = A+2C */
+
+ /* Retrieve kernel point */
+ decode_to_digits(PrivateKeyA, SecretKeyA, S2N_SIKE_P434_R3_SECRETKEY_A_BYTES, S2N_SIKE_P434_R3_NWORDS_ORDER);
+ LADDER3PT(XPA, XQA, XRA, SecretKeyA, S2N_SIKE_P434_R3_ALICE, R, A);
+
+ /* Traverse tree */
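+ /* strat_Alice encodes the isogeny-computation strategy: at each step, descend m
+ * levels (2*m doublings) and stash branch points on the pts stack for later
+ * 4-isogeny evaluation */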
+ tree_index = 0;
+ for (row = 1; row < S2N_SIKE_P434_R3_MAX_ALICE; row++) {
+ while (tree_index < S2N_SIKE_P434_R3_MAX_ALICE-row) {
+ fp2copy(&R->X, &pts[npts]->X);
+ fp2copy(&R->Z, &pts[npts]->Z);
+ pts_index[npts++] = tree_index;
+ m = strat_Alice[ii++];
+ xDBLe(R, R, A24plus, C24, (int)(2*m));
+ tree_index += m;
+ }
+ get_4_isog(R, A24plus, C24, coeff);
+
+ for (i = 0; i < npts; i++) {
+ eval_4_isog(pts[i], coeff);
+ }
+ eval_4_isog(phiP, coeff);
+ eval_4_isog(phiQ, coeff);
+ eval_4_isog(phiR, coeff);
+
+ fp2copy(&pts[npts-1]->X, &R->X);
+ fp2copy(&pts[npts-1]->Z, &R->Z);
+ tree_index = pts_index[npts-1];
+ npts -= 1;
+ }
+
+ get_4_isog(R, A24plus, C24, coeff);
+ eval_4_isog(phiP, coeff);
+ eval_4_isog(phiQ, coeff);
+ eval_4_isog(phiR, coeff);
+
+ inv_3_way(&phiP->Z, &phiQ->Z, &phiR->Z);
+ fp2mul_mont(&phiP->X, &phiP->Z, &phiP->X);
+ fp2mul_mont(&phiQ->X, &phiQ->Z, &phiQ->X);
+ fp2mul_mont(&phiR->X, &phiR->Z, &phiR->X);
+
+ /* Format public key */
+ fp2_encode(&phiP->X, PublicKeyA);
+ fp2_encode(&phiQ->X, PublicKeyA + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+ fp2_encode(&phiR->X, PublicKeyA + 2*S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+
+ return 0;
+}
+
+/* Bob's ephemeral public key generation
+ * Input: a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1].
+ * Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded
+ * by removing leading 0 bytes. */
+int EphemeralKeyGeneration_B(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB)
+{
+ point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB];
+ f2elm_t _XPB, _XQB, _XRB, coeff[3], _A24plus = {0}, _A24minus = {0}, _A = {0};
+ f2elm_t *XPB=&_XPB, *XQB=&_XQB, *XRB=&_XRB, *A24plus=&_A24plus, *A24minus=&_A24minus, *A=&_A;
+
+ unsigned int i, row, m, tree_index = 0, pts_index[S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB], npts = 0, ii = 0;
+ digit_t SecretKeyB[S2N_SIKE_P434_R3_NWORDS_ORDER] = {0};
+
+ /* Initialize basis points */
+ init_basis((const digit_t*)B_gen, XPB, XQB, XRB);
+ init_basis((const digit_t*)A_gen, &phiP->X, &phiQ->X, &phiR->X);
+ fpcopy((const digit_t*)&Montgomery_one, (phiP->Z.e)[0]);
+ fpcopy((const digit_t*)&Montgomery_one, (phiQ->Z.e)[0]);
+ fpcopy((const digit_t*)&Montgomery_one, (phiR->Z.e)[0]);
+
+ /* Initialize constants: A24minus = A-2C, A24plus = A+2C, where A=6, C=1 */
+ fpcopy((const digit_t*)&Montgomery_one, A24plus->e[0]); /* A24plus = 1 */
+ mp2_add(A24plus, A24plus, A24plus); /* A24plus = 2 */
+ mp2_add(A24plus, A24plus, A24minus); /* A24minus = 4 = A-2C */
+ mp2_add(A24plus, A24minus, A); /* A = 6 */
+ mp2_add(A24minus, A24minus, A24plus); /* A24plus = 8 = A+2C */
+
+ /* Retrieve kernel point */
+ decode_to_digits(PrivateKeyB, SecretKeyB, S2N_SIKE_P434_R3_SECRETKEY_B_BYTES, S2N_SIKE_P434_R3_NWORDS_ORDER);
+ LADDER3PT(XPB, XQB, XRB, SecretKeyB, S2N_SIKE_P434_R3_BOB, R, A);
+
+ /* Traverse tree */
+ tree_index = 0;
+ for (row = 1; row < S2N_SIKE_P434_R3_MAX_BOB; row++) {
+ while (tree_index < S2N_SIKE_P434_R3_MAX_BOB-row) {
+ fp2copy(&R->X, &pts[npts]->X);
+ fp2copy(&R->Z, &pts[npts]->Z);
+ pts_index[npts++] = tree_index;
+ m = strat_Bob[ii++];
+ xTPLe(R, R, A24minus, A24plus, (int)m);
+ tree_index += m;
+ }
+ get_3_isog(R, A24minus, A24plus, coeff);
+
+ for (i = 0; i < npts; i++) {
+ eval_3_isog(pts[i], coeff);
+ }
+ eval_3_isog(phiP, coeff);
+ eval_3_isog(phiQ, coeff);
+ eval_3_isog(phiR, coeff);
+
+ fp2copy(&pts[npts-1]->X, &R->X);
+ fp2copy(&pts[npts-1]->Z, &R->Z);
+ tree_index = pts_index[npts-1];
+ npts -= 1;
+ }
+
+ get_3_isog(R, A24minus, A24plus, coeff);
+ eval_3_isog(phiP, coeff);
+ eval_3_isog(phiQ, coeff);
+ eval_3_isog(phiR, coeff);
+
+ inv_3_way(&phiP->Z, &phiQ->Z, &phiR->Z);
+ fp2mul_mont(&phiP->X, &phiP->Z, &phiP->X);
+ fp2mul_mont(&phiQ->X, &phiQ->Z, &phiQ->X);
+ fp2mul_mont(&phiR->X, &phiR->Z, &phiR->X);
+
+ /* Format public key */
+ fp2_encode(&phiP->X, PublicKeyB);
+ fp2_encode(&phiQ->X, PublicKeyB + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+ fp2_encode(&phiR->X, PublicKeyB + 2*S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+
+ return 0;
+}
+
+/* Alice's ephemeral shared secret computation
+ * It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB
+ * Inputs: Alice's PrivateKeyA is an integer in the range [0, oA-1].
+ * Bob's PublicKeyB consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes.
+ * Output: a shared secret SharedSecretA that consists of one element in GF(p^2) encoded
+ * by removing leading 0 bytes. */
+int EphemeralSecretAgreement_A(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB,
+ unsigned char* SharedSecretA)
+{
+ point_proj_t R, pts[S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE];
+ f2elm_t coeff[3], PKB[3], _jinv;
+ f2elm_t _A24plus = {0}, _C24 = {0}, _A = {0};
+ f2elm_t *jinv=&_jinv, *A24plus=&_A24plus, *C24=&_C24, *A=&_A;
+ unsigned int i, row, m, tree_index = 0, pts_index[S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE], npts = 0, ii = 0;
+ digit_t SecretKeyA[S2N_SIKE_P434_R3_NWORDS_ORDER] = {0};
+
+ /* Initialize images of Bob's basis */
+ fp2_decode(PublicKeyB, &PKB[0]);
+ fp2_decode(PublicKeyB + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES, &PKB[1]);
+ fp2_decode(PublicKeyB + 2*S2N_SIKE_P434_R3_FP2_ENCODED_BYTES, &PKB[2]);
+
+ /* Initialize constants: A24plus = A+2C, C24 = 4C, where C=1 */
+ get_A(&PKB[0], &PKB[1], &PKB[2], A);
+ mp_add((const digit_t*)&Montgomery_one, (const digit_t*)&Montgomery_one, C24->e[0], S2N_SIKE_P434_R3_NWORDS_FIELD); /* C24 = 2 */
+ mp2_add(A, C24, A24plus); /* A24plus = A+2C */
+ mp_add(C24->e[0], C24->e[0], C24->e[0], S2N_SIKE_P434_R3_NWORDS_FIELD); /* C24 = 4 = 4C */
+
+ /* Retrieve kernel point */
+ decode_to_digits(PrivateKeyA, SecretKeyA, S2N_SIKE_P434_R3_SECRETKEY_A_BYTES, S2N_SIKE_P434_R3_NWORDS_ORDER);
+ LADDER3PT(&PKB[0], &PKB[1], &PKB[2], SecretKeyA, S2N_SIKE_P434_R3_ALICE, R, A);
+
+ /* Traverse tree */
+ tree_index = 0;
+ for (row = 1; row < S2N_SIKE_P434_R3_MAX_ALICE; row++) {
+ while (tree_index < S2N_SIKE_P434_R3_MAX_ALICE-row) {
+ fp2copy(&R->X, &pts[npts]->X);
+ fp2copy(&R->Z, &pts[npts]->Z);
+ pts_index[npts++] = tree_index;
+ m = strat_Alice[ii++];
+ xDBLe(R, R, A24plus, C24, (int)(2*m));
+ tree_index += m;
+ }
+ get_4_isog(R, A24plus, C24, coeff);
+
+ for (i = 0; i < npts; i++) {
+ eval_4_isog(pts[i], coeff);
+ }
+
+ fp2copy(&pts[npts-1]->X, &R->X);
+ fp2copy(&pts[npts-1]->Z, &R->Z);
+ tree_index = pts_index[npts-1];
+ npts -= 1;
+ }
+
+ get_4_isog(R, A24plus, C24, coeff);
+ mp2_add(A24plus, A24plus, A24plus); /* A24plus = 2A+4C */
+ fp2sub(A24plus, C24, A24plus); /* A24plus = 2A */
+ fp2add(A24plus, A24plus, A24plus); /* A24plus = 4A */
+ j_inv(A24plus, C24, jinv); /* j_inv(4A : 4C) = j_inv(A : C) */
+ fp2_encode(jinv, SharedSecretA); /* Format shared secret */
+
+ return 0;
+}
+
+/* Bob's ephemeral shared secret computation
+ * It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA
+ * Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,oB)) - 1].
+ * Alice's PublicKeyA consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes.
+ * Output: a shared secret SharedSecretB that consists of one element in GF(p^2) encoded
+ * by removing leading 0 bytes. */
+int EphemeralSecretAgreement_B(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA,
+ unsigned char* SharedSecretB)
+{
+ point_proj_t R, pts[S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB];
+ f2elm_t coeff[3], PKB[3], _jinv;
+ f2elm_t _A24plus = {0}, _A24minus = {0}, _A = {0};
+ f2elm_t *jinv=&_jinv, *A24plus=&_A24plus, *A24minus=&_A24minus, *A=&_A;
+ unsigned int i, row, m, tree_index = 0, pts_index[S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB], npts = 0, ii = 0;
+ digit_t SecretKeyB[S2N_SIKE_P434_R3_NWORDS_ORDER] = {0};
+
+ /* Initialize images of Alice's basis */
+ fp2_decode(PublicKeyA, &PKB[0]);
+ fp2_decode(PublicKeyA + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES, &PKB[1]);
+ fp2_decode(PublicKeyA + 2*S2N_SIKE_P434_R3_FP2_ENCODED_BYTES, &PKB[2]);
+
+ /* Initialize constants: A24plus = A+2C, A24minus = A-2C, where C=1 */
+ get_A(&PKB[0], &PKB[1], &PKB[2], A);
+ mp_add((const digit_t*)&Montgomery_one, (const digit_t*)&Montgomery_one, A24minus->e[0], S2N_SIKE_P434_R3_NWORDS_FIELD); /* A24minus = 2 */
+ mp2_add(A, A24minus, A24plus); /* A24plus = A+2C */
+ mp2_sub_p2(A, A24minus, A24minus); /* A24minus = A-2C */
+
+ /* Retrieve kernel point */
+ decode_to_digits(PrivateKeyB, SecretKeyB, S2N_SIKE_P434_R3_SECRETKEY_B_BYTES, S2N_SIKE_P434_R3_NWORDS_ORDER);
+ LADDER3PT(&PKB[0], &PKB[1], &PKB[2], SecretKeyB, S2N_SIKE_P434_R3_BOB, R, A);
+
+ /* Traverse tree */
+ tree_index = 0;
+ for (row = 1; row < S2N_SIKE_P434_R3_MAX_BOB; row++) {
+ while (tree_index < S2N_SIKE_P434_R3_MAX_BOB-row) {
+ fp2copy(&R->X, &pts[npts]->X);
+ fp2copy(&R->Z, &pts[npts]->Z);
+ pts_index[npts++] = tree_index;
+ m = strat_Bob[ii++];
+ xTPLe(R, R, A24minus, A24plus, (int)m);
+ tree_index += m;
+ }
+ get_3_isog(R, A24minus, A24plus, coeff);
+
+ for (i = 0; i < npts; i++) {
+ eval_3_isog(pts[i], coeff);
+ }
+
+ fp2copy(&pts[npts-1]->X, &R->X);
+ fp2copy(&pts[npts-1]->Z, &R->Z);
+ tree_index = pts_index[npts-1];
+ npts -= 1;
+ }
+
+ get_3_isog(R, A24minus, A24plus, coeff);
+ fp2add(A24plus, A24minus, A); /* A = 2A */
+ fp2add(A, A, A); /* A = 4A */
+ fp2sub(A24plus, A24minus, A24plus); /* A24plus = 4C */
+ j_inv(A, A24plus, jinv); /* j_inv(4A : 4C) = j_inv(A : C) */
+ fp2_encode(jinv, SharedSecretB); /* Format shared secret */
+
+ return 0;
+}