author    | thegeorg <thegeorg@yandex-team.ru> | 2022-05-10 22:16:03 +0300
committer | thegeorg <thegeorg@yandex-team.ru> | 2022-05-10 22:16:03 +0300
commit    | 09c71d918d4d0b0ebf67e1ab41aa90ddf587a3f2 (patch)
tree      | dd44d2cb68e2845c2d4c367b66893f3e043a6e8e /contrib/restricted/aws/s2n/pq-crypto
parent    | 5eb4a8a2d487411924e1d1b27c454223dcf35005 (diff)
download  | ydb-09c71d918d4d0b0ebf67e1ab41aa90ddf587a3f2.tar.gz
Update contrib/restricted/aws/s2n to 1.3.12
ref:f8279d764b4c00974a63543a1364c91e2b81b7a6
Diffstat (limited to 'contrib/restricted/aws/s2n/pq-crypto')
136 files changed, 14877 insertions, 3147 deletions
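Aside from the new bike_r3 implementation, most of the bike_r1/bike_r2 churn below is a mechanical rename of s2n's error-handling macros: GUARD becomes POSIX_GUARD, ENSURE_POSIX becomes POSIX_ENSURE, and notnull_check becomes POSIX_ENSURE_REF. The sketch below illustrates the early-return guard pattern these calls rely on; the macro bodies are simplified stand-ins written for this note, not s2n's actual definitions (those live in utils/s2n_safety.h upstream).

```c
/* Illustrative only: simplified stand-ins for the s2n-style guard macros
 * used throughout this diff. The real macros carry extra bookkeeping
 * (s2n_errno, debug info) that is omitted here. */
#include <stdio.h>

#define SUCCESS 0
#define FAILURE (-1)

/* Propagate any negative return value from a callee to our caller. */
#define POSIX_GUARD(expr)     \
    do {                      \
        if ((expr) < 0) {     \
            return FAILURE;   \
        }                     \
    } while (0)

/* Fail (and log the symbolic error) when a condition does not hold. */
#define POSIX_ENSURE(cond, err_code)                   \
    do {                                               \
        if (!(cond)) {                                 \
            fprintf(stderr, "error: %s\n", #err_code); \
            return FAILURE;                            \
        }                                              \
    } while (0)

/* Reject NULL pointers before they are dereferenced. */
#define POSIX_ENSURE_REF(ptr) POSIX_ENSURE((ptr) != NULL, S2N_ERR_NULL)

static int step(int x)
{
    return (x >= 0) ? SUCCESS : FAILURE;
}

int do_work(const int *input)
{
    POSIX_ENSURE_REF(input);   /* NULL check, as in the KEM entry points */
    POSIX_GUARD(step(*input)); /* early return on any callee failure */
    return SUCCESS;
}

int main(void)
{
    int value = 1;
    printf("do_work returned %d\n", do_work(&value));
    return 0;
}
```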
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/aes_ctr_prf.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/aes_ctr_prf.c index 26c99bc80d..2f211010df 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/aes_ctr_prf.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/aes_ctr_prf.c @@ -27,7 +27,7 @@ init_aes_ctr_prf_state(OUT aes_ctr_prf_state_t *s, bike_static_assert(sizeof(*seed) == sizeof(key.raw), seed_size_equals_ky_size); memcpy(key.raw, seed->raw, sizeof(key.raw)); - GUARD(aes256_key_expansion(&s->ks_ptr, &key)); + POSIX_GUARD(aes256_key_expansion(&s->ks_ptr, &key)); // Initialize buffer and counter s->ctr.u.qw[0] = 0; @@ -59,7 +59,7 @@ perform_aes(OUT uint8_t *ct, IN OUT aes_ctr_prf_state_t *s) BIKE_ERROR(E_AES_OVER_USED); } - GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks_ptr)); + POSIX_GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks_ptr)); s->ctr.u.qw[0]++; s->rem_invokations--; @@ -91,11 +91,11 @@ aes_ctr_prf(OUT uint8_t *a, IN OUT aes_ctr_prf_state_t *s, IN const uint32_t len // Copy full AES blocks while((len - idx) >= AES256_BLOCK_SIZE) { - GUARD(perform_aes(&a[idx], s)); + POSIX_GUARD(perform_aes(&a[idx], s)); idx += AES256_BLOCK_SIZE; } - GUARD(perform_aes(s->buffer.u.bytes, s)); + POSIX_GUARD(perform_aes(s->buffer.u.bytes, s)); // Copy the tail s->pos = len - idx; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/bike_r1_kem.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/bike_r1_kem.c index 21b0b6f5a3..ba43098837 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/bike_r1_kem.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/bike_r1_kem.c @@ -78,18 +78,18 @@ encrypt(OUT ct_t *ct, p_pk[1].val = pk->val[1]; DMSG(" Sampling m.\n"); - GUARD(sample_uniform_r_bits(&m.val, seed, NO_RESTRICTION)); + POSIX_GUARD(sample_uniform_r_bits(&m.val, seed, NO_RESTRICTION)); DMSG(" Calculating the ciphertext.\n"); - GUARD(gf2x_mod_mul((uint64_t *)&p_ct[0], (uint64_t *)&m, (uint64_t *)&p_pk[0])); - GUARD(gf2x_mod_mul((uint64_t *)&p_ct[1], (uint64_t *)&m, (uint64_t *)&p_pk[1])); + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_ct[0], (uint64_t *)&m, (uint64_t *)&p_pk[0])); + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_ct[1], (uint64_t *)&m, (uint64_t *)&p_pk[1])); DMSG(" Addding Error to the ciphertext.\n"); - GUARD( + POSIX_GUARD( gf2x_add(p_ct[0].val.raw, p_ct[0].val.raw, splitted_e->val[0].raw, R_SIZE)); - GUARD( + POSIX_GUARD( gf2x_add(p_ct[1].val.raw, p_ct[1].val.raw, splitted_e->val[1].raw, R_SIZE)); // Copy the data outside @@ -113,12 +113,12 @@ calc_pk(OUT pk_t *pk, IN const seed_t *g_seed, IN const pad_sk_t p_sk) // Intialized padding to zero DEFER_CLEANUP(padded_r_t g = {0}, padded_r_cleanup); - GUARD(sample_uniform_r_bits(&g.val, g_seed, MUST_BE_ODD)); + POSIX_GUARD(sample_uniform_r_bits(&g.val, g_seed, MUST_BE_ODD)); // Calculate (g0, g1) = (g*h1, g*h0) - GUARD(gf2x_mod_mul((uint64_t *)&p_pk[0], (const uint64_t *)&g, + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_pk[0], (const uint64_t *)&g, (const uint64_t *)&p_sk[1])); - GUARD(gf2x_mod_mul((uint64_t *)&p_pk[1], (const uint64_t *)&g, + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_pk[1], (const uint64_t *)&g, (const uint64_t *)&p_sk[0])); // Copy the data to the output parameters. 
@@ -156,7 +156,7 @@ get_ss(OUT ss_t *out, IN const e_t *e) int BIKE1_L1_R1_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); // Convert to this implementation types pk_t *l_pk = (pk_t *)pk; @@ -177,14 +177,14 @@ BIKE1_L1_R1_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) DMSG(" Calculating the secret key.\n"); // h0 and h1 use the same context - GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); + POSIX_GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); - GUARD(generate_sparse_rep((uint64_t *)&p_sk[0], l_sk.wlist[0].val, DV, R_BITS, + POSIX_GUARD(generate_sparse_rep((uint64_t *)&p_sk[0], l_sk.wlist[0].val, DV, R_BITS, sizeof(p_sk[0]), &h_prf_state)); // Copy data l_sk.bin[0] = p_sk[0].val; - GUARD(generate_sparse_rep((uint64_t *)&p_sk[1], l_sk.wlist[1].val, DV, R_BITS, + POSIX_GUARD(generate_sparse_rep((uint64_t *)&p_sk[1], l_sk.wlist[1].val, DV, R_BITS, sizeof(p_sk[1]), &h_prf_state)); // Copy data @@ -192,7 +192,7 @@ BIKE1_L1_R1_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) DMSG(" Calculating the public key.\n"); - GUARD(calc_pk(l_pk, &seeds.seed[1], p_sk)); + POSIX_GUARD(calc_pk(l_pk, &seeds.seed[1], p_sk)); memcpy(sk, &l_sk, sizeof(l_sk)); @@ -214,7 +214,7 @@ BIKE1_L1_R1_crypto_kem_enc(OUT unsigned char * ct, IN const unsigned char *pk) { DMSG(" Enter crypto_kem_enc.\n"); - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); // Convert to this implementation types const pk_t *l_pk = (const pk_t *)pk; @@ -231,11 +231,11 @@ BIKE1_L1_R1_crypto_kem_enc(OUT unsigned char * ct, // Random data generator // Using first seed - GUARD(init_aes_ctr_prf_state(&e_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); + POSIX_GUARD(init_aes_ctr_prf_state(&e_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); DMSG(" Generating error.\n"); ALIGN(8) compressed_idx_t_t dummy; - GUARD(generate_sparse_rep((uint64_t *)&e, dummy.val, T1, N_BITS, sizeof(e), + POSIX_GUARD(generate_sparse_rep((uint64_t *)&e, dummy.val, T1, N_BITS, sizeof(e), &e_prf_state)); print("e: ", (uint64_t *)&e.val, sizeof(e) * 8); @@ -250,7 +250,7 @@ BIKE1_L1_R1_crypto_kem_enc(OUT unsigned char * ct, // Computing ct = enc(pk, e) // Using second seed DMSG(" Encrypting.\n"); - GUARD(encrypt(l_ct, l_pk, &seeds.seed[1], &splitted_e)); + POSIX_GUARD(encrypt(l_ct, l_pk, &seeds.seed[1], &splitted_e)); DMSG(" Generating shared secret.\n"); get_ss(l_ss, &e.val); @@ -269,7 +269,7 @@ BIKE1_L1_R1_crypto_kem_dec(OUT unsigned char * ss, IN const unsigned char *sk) { DMSG(" Enter crypto_kem_dec.\n"); - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); // Convert to this implementation types const ct_t *l_ct = (const ct_t *)ct; @@ -284,10 +284,10 @@ BIKE1_L1_R1_crypto_kem_dec(OUT unsigned char * ss, DEFER_CLEANUP(e_t merged_e = {0}, e_cleanup); DMSG(" Computing s.\n"); - GUARD(compute_syndrome(&syndrome, l_ct, &l_sk)); + POSIX_GUARD(compute_syndrome(&syndrome, l_ct, &l_sk)); DMSG(" Decoding.\n"); - GUARD(decode(&e, &syndrome, l_ct, &l_sk)); + POSIX_GUARD(decode(&e, &syndrome, l_ct, &l_sk)); // Check if the error weight equals T1 if(T1 != r_bits_vector_weight(&e.val[0]) + r_bits_vector_weight(&e.val[1])) diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/decode.c 
b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/decode.c index 404c6377da..b455cd7e82 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/decode.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/decode.c @@ -96,12 +96,12 @@ compute_syndrome(OUT syndrome_t *syndrome, IN const ct_t *ct, IN const sk_t *sk) pad_ct[1].val = ct->val[1]; // Compute s = c0*h0 + c1*h1: - GUARD(gf2x_mod_mul((uint64_t *)&pad_s[0], (uint64_t *)&pad_ct[0], + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&pad_s[0], (uint64_t *)&pad_ct[0], (uint64_t *)&pad_sk[0])); - GUARD(gf2x_mod_mul((uint64_t *)&pad_s[1], (uint64_t *)&pad_ct[1], + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&pad_s[1], (uint64_t *)&pad_ct[1], (uint64_t *)&pad_sk[1])); - GUARD(gf2x_add(pad_s[0].val.raw, pad_s[0].val.raw, pad_s[1].val.raw, R_SIZE)); + POSIX_GUARD(gf2x_add(pad_s[0].val.raw, pad_s[0].val.raw, pad_s[1].val.raw, R_SIZE)); memcpy((uint8_t *)syndrome->qw, pad_s[0].val.raw, R_SIZE); dup(syndrome); @@ -118,13 +118,13 @@ recompute_syndrome(OUT syndrome_t *syndrome, ct_t tmp_ct = *ct; // Adapt the ciphertext - GUARD(gf2x_add(tmp_ct.val[0].raw, tmp_ct.val[0].raw, splitted_e->val[0].raw, + POSIX_GUARD(gf2x_add(tmp_ct.val[0].raw, tmp_ct.val[0].raw, splitted_e->val[0].raw, R_SIZE)); - GUARD(gf2x_add(tmp_ct.val[1].raw, tmp_ct.val[1].raw, splitted_e->val[1].raw, + POSIX_GUARD(gf2x_add(tmp_ct.val[1].raw, tmp_ct.val[1].raw, splitted_e->val[1].raw, R_SIZE)); // Recompute the syndrome - GUARD(compute_syndrome(syndrome, &tmp_ct, sk)); + POSIX_GUARD(compute_syndrome(syndrome, &tmp_ct, sk)); return SUCCESS; } @@ -334,7 +334,7 @@ decode(OUT split_e_t *e, DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); find_err1(e, &black_e, &gray_e, &s, sk->wlist, threshold); - GUARD(recompute_syndrome(&s, ct, sk, e)); + POSIX_GUARD(recompute_syndrome(&s, ct, sk, e)); #ifdef BGF_DECODER if(iter >= 1) { @@ -346,14 +346,14 @@ decode(OUT split_e_t *e, DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); find_err2(e, &black_e, &s, sk->wlist, ((DV + 1) / 2) + 1); - GUARD(recompute_syndrome(&s, ct, sk, e)); + POSIX_GUARD(recompute_syndrome(&s, ct, sk, e)); DMSG(" Weight of e: %lu\n", r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1])); DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); find_err2(e, &gray_e, &s, sk->wlist, ((DV + 1) / 2) + 1); - GUARD(recompute_syndrome(&s, ct, sk, e)); + POSIX_GUARD(recompute_syndrome(&s, ct, sk, e)); } if(r_bits_vector_weight((r_t *)s.qw) > 0) diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/openssl_utils.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/openssl_utils.c index 09e0af3fde..c80d3365cb 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/openssl_utils.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/openssl_utils.c @@ -108,15 +108,15 @@ ossl_add(OUT uint8_t res_bin[R_SIZE], BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); } - GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); - GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); if(BN_GF2m_add(r, a, b) == 0) { BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); } - GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); + POSIX_GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); return SUCCESS; } @@ -176,10 +176,10 @@ cyclic_product(OUT uint8_t res_bin[R_SIZE], BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); } - GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); - GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); - GUARD(ossl_cyclic_product(r, a, b, bn_ctx)); - GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); + 
POSIX_GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); + POSIX_GUARD(ossl_cyclic_product(r, a, b, bn_ctx)); + POSIX_GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); return SUCCESS; } diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.c index 3686338fad..d08fa5eea7 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.c @@ -20,7 +20,7 @@ get_rand_mod_len(OUT uint32_t * rand_pos, do { // Generate 128bit of random numbers - GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos))); + POSIX_GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos))); // Mask only relevant bits (*rand_pos) &= mask; @@ -56,7 +56,7 @@ sample_uniform_r_bits_with_fixed_prf_context(OUT r_t *r, IN const must_be_odd_t must_be_odd) { // Generate random data - GUARD(aes_ctr_prf(r->raw, prf_state, R_SIZE)); + POSIX_GUARD(aes_ctr_prf(r->raw, prf_state, R_SIZE)); // Mask upper bits of the MSByte r->raw[R_SIZE - 1] &= MASK(R_BITS + 8 - (R_SIZE * 8)); @@ -104,7 +104,7 @@ generate_sparse_rep(OUT uint64_t * a, // Generate weight rand numbers do { - GUARD(get_rand_mod_len(&wlist[ctr], len, prf_state)); + POSIX_GUARD(get_rand_mod_len(&wlist[ctr], len, prf_state)); ctr += is_new(wlist, ctr); } while(ctr < weight); diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.h index 1ffd56f34a..4ec60683de 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.h +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.h @@ -53,9 +53,9 @@ sample_uniform_r_bits(OUT r_t *r, // For the seedexpander DEFER_CLEANUP(aes_ctr_prf_state_t prf_state = {0}, aes_ctr_prf_state_cleanup); - GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed)); + POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed)); - GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd)); + POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd)); return SUCCESS; } diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/aes_ctr_prf.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/aes_ctr_prf.c index 26c99bc80d..2f211010df 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/aes_ctr_prf.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/aes_ctr_prf.c @@ -27,7 +27,7 @@ init_aes_ctr_prf_state(OUT aes_ctr_prf_state_t *s, bike_static_assert(sizeof(*seed) == sizeof(key.raw), seed_size_equals_ky_size); memcpy(key.raw, seed->raw, sizeof(key.raw)); - GUARD(aes256_key_expansion(&s->ks_ptr, &key)); + POSIX_GUARD(aes256_key_expansion(&s->ks_ptr, &key)); // Initialize buffer and counter s->ctr.u.qw[0] = 0; @@ -59,7 +59,7 @@ perform_aes(OUT uint8_t *ct, IN OUT aes_ctr_prf_state_t *s) BIKE_ERROR(E_AES_OVER_USED); } - GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks_ptr)); + POSIX_GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks_ptr)); s->ctr.u.qw[0]++; s->rem_invokations--; @@ -91,11 +91,11 @@ aes_ctr_prf(OUT uint8_t *a, IN OUT aes_ctr_prf_state_t *s, IN const uint32_t len // Copy full AES blocks while((len - idx) >= AES256_BLOCK_SIZE) { - GUARD(perform_aes(&a[idx], s)); + POSIX_GUARD(perform_aes(&a[idx], s)); idx += AES256_BLOCK_SIZE; } - GUARD(perform_aes(s->buffer.u.bytes, s)); + POSIX_GUARD(perform_aes(s->buffer.u.bytes, s)); // Copy the tail s->pos = len - idx; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/bike_r2_kem.c 
b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/bike_r2_kem.c index 8f29f3add9..e7797848a0 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/bike_r2_kem.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/bike_r2_kem.c @@ -61,12 +61,12 @@ calc_pk(OUT pk_t *pk, IN const seed_t *g_seed, IN const pad_sk_t p_sk) // Intialized padding to zero DEFER_CLEANUP(padded_r_t g = {0}, padded_r_cleanup); - GUARD(sample_uniform_r_bits(&g.val, g_seed, MUST_BE_ODD)); + POSIX_GUARD(sample_uniform_r_bits(&g.val, g_seed, MUST_BE_ODD)); // Calculate (g0, g1) = (g*h1, g*h0) - GUARD(gf2x_mod_mul((uint64_t *)&p_pk[0], (const uint64_t *)&g, + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_pk[0], (const uint64_t *)&g, (const uint64_t *)&p_sk[1])); - GUARD(gf2x_mod_mul((uint64_t *)&p_pk[1], (const uint64_t *)&g, + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_pk[1], (const uint64_t *)&g, (const uint64_t *)&p_sk[0])); // Copy the data to the output parameters. @@ -102,12 +102,12 @@ function_h(OUT split_e_t *splitted_e, IN const r_t *in0, IN const r_t *in1) // Use the seed to generate a sparse error vector e: DMSG(" Generating random error.\n"); - GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, &seed_for_hash)); + POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, &seed_for_hash)); DEFER_CLEANUP(padded_e_t e, padded_e_cleanup); DEFER_CLEANUP(ALIGN(8) compressed_idx_t_t dummy, compressed_idx_t_cleanup); - GUARD(generate_sparse_rep((uint64_t *)&e, dummy.val, T1, N_BITS, sizeof(e), + POSIX_GUARD(generate_sparse_rep((uint64_t *)&e, dummy.val, T1, N_BITS, sizeof(e), &prf_state)); split_e(splitted_e, &e.val); @@ -120,7 +120,7 @@ encrypt(OUT ct_t *ct, OUT split_e_t *mf, IN const pk_t *pk, IN const seed_t *see DEFER_CLEANUP(padded_r_t m = {0}, padded_r_cleanup); DMSG(" Sampling m.\n"); - GUARD(sample_uniform_r_bits(&m.val, seed, NO_RESTRICTION)); + POSIX_GUARD(sample_uniform_r_bits(&m.val, seed, NO_RESTRICTION)); // Pad the public key pad_pk_t p_pk = {0}; @@ -135,20 +135,20 @@ encrypt(OUT ct_t *ct, OUT split_e_t *mf, IN const pk_t *pk, IN const seed_t *see DEFER_CLEANUP(dbl_pad_ct_t mf_int = {0}, dbl_pad_ct_cleanup); DMSG(" Computing m*f0 and m*f1.\n"); - GUARD( + POSIX_GUARD( gf2x_mod_mul((uint64_t *)&mf_int[0], (uint64_t *)&m, (uint64_t *)&p_pk[0])); - GUARD( + POSIX_GUARD( gf2x_mod_mul((uint64_t *)&mf_int[1], (uint64_t *)&m, (uint64_t *)&p_pk[1])); DEFER_CLEANUP(split_e_t splitted_e, split_e_cleanup); DMSG(" Computing the hash function e <- H(m*f0, m*f1).\n"); - GUARD(function_h(&splitted_e, &mf_int[0].val, &mf_int[1].val)); + POSIX_GUARD(function_h(&splitted_e, &mf_int[0].val, &mf_int[1].val)); DMSG(" Addding Error to the ciphertext.\n"); - GUARD(gf2x_add(p_ct[0].val.raw, mf_int[0].val.raw, splitted_e.val[0].raw, + POSIX_GUARD(gf2x_add(p_ct[0].val.raw, mf_int[0].val.raw, splitted_e.val[0].raw, R_SIZE)); - GUARD(gf2x_add(p_ct[1].val.raw, mf_int[1].val.raw, splitted_e.val[1].raw, + POSIX_GUARD(gf2x_add(p_ct[1].val.raw, mf_int[1].val.raw, splitted_e.val[1].raw, R_SIZE)); // Copy the data to the output parameters. 
@@ -174,11 +174,11 @@ reencrypt(OUT pad_ct_t ce, IN const ct_t *l_ct) { // Compute (c0 + e0') and (c1 + e1') - GUARD(gf2x_add(ce[0].val.raw, l_ct->val[0].raw, e->val[0].raw, R_SIZE)); - GUARD(gf2x_add(ce[1].val.raw, l_ct->val[1].raw, e->val[1].raw, R_SIZE)); + POSIX_GUARD(gf2x_add(ce[0].val.raw, l_ct->val[0].raw, e->val[0].raw, R_SIZE)); + POSIX_GUARD(gf2x_add(ce[1].val.raw, l_ct->val[1].raw, e->val[1].raw, R_SIZE)); // (e0'', e1'') <-- H(c0 + e0', c1 + e1') - GUARD(function_h(e2, &ce[0].val, &ce[1].val)); + POSIX_GUARD(function_h(e2, &ce[0].val, &ce[1].val)); return SUCCESS; } @@ -212,10 +212,10 @@ get_ss(OUT ss_t *out, IN const r_t *in0, IN const r_t *in1, IN const ct_t *ct) int BIKE1_L1_R2_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); - notnull_check(sk); - notnull_check(pk); + POSIX_ENSURE_REF(sk); + POSIX_ENSURE_REF(pk); // Convert to this implementation types pk_t *l_pk = (pk_t *)pk; @@ -232,27 +232,27 @@ BIKE1_L1_R2_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) DEFER_CLEANUP(pad_sk_t p_sk = {0}, pad_sk_cleanup); // Get the entropy seeds. - GUARD(get_seeds(&seeds)); + POSIX_GUARD(get_seeds(&seeds)); DMSG(" Enter crypto_kem_keypair.\n"); DMSG(" Calculating the secret key.\n"); // h0 and h1 use the same context - GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); + POSIX_GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); // sigma0/1/2 use the same context. - GUARD(init_aes_ctr_prf_state(&s_prf_state, MAX_AES_INVOKATION, &seeds.seed[2])); + POSIX_GUARD(init_aes_ctr_prf_state(&s_prf_state, MAX_AES_INVOKATION, &seeds.seed[2])); - GUARD(generate_sparse_rep((uint64_t *)&p_sk[0], l_sk.wlist[0].val, DV, R_BITS, + POSIX_GUARD(generate_sparse_rep((uint64_t *)&p_sk[0], l_sk.wlist[0].val, DV, R_BITS, sizeof(p_sk[0]), &h_prf_state)); // Sample the sigmas - GUARD(sample_uniform_r_bits_with_fixed_prf_context(&l_sk.sigma0, &s_prf_state, + POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(&l_sk.sigma0, &s_prf_state, NO_RESTRICTION)); - GUARD(sample_uniform_r_bits_with_fixed_prf_context(&l_sk.sigma1, &s_prf_state, + POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(&l_sk.sigma1, &s_prf_state, NO_RESTRICTION)); - GUARD(generate_sparse_rep((uint64_t *)&p_sk[1], l_sk.wlist[1].val, DV, R_BITS, + POSIX_GUARD(generate_sparse_rep((uint64_t *)&p_sk[1], l_sk.wlist[1].val, DV, R_BITS, sizeof(p_sk[1]), &h_prf_state)); // Copy data @@ -261,7 +261,7 @@ BIKE1_L1_R2_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) DMSG(" Calculating the public key.\n"); - GUARD(calc_pk(l_pk, &seeds.seed[1], p_sk)); + POSIX_GUARD(calc_pk(l_pk, &seeds.seed[1], p_sk)); memcpy(sk, &l_sk, sizeof(l_sk)); @@ -286,29 +286,29 @@ BIKE1_L1_R2_crypto_kem_enc(OUT unsigned char * ct, IN const unsigned char *pk) { DMSG(" Enter crypto_kem_enc.\n"); - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); // Convert to the types that are used by this implementation const pk_t *l_pk = (const pk_t *)pk; ct_t * l_ct = (ct_t *)ct; ss_t * l_ss = (ss_t *)ss; - notnull_check(pk); - notnull_check(ct); - notnull_check(ss); + POSIX_ENSURE_REF(pk); + POSIX_ENSURE_REF(ct); + POSIX_ENSURE_REF(ss); // For NIST DRBG_CTR DEFER_CLEANUP(seeds_t seeds = {0}, seeds_cleanup); // Get the entropy seeds. 
- GUARD(get_seeds(&seeds)); + POSIX_GUARD(get_seeds(&seeds)); DMSG(" Encrypting.\n"); // In fact, seed[0] should be used. // Here, we stay consistent with BIKE's reference code // that chooses the seconde seed. DEFER_CLEANUP(split_e_t mf, split_e_cleanup); - GUARD(encrypt(l_ct, &mf, l_pk, &seeds.seed[1])); + POSIX_GUARD(encrypt(l_ct, &mf, l_pk, &seeds.seed[1])); DMSG(" Generating shared secret.\n"); get_ss(l_ss, &mf.val[0], &mf.val[1], l_ct); @@ -327,14 +327,14 @@ BIKE1_L1_R2_crypto_kem_dec(OUT unsigned char * ss, IN const unsigned char *sk) { DMSG(" Enter crypto_kem_dec.\n"); - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); // Convert to the types used by this implementation const ct_t *l_ct = (const ct_t *)ct; ss_t * l_ss = (ss_t *)ss; - notnull_check(sk); - notnull_check(ct); - notnull_check(ss); + POSIX_ENSURE_REF(sk); + POSIX_ENSURE_REF(ct); + POSIX_ENSURE_REF(ss); DEFER_CLEANUP(ALIGN(8) sk_t l_sk, sk_cleanup); memcpy(&l_sk, sk, sizeof(l_sk)); @@ -344,14 +344,14 @@ BIKE1_L1_R2_crypto_kem_dec(OUT unsigned char * ss, DEFER_CLEANUP(split_e_t e, split_e_cleanup); DMSG(" Computing s.\n"); - GUARD(compute_syndrome(&syndrome, l_ct, &l_sk)); + POSIX_GUARD(compute_syndrome(&syndrome, l_ct, &l_sk)); DMSG(" Decoding.\n"); uint32_t dec_ret = decode(&e, &syndrome, l_ct, &l_sk) != SUCCESS ? 0 : 1; DEFER_CLEANUP(split_e_t e2, split_e_cleanup); DEFER_CLEANUP(pad_ct_t ce, pad_ct_cleanup); - GUARD(reencrypt(ce, &e2, &e, l_ct)); + POSIX_GUARD(reencrypt(ce, &e2, &e, l_ct)); // Check if the decoding is successful. // Check if the error weight equals T1. diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/decode.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/decode.c index 404c6377da..b455cd7e82 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/decode.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/decode.c @@ -96,12 +96,12 @@ compute_syndrome(OUT syndrome_t *syndrome, IN const ct_t *ct, IN const sk_t *sk) pad_ct[1].val = ct->val[1]; // Compute s = c0*h0 + c1*h1: - GUARD(gf2x_mod_mul((uint64_t *)&pad_s[0], (uint64_t *)&pad_ct[0], + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&pad_s[0], (uint64_t *)&pad_ct[0], (uint64_t *)&pad_sk[0])); - GUARD(gf2x_mod_mul((uint64_t *)&pad_s[1], (uint64_t *)&pad_ct[1], + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&pad_s[1], (uint64_t *)&pad_ct[1], (uint64_t *)&pad_sk[1])); - GUARD(gf2x_add(pad_s[0].val.raw, pad_s[0].val.raw, pad_s[1].val.raw, R_SIZE)); + POSIX_GUARD(gf2x_add(pad_s[0].val.raw, pad_s[0].val.raw, pad_s[1].val.raw, R_SIZE)); memcpy((uint8_t *)syndrome->qw, pad_s[0].val.raw, R_SIZE); dup(syndrome); @@ -118,13 +118,13 @@ recompute_syndrome(OUT syndrome_t *syndrome, ct_t tmp_ct = *ct; // Adapt the ciphertext - GUARD(gf2x_add(tmp_ct.val[0].raw, tmp_ct.val[0].raw, splitted_e->val[0].raw, + POSIX_GUARD(gf2x_add(tmp_ct.val[0].raw, tmp_ct.val[0].raw, splitted_e->val[0].raw, R_SIZE)); - GUARD(gf2x_add(tmp_ct.val[1].raw, tmp_ct.val[1].raw, splitted_e->val[1].raw, + POSIX_GUARD(gf2x_add(tmp_ct.val[1].raw, tmp_ct.val[1].raw, splitted_e->val[1].raw, R_SIZE)); // Recompute the syndrome - GUARD(compute_syndrome(syndrome, &tmp_ct, sk)); + POSIX_GUARD(compute_syndrome(syndrome, &tmp_ct, sk)); return SUCCESS; } @@ -334,7 +334,7 @@ decode(OUT split_e_t *e, DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); find_err1(e, &black_e, &gray_e, &s, sk->wlist, threshold); - GUARD(recompute_syndrome(&s, ct, sk, e)); + POSIX_GUARD(recompute_syndrome(&s, ct, sk, e)); #ifdef BGF_DECODER if(iter >= 
1) { @@ -346,14 +346,14 @@ decode(OUT split_e_t *e, DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); find_err2(e, &black_e, &s, sk->wlist, ((DV + 1) / 2) + 1); - GUARD(recompute_syndrome(&s, ct, sk, e)); + POSIX_GUARD(recompute_syndrome(&s, ct, sk, e)); DMSG(" Weight of e: %lu\n", r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1])); DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); find_err2(e, &gray_e, &s, sk->wlist, ((DV + 1) / 2) + 1); - GUARD(recompute_syndrome(&s, ct, sk, e)); + POSIX_GUARD(recompute_syndrome(&s, ct, sk, e)); } if(r_bits_vector_weight((r_t *)s.qw) > 0) diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/openssl_utils.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/openssl_utils.c index 09e0af3fde..c80d3365cb 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/openssl_utils.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/openssl_utils.c @@ -108,15 +108,15 @@ ossl_add(OUT uint8_t res_bin[R_SIZE], BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); } - GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); - GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); if(BN_GF2m_add(r, a, b) == 0) { BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); } - GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); + POSIX_GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); return SUCCESS; } @@ -176,10 +176,10 @@ cyclic_product(OUT uint8_t res_bin[R_SIZE], BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); } - GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); - GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); - GUARD(ossl_cyclic_product(r, a, b, bn_ctx)); - GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); + POSIX_GUARD(ossl_cyclic_product(r, a, b, bn_ctx)); + POSIX_GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); return SUCCESS; } diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.c index 3686338fad..d08fa5eea7 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.c @@ -20,7 +20,7 @@ get_rand_mod_len(OUT uint32_t * rand_pos, do { // Generate 128bit of random numbers - GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos))); + POSIX_GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos))); // Mask only relevant bits (*rand_pos) &= mask; @@ -56,7 +56,7 @@ sample_uniform_r_bits_with_fixed_prf_context(OUT r_t *r, IN const must_be_odd_t must_be_odd) { // Generate random data - GUARD(aes_ctr_prf(r->raw, prf_state, R_SIZE)); + POSIX_GUARD(aes_ctr_prf(r->raw, prf_state, R_SIZE)); // Mask upper bits of the MSByte r->raw[R_SIZE - 1] &= MASK(R_BITS + 8 - (R_SIZE * 8)); @@ -104,7 +104,7 @@ generate_sparse_rep(OUT uint64_t * a, // Generate weight rand numbers do { - GUARD(get_rand_mod_len(&wlist[ctr], len, prf_state)); + POSIX_GUARD(get_rand_mod_len(&wlist[ctr], len, prf_state)); ctr += is_new(wlist, ctr); } while(ctr < weight); diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.h index 1ffd56f34a..4ec60683de 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.h +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.h @@ -53,9 +53,9 @@ sample_uniform_r_bits(OUT r_t *r, // For the seedexpander DEFER_CLEANUP(aes_ctr_prf_state_t prf_state = {0}, aes_ctr_prf_state_cleanup); - 
GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed)); + POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed)); - GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd)); + POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd)); return SUCCESS; } diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/LICENSE b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/LICENSE new file mode 100644 index 0000000000..7a4a3ea242 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
\ No newline at end of file diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes.h new file mode 100644 index 0000000000..b8b04c3655 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes.h @@ -0,0 +1,62 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include <openssl/evp.h> + +#include "cleanup.h" + +#define MAX_AES_INVOKATION (MASK(32)) + +#define AES256_KEY_BYTES (32U) +#define AES256_KEY_BITS (AES256_KEY_BYTES * 8) +#define AES256_BLOCK_BYTES (16U) +#define AES256_ROUNDS (14U) + +typedef ALIGN(16) struct aes256_key_s { + uint8_t raw[AES256_KEY_BYTES]; +} aes256_key_t; + +CLEANUP_FUNC(aes256_key, aes256_key_t) + +// Using OpenSSL structures +typedef EVP_CIPHER_CTX *aes256_ks_t; + +_INLINE_ ret_t aes256_key_expansion(OUT aes256_ks_t *ks, + IN const aes256_key_t *key) +{ + *ks = EVP_CIPHER_CTX_new(); + if(*ks == NULL) { + BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); + } + if(0 == EVP_EncryptInit_ex(*ks, EVP_aes_256_ecb(), NULL, key->raw, NULL)) { + EVP_CIPHER_CTX_free(*ks); + BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); + } + + EVP_CIPHER_CTX_set_padding(*ks, 0); + + return SUCCESS; +} + +_INLINE_ ret_t aes256_enc(OUT uint8_t *ct, + IN const uint8_t *pt, + IN const aes256_ks_t *ks) +{ + int outlen = 0; + if(0 == EVP_EncryptUpdate(*ks, ct, &outlen, pt, AES256_BLOCK_BYTES)) { + BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); + } + return SUCCESS; +} + +_INLINE_ void aes256_free_ks(OUT aes256_ks_t *ks) +{ + EVP_CIPHER_CTX_free(*ks); + ks = NULL; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.c new file mode 100644 index 0000000000..9b50469ef1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.c @@ -0,0 +1,97 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include "aes_ctr_prf.h" +#include "utilities.h" + +ret_t init_aes_ctr_prf_state(OUT aes_ctr_prf_state_t *s, + IN const uint32_t max_invokations, + IN const seed_t *seed) +{ + if(0 == max_invokations) { + BIKE_ERROR(E_AES_CTR_PRF_INIT_FAIL); + } + + // Set the key schedule (from seed). + // Make sure the size matches the AES256 key size. 
+ DEFER_CLEANUP(aes256_key_t key, aes256_key_cleanup); + + bike_static_assert(sizeof(*seed) == sizeof(key.raw), seed_size_equals_ky_size); + bike_memcpy(key.raw, seed->raw, sizeof(key.raw)); + + POSIX_GUARD(aes256_key_expansion(&s->ks, &key)); + + // Initialize buffer and counter + s->ctr.u.qw[0] = 0; + s->ctr.u.qw[1] = 0; + s->buffer.u.qw[0] = 0; + s->buffer.u.qw[1] = 0; + + s->pos = AES256_BLOCK_BYTES; + s->rem_invokations = max_invokations; + + DMSG(" Init aes_prf_ctr state:\n"); + DMSG(" s.pos = %d\n", s->pos); + DMSG(" s.rem_invokations = %u\n", s->rem_invokations); + + return SUCCESS; +} + +_INLINE_ ret_t perform_aes(OUT uint8_t *ct, IN OUT aes_ctr_prf_state_t *s) +{ + // Ensure that the CTR is large enough + bike_static_assert( + ((sizeof(s->ctr.u.qw[0]) == 8) && (BIT(33) >= MAX_AES_INVOKATION)), + ctr_size_is_too_small); + + if(0 == s->rem_invokations) { + BIKE_ERROR(E_AES_OVER_USED); + } + + POSIX_GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks)); + + s->ctr.u.qw[0]++; + s->rem_invokations--; + + return SUCCESS; +} + +ret_t aes_ctr_prf(OUT uint8_t *a, + IN OUT aes_ctr_prf_state_t *s, + IN const uint32_t len) +{ + // When Len is smaller than use what's left in the buffer, + // there is no need for additional AES invocations. + if((len + s->pos) <= AES256_BLOCK_BYTES) { + bike_memcpy(a, &s->buffer.u.bytes[s->pos], len); + s->pos += len; + + return SUCCESS; + } + + // If s.pos != AES256_BLOCK_BYTES then copy what's left in the buffer. + // Else copy zero bytes + uint32_t idx = AES256_BLOCK_BYTES - s->pos; + bike_memcpy(a, &s->buffer.u.bytes[s->pos], idx); + + // Init s.pos + s->pos = 0; + + // Copy full AES blocks + while((len - idx) >= AES256_BLOCK_BYTES) { + POSIX_GUARD(perform_aes(&a[idx], s)); + idx += AES256_BLOCK_BYTES; + } + + POSIX_GUARD(perform_aes(s->buffer.u.bytes, s)); + + // Copy the tail + s->pos = len - idx; + bike_memcpy(&a[idx], s->buffer.u.bytes, s->pos); + + return SUCCESS; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.h new file mode 100644 index 0000000000..684a52a6fc --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.h @@ -0,0 +1,43 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "aes.h" + +////////////////////////////// +// Types +///////////////////////////// + +typedef struct aes_ctr_prf_state_s { + uint128_t ctr; + uint128_t buffer; + aes256_ks_t ks; + uint32_t rem_invokations; + uint8_t pos; +} aes_ctr_prf_state_t; + +////////////////////////////// +// Methods +///////////////////////////// + +ret_t init_aes_ctr_prf_state(OUT aes_ctr_prf_state_t *s, + IN uint32_t max_invokations, + IN const seed_t *seed); + +ret_t aes_ctr_prf(OUT uint8_t *a, IN OUT aes_ctr_prf_state_t *s, IN uint32_t len); + +_INLINE_ void finalize_aes_ctr_prf(IN OUT aes_ctr_prf_state_t *s) +{ + aes256_free_ks(&s->ks); + secure_clean((uint8_t *)s, sizeof(*s)); +} + +_INLINE_ void aes_ctr_prf_state_cleanup(IN OUT aes_ctr_prf_state_t *s) +{ + finalize_aes_ctr_prf(s); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_defs.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_defs.h new file mode 100644 index 0000000000..697efd0627 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_defs.h @@ -0,0 +1,91 @@ +/* Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "defs.h" + +//////////////////////////////////////////// +// BIKE Parameters +/////////////////////////////////////////// +#define N0 2 + +#if !defined(LEVEL) +# define LEVEL 1 +#endif + +#if(LEVEL == 3) +# define R_BITS 24659 +# define DV 103 +# define T1 199 + +# define THRESHOLD_COEFF0 15.2588 +# define THRESHOLD_COEFF1 0.005265 +# define THRESHOLD_MIN 52 + +// The gf2m code is optimized to a block in this case: +# define BLOCK_BITS 32768 +#elif(LEVEL == 1) +// 64-bits of post-quantum security parameters (BIKE paper): +# define R_BITS 12323 +# define DV 71 +# define T1 134 + +# define THRESHOLD_COEFF0 13.530 +# define THRESHOLD_COEFF1 0.0069722 +# define THRESHOLD_MIN 36 + +// The gf2x code is optimized to a block in this case: +# define BLOCK_BITS (16384) +#else +# error "Bad level, choose one of 1/3/5" +#endif + +#define NUM_OF_SEEDS 2 + +// Round the size to the nearest byte. +// SIZE suffix, is the number of bytes (uint8_t). +#define N_BITS (R_BITS * N0) +#define R_BYTES DIVIDE_AND_CEIL(R_BITS, 8) +#define R_QWORDS DIVIDE_AND_CEIL(R_BITS, 8 * BYTES_IN_QWORD) +#define R_XMM DIVIDE_AND_CEIL(R_BITS, 8 * BYTES_IN_XMM) +#define R_YMM DIVIDE_AND_CEIL(R_BITS, 8 * BYTES_IN_YMM) +#define R_ZMM DIVIDE_AND_CEIL(R_BITS, 8 * BYTES_IN_ZMM) + +#define R_BLOCKS DIVIDE_AND_CEIL(R_BITS, BLOCK_BITS) +#define R_PADDED (R_BLOCKS * BLOCK_BITS) +#define R_PADDED_BYTES (R_PADDED / 8) +#define R_PADDED_QWORDS (R_PADDED / 64) + +#define LAST_R_QWORD_LEAD (R_BITS & MASK(6)) +#define LAST_R_QWORD_TRAIL (64 - LAST_R_QWORD_LEAD) +#define LAST_R_QWORD_MASK MASK(LAST_R_QWORD_LEAD) + +#define LAST_R_BYTE_LEAD (R_BITS & MASK(3)) +#define LAST_R_BYTE_TRAIL (8 - LAST_R_BYTE_LEAD) +#define LAST_R_BYTE_MASK MASK(LAST_R_BYTE_LEAD) + +// Data alignement +#define ALIGN_BYTES (BYTES_IN_ZMM) + +#define M_BITS 256 +#define M_BYTES (M_BITS / 8) + +#define SS_BITS 256 +#define SS_BYTES (SS_BITS / 8) + +#define SEED_BYTES (256 / 8) + +////////////////////////////////// +// Parameters for the BGF decoder. +////////////////////////////////// +#define BGF_DECODER +#define DELTA 3 +#define SLICES (LOG2_MSB(DV) + 1) + +// GF2X inversion can only handle R < 32768 +bike_static_assert((R_BITS < 32768), r_too_large_for_inversion); diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_r3_kem.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_r3_kem.c new file mode 100644 index 0000000000..328bb52db8 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_r3_kem.c @@ -0,0 +1,288 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron, and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include "decode.h" +#include "gf2x.h" +#include "sampling.h" +#include "sha.h" +#include "tls/s2n_kem.h" +#include "pq-crypto/s2n_pq.h" + +// m_t and seed_t have the same size and thus can be considered +// to be of the same type. However, for security reasons we distinguish +// these types, even on the costs of small extra complexity. 
+_INLINE_ void convert_seed_to_m_type(OUT m_t *m, IN const seed_t *seed) +{ + bike_static_assert(sizeof(*m) == sizeof(*seed), m_size_eq_seed_size); + bike_memcpy(m->raw, seed->raw, sizeof(*m)); +} + +_INLINE_ void convert_m_to_seed_type(OUT seed_t *seed, IN const m_t *m) +{ + bike_static_assert(sizeof(*m) == sizeof(*seed), m_size_eq_seed_size); + bike_memcpy(seed->raw, m->raw, sizeof(*seed)); +} + +// (e0, e1) = H(m) +_INLINE_ ret_t function_h(OUT pad_e_t *e, IN const m_t *m) +{ + DEFER_CLEANUP(seed_t seed = {0}, seed_cleanup); + + convert_m_to_seed_type(&seed, m); + return generate_error_vector(e, &seed); +} + +// out = L(e) +_INLINE_ ret_t function_l(OUT m_t *out, IN const pad_e_t *e) +{ + DEFER_CLEANUP(sha_dgst_t dgst = {0}, sha_dgst_cleanup); + DEFER_CLEANUP(e_t tmp, e_cleanup); + + // Take the padding away + tmp.val[0] = e->val[0].val; + tmp.val[1] = e->val[1].val; + + POSIX_GUARD(sha(&dgst, sizeof(tmp), (uint8_t *)&tmp)); + + // Truncate the SHA384 digest to a 256-bits m_t + bike_static_assert(sizeof(dgst) >= sizeof(*out), dgst_size_lt_m_size); + bike_memcpy(out->raw, dgst.u.raw, sizeof(*out)); + + return SUCCESS; +} + +// Generate the Shared Secret K(m, c0, c1) +_INLINE_ ret_t function_k(OUT ss_t *out, IN const m_t *m, IN const ct_t *ct) +{ + DEFER_CLEANUP(func_k_t tmp, func_k_cleanup); + DEFER_CLEANUP(sha_dgst_t dgst = {0}, sha_dgst_cleanup); + + // Copy every element, padded to the nearest byte + tmp.m = *m; + tmp.c0 = ct->c0; + tmp.c1 = ct->c1; + + POSIX_GUARD(sha(&dgst, sizeof(tmp), (uint8_t *)&tmp)); + + // Truncate the SHA384 digest to a 256-bits value + // to subsequently use it as a seed. + bike_static_assert(sizeof(dgst) >= sizeof(*out), dgst_size_lt_out_size); + bike_memcpy(out->raw, dgst.u.raw, sizeof(*out)); + + return SUCCESS; +} + +_INLINE_ ret_t encrypt(OUT ct_t *ct, + IN const pad_e_t *e, + IN const pk_t *pk, + IN const m_t *m) +{ + // Pad the public key and the ciphertext + pad_r_t p_ct = {0}; + pad_r_t p_pk = {0}; + p_pk.val = *pk; + + // Generate the ciphertext + // ct = pk * e1 + e0 + gf2x_mod_mul(&p_ct, &e->val[1], &p_pk); + gf2x_mod_add(&p_ct, &p_ct, &e->val[0]); + + ct->c0 = p_ct.val; + + // c1 = L(e0, e1) + POSIX_GUARD(function_l(&ct->c1, e)); + + // m xor L(e0, e1) + for(size_t i = 0; i < sizeof(*m); i++) { + ct->c1.raw[i] ^= m->raw[i]; + } + + return SUCCESS; +} + +_INLINE_ ret_t reencrypt(OUT m_t *m, IN const pad_e_t *e, IN const ct_t *l_ct) +{ + DEFER_CLEANUP(m_t tmp, m_cleanup); + + POSIX_GUARD(function_l(&tmp, e)); + + // m' = c1 ^ L(e') + for(size_t i = 0; i < sizeof(*m); i++) { + m->raw[i] = tmp.raw[i] ^ l_ct->c1.raw[i]; + } + + return SUCCESS; +} + +//////////////////////////////////////////////////////////////////////////////// +// The three APIs below (keypair, encapsulate, decapsulate) are defined by NIST: +//////////////////////////////////////////////////////////////////////////////// +int BIKE_L1_R3_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE_REF(sk); + POSIX_ENSURE_REF(pk); + + DEFER_CLEANUP(aligned_sk_t l_sk = {0}, sk_cleanup); + + // The secret key is (h0, h1), + // and the public key h=(h0^-1 * h1). + // Padded structures are used internally, and are required by the + // decoder and the gf2x multiplication. 
+ DEFER_CLEANUP(pad_r_t h0 = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t h1 = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t h0inv = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t h = {0}, pad_r_cleanup); + + // The randomness of the key generation + DEFER_CLEANUP(seeds_t seeds = {0}, seeds_cleanup); + + // An AES_PRF state for the secret key + DEFER_CLEANUP(aes_ctr_prf_state_t h_prf_state = {0}, aes_ctr_prf_state_cleanup); + + POSIX_GUARD(get_seeds(&seeds)); + POSIX_GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); + + // Generate the secret key (h0, h1) with weight w/2 + POSIX_GUARD(generate_sparse_rep(&h0, l_sk.wlist[0].val, &h_prf_state)); + POSIX_GUARD(generate_sparse_rep(&h1, l_sk.wlist[1].val, &h_prf_state)); + + // Generate sigma + convert_seed_to_m_type(&l_sk.sigma, &seeds.seed[1]); + + // Calculate the public key + gf2x_mod_inv(&h0inv, &h0); + gf2x_mod_mul(&h, &h1, &h0inv); + + // Fill the secret key data structure with contents - cancel the padding + l_sk.bin[0] = h0.val; + l_sk.bin[1] = h1.val; + l_sk.pk = h.val; + + // Copy the data to the output buffers + bike_memcpy(sk, &l_sk, sizeof(l_sk)); + bike_memcpy(pk, &l_sk.pk, sizeof(l_sk.pk)); + + return SUCCESS; +} + +// Encapsulate - pk is the public key, +// ct is a key encapsulation message (ciphertext), +// ss is the shared secret. +int BIKE_L1_R3_crypto_kem_enc(OUT unsigned char * ct, + OUT unsigned char * ss, + IN const unsigned char *pk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE_REF(pk); + POSIX_ENSURE_REF(ct); + POSIX_ENSURE_REF(ss); + + // Public values (they do not require cleanup on exit). + pk_t l_pk; + ct_t l_ct; + + DEFER_CLEANUP(m_t m, m_cleanup); + DEFER_CLEANUP(ss_t l_ss, ss_cleanup); + DEFER_CLEANUP(seeds_t seeds = {0}, seeds_cleanup); + DEFER_CLEANUP(pad_e_t e, pad_e_cleanup); + + // Copy the data from the input buffer. This is required in order to avoid + // alignment issues on non x86_64 processors. + bike_memcpy(&l_pk, pk, sizeof(l_pk)); + + POSIX_GUARD(get_seeds(&seeds)); + + // e = H(m) = H(seed[0]) + convert_seed_to_m_type(&m, &seeds.seed[0]); + POSIX_GUARD(function_h(&e, &m)); + + // Calculate the ciphertext + POSIX_GUARD(encrypt(&l_ct, &e, &l_pk, &m)); + + // Generate the shared secret + POSIX_GUARD(function_k(&l_ss, &m, &l_ct)); + + // Copy the data to the output buffers + bike_memcpy(ct, &l_ct, sizeof(l_ct)); + bike_memcpy(ss, &l_ss, sizeof(l_ss)); + + return SUCCESS; +} + +// Decapsulate - ct is a key encapsulation message (ciphertext), +// sk is the private key, +// ss is the shared secret +int BIKE_L1_R3_crypto_kem_dec(OUT unsigned char * ss, + IN const unsigned char *ct, + IN const unsigned char *sk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE_REF(sk); + POSIX_ENSURE_REF(ct); + POSIX_ENSURE_REF(ss); + + // Public values, does not require a cleanup on exit + ct_t l_ct; + + DEFER_CLEANUP(seeds_t seeds = {0}, seeds_cleanup); + + DEFER_CLEANUP(ss_t l_ss, ss_cleanup); + DEFER_CLEANUP(aligned_sk_t l_sk, sk_cleanup); + DEFER_CLEANUP(e_t e, e_cleanup); + DEFER_CLEANUP(m_t m_prime, m_cleanup); + DEFER_CLEANUP(pad_e_t e_tmp, pad_e_cleanup); + DEFER_CLEANUP(pad_e_t e_prime, pad_e_cleanup); + + // Copy the data from the input buffers. This is required in order to avoid + // alignment issues on non x86_64 processors. 
+ bike_memcpy(&l_ct, ct, sizeof(l_ct)); + bike_memcpy(&l_sk, sk, sizeof(l_sk)); + + // Generate a random error vector to be used in case of decoding failure + // (Note: possibly, a "fixed" zeroed error vector could suffice too, + // and serve this generation) + POSIX_GUARD(get_seeds(&seeds)); + POSIX_GUARD(generate_error_vector(&e_prime, &seeds.seed[0])); + + // Decode and on success check if |e|=T (all in constant-time) + volatile uint32_t success_cond = (decode(&e, &l_ct, &l_sk) == SUCCESS); + success_cond &= secure_cmp32(T1, r_bits_vector_weight(&e.val[0]) + + r_bits_vector_weight(&e.val[1])); + + // Set appropriate error based on the success condition + uint8_t mask = ~secure_l32_mask(0, success_cond); + for(size_t i = 0; i < R_BYTES; i++) { + PE0_RAW(&e_prime)[i] &= u8_barrier(~mask); + PE0_RAW(&e_prime)[i] |= (u8_barrier(mask) & E0_RAW(&e)[i]); + PE1_RAW(&e_prime)[i] &= u8_barrier(~mask); + PE1_RAW(&e_prime)[i] |= (u8_barrier(mask) & E1_RAW(&e)[i]); + } + + POSIX_GUARD(reencrypt(&m_prime, &e_prime, &l_ct)); + + // Check if H(m') is equal to (e0', e1') + // (in constant-time) + POSIX_GUARD(function_h(&e_tmp, &m_prime)); + success_cond = secure_cmp(PE0_RAW(&e_prime), PE0_RAW(&e_tmp), R_BYTES); + success_cond &= secure_cmp(PE1_RAW(&e_prime), PE1_RAW(&e_tmp), R_BYTES); + + // Compute either K(m', C) or K(sigma, C) based on the success condition + mask = secure_l32_mask(0, success_cond); + for(size_t i = 0; i < M_BYTES; i++) { + m_prime.raw[i] &= u8_barrier(~mask); + m_prime.raw[i] |= (u8_barrier(mask) & l_sk.sigma.raw[i]); + } + + // Generate the shared secret + POSIX_GUARD(function_k(&l_ss, &m_prime, &l_ct)); + + // Copy the data into the output buffer + bike_memcpy(ss, &l_ss, sizeof(l_ss)); + + return SUCCESS; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/cleanup.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/cleanup.h new file mode 100644 index 0000000000..22e8c44250 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/cleanup.h @@ -0,0 +1,63 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "utilities.h" + +/* Runs _thecleanup function on _thealloc once _thealloc went out of scope */ +#define DEFER_CLEANUP(_thealloc, _thecleanup) \ + __attribute__((cleanup(_thecleanup))) _thealloc + +// len is bytes length of in +_INLINE_ void secure_clean(OUT uint8_t *p, IN const uint32_t len) +{ +#if defined(_WIN32) + SecureZeroMemory(p, len); +#else + typedef void *(*memset_t)(void *, int, size_t); + static volatile memset_t memset_func = bike_memset; + memset_func(p, 0, len); +#endif +} + +#define CLEANUP_FUNC(name, type) \ + _INLINE_ void name##_cleanup(IN OUT type *o) \ + { \ + secure_clean((uint8_t *)o, sizeof(*o)); \ + } + +CLEANUP_FUNC(r, r_t) +CLEANUP_FUNC(m, m_t) +CLEANUP_FUNC(e, e_t) +CLEANUP_FUNC(sk, sk_t) +CLEANUP_FUNC(ss, ss_t) +CLEANUP_FUNC(ct, ct_t) +CLEANUP_FUNC(pad_r, pad_r_t) +CLEANUP_FUNC(pad_e, pad_e_t) +CLEANUP_FUNC(seed, seed_t) +CLEANUP_FUNC(syndrome, syndrome_t) +CLEANUP_FUNC(upc, upc_t) +CLEANUP_FUNC(func_k, func_k_t) +CLEANUP_FUNC(dbl_pad_r, dbl_pad_r_t) + +// The functions below require special handling because we deal +// with arrays and not structures. 
+ +_INLINE_ void compressed_idx_d_ar_cleanup(IN OUT compressed_idx_d_ar_t *o) +{ + for(int i = 0; i < N0; i++) { + secure_clean((uint8_t *)&(*o)[i], sizeof((*o)[0])); + } +} + +_INLINE_ void seeds_cleanup(IN OUT seeds_t *o) +{ + for(int i = 0; i < NUM_OF_SEEDS; i++) { + seed_cleanup(&(o->seed[i])); + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.c new file mode 100644 index 0000000000..c280b95f03 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.c @@ -0,0 +1,280 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * [1] The optimizations are based on the description developed in the paper: + * Drucker, Nir, and Shay Gueron. 2019. “A Toolbox for Software Optimization + * of QC-MDPC Code-Based Cryptosystems.” Journal of Cryptographic Engineering, + * January, 1–17. https://doi.org/10.1007/s13389-018-00200-4. + * + * [2] The decoder algorithm is the Black-Gray decoder in + * the early submission of CAKE (due to N. Sandrier and R Misoczki). + * + * [3] The analysis for the constant time implementation is given in + * Drucker, Nir, Shay Gueron, and Dusan Kostic. 2019. + * “On Constant-Time QC-MDPC Decoding with Negligible Failure Rate.” + * Cryptology EPrint Archive, 2019. https://eprint.iacr.org/2019/1289. + * + * [4] it was adapted to BGF in: + * Drucker, Nir, Shay Gueron, and Dusan Kostic. 2019. + * “QC-MDPC decoders with several shades of gray.” + * Cryptology EPrint Archive, 2019. To be published. + * + * [5] Chou, T.: QcBits: Constant-Time Small-Key Code-Based Cryptography. + * In: Gier-lichs, B., Poschmann, A.Y. (eds.) Cryptographic Hardware + * and Embedded Systems– CHES 2016. pp. 280–300. Springer Berlin Heidelberg, + * Berlin, Heidelberg (2016) + * + * [6] The rotate512_small funciton is a derivative of the code described in: + * Guimarães, Antonio, Diego F Aranha, and Edson Borin. 2019. + * “Optimized Implementation of QC-MDPC Code-Based Cryptography.” + * Concurrency and Computation: Practice and Experience 31 (18): + * e5089. https://doi.org/10.1002/cpe.5089. 
+ */ + +#include "decode.h" +#include "cleanup.h" +#include "decode_internal.h" +#include "gf2x.h" +#include "utilities.h" + +// Decoding (bit-flipping) parameter +#if defined(BG_DECODER) +# if(LEVEL == 1) +# define MAX_IT 3 +# elif(LEVEL == 3) +# define MAX_IT 4 +# else +# error "Level can only be 1/3" +# endif +#elif defined(BGF_DECODER) +# if(LEVEL == 1) +# define MAX_IT 5 +# elif(LEVEL == 3) +# define MAX_IT 5 +# else +# error "Level can only be 1/3" +# endif +#endif + +ret_t compute_syndrome(OUT syndrome_t *syndrome, + IN const pad_r_t *c0, + IN const pad_r_t *h0, + IN const decode_ctx *ctx) +{ + DEFER_CLEANUP(pad_r_t pad_s, pad_r_cleanup); + + gf2x_mod_mul(&pad_s, c0, h0); + + bike_memcpy((uint8_t *)syndrome->qw, pad_s.val.raw, R_BYTES); + ctx->dup(syndrome); + + return SUCCESS; +} + +_INLINE_ ret_t recompute_syndrome(OUT syndrome_t *syndrome, + IN const pad_r_t *c0, + IN const pad_r_t *h0, + IN const pad_r_t *pk, + IN const e_t *e, + IN const decode_ctx *ctx) +{ + DEFER_CLEANUP(pad_r_t tmp_c0, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t e0 = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t e1 = {0}, pad_r_cleanup); + + e0.val = e->val[0]; + e1.val = e->val[1]; + + // tmp_c0 = pk * e1 + c0 + e0 + gf2x_mod_mul(&tmp_c0, &e1, pk); + gf2x_mod_add(&tmp_c0, &tmp_c0, c0); + gf2x_mod_add(&tmp_c0, &tmp_c0, &e0); + + // Recompute the syndrome using the updated ciphertext + POSIX_GUARD(compute_syndrome(syndrome, &tmp_c0, h0, ctx)); + + return SUCCESS; +} + +_INLINE_ uint8_t get_threshold(IN const syndrome_t *s) +{ + bike_static_assert(sizeof(*s) >= sizeof(r_t), syndrome_is_large_enough); + + const uint32_t syndrome_weight = r_bits_vector_weight((const r_t *)s->qw); + + // The equations below are defined in BIKE's specification p. 16, Section 5.2 + uint32_t thr = THRESHOLD_COEFF0 + (THRESHOLD_COEFF1 * syndrome_weight); + const uint32_t mask = secure_l32_mask(thr, THRESHOLD_MIN); + thr = (u32_barrier(mask) & thr) | (u32_barrier(~mask) & THRESHOLD_MIN); + + DMSG(" Threshold: %d\n", thr); + return thr; +} + +// Calculate the Unsatisfied Parity Checks (UPCs) and update the errors +// vector (e) accordingly. In addition, update the black and gray errors vector +// with the relevant values. +_INLINE_ void find_err1(OUT e_t *e, + OUT e_t *black_e, + OUT e_t *gray_e, + IN const syndrome_t * syndrome, + IN const compressed_idx_d_ar_t wlist, + IN const uint8_t threshold, + IN const decode_ctx *ctx) +{ + // This function uses the bit-slice-adder methodology of [5]: + DEFER_CLEANUP(syndrome_t rotated_syndrome = {0}, syndrome_cleanup); + DEFER_CLEANUP(upc_t upc, upc_cleanup); + + for(uint32_t i = 0; i < N0; i++) { + // UPC must start from zero at every iteration + bike_memset(&upc, 0, sizeof(upc)); + + // 1) Right-rotate the syndrome for every secret key set bit index + // Then slice-add it to the UPC array. + for(size_t j = 0; j < DV; j++) { + ctx->rotate_right(&rotated_syndrome, syndrome, wlist[i].val[j]); + ctx->bit_sliced_adder(&upc, &rotated_syndrome, LOG2_MSB(j + 1)); + } + + // 2) Subtract the threshold from the UPC counters + ctx->bit_slice_full_subtract(&upc, threshold); + + // 3) Update the errors and the black errors vectors. + // The last slice of the UPC array holds the MSB of the accumulated values + // minus the threshold. Every zero bit indicates a potential error bit. + // The errors values are stored in the black array and xored with the + // errors Of the previous iteration. 
+ const r_t *last_slice = &(upc.slice[SLICES - 1].u.r.val); + for(size_t j = 0; j < R_BYTES; j++) { + const uint8_t sum_msb = (~last_slice->raw[j]); + black_e->val[i].raw[j] = sum_msb; + e->val[i].raw[j] ^= sum_msb; + } + + // Ensure that the padding bits (upper bits of the last byte) are zero so + // they will not be included in the multiplication and in the hash function. + e->val[i].raw[R_BYTES - 1] &= LAST_R_BYTE_MASK; + + // 4) Calculate the gray error array by adding "DELTA" to the UPC array. + // For that we reuse the rotated_syndrome variable setting it to all "1". + for(size_t l = 0; l < DELTA; l++) { + bike_memset((uint8_t *)rotated_syndrome.qw, 0xff, R_BYTES); + ctx->bit_sliced_adder(&upc, &rotated_syndrome, SLICES); + } + + // 5) Update the gray list with the relevant bits that are not + // set in the black list. + for(size_t j = 0; j < R_BYTES; j++) { + const uint8_t sum_msb = (~last_slice->raw[j]); + gray_e->val[i].raw[j] = (~(black_e->val[i].raw[j])) & sum_msb; + } + } +} + +// Recalculate the UPCs and update the errors vector (e) according to it +// and to the black/gray vectors. +_INLINE_ void find_err2(OUT e_t *e, + IN e_t * pos_e, + IN const syndrome_t * syndrome, + IN const compressed_idx_d_ar_t wlist, + IN const uint8_t threshold, + IN const decode_ctx *ctx) +{ + DEFER_CLEANUP(syndrome_t rotated_syndrome = {0}, syndrome_cleanup); + DEFER_CLEANUP(upc_t upc, upc_cleanup); + + for(uint32_t i = 0; i < N0; i++) { + // UPC must start from zero at every iteration + bike_memset(&upc, 0, sizeof(upc)); + + // 1) Right-rotate the syndrome, for every index of a set bit in the secret + // key. Then slice-add it to the UPC array. + for(size_t j = 0; j < DV; j++) { + ctx->rotate_right(&rotated_syndrome, syndrome, wlist[i].val[j]); + ctx->bit_sliced_adder(&upc, &rotated_syndrome, LOG2_MSB(j + 1)); + } + + // 2) Subtract the threshold from the UPC counters + ctx->bit_slice_full_subtract(&upc, threshold); + + // 3) Update the errors vector. + // The last slice of the UPC array holds the MSB of the accumulated values + // minus the threshold. Every zero bit indicates a potential error bit. + const r_t *last_slice = &(upc.slice[SLICES - 1].u.r.val); + for(size_t j = 0; j < R_BYTES; j++) { + const uint8_t sum_msb = (~last_slice->raw[j]); + e->val[i].raw[j] ^= (pos_e->val[i].raw[j] & sum_msb); + } + + // Ensure that the padding bits (upper bits of the last byte) are zero, so + // they are not included in the multiplication, and in the hash function. + e->val[i].raw[R_BYTES - 1] &= LAST_R_BYTE_MASK; + } +} + +ret_t decode(OUT e_t *e, IN const ct_t *ct, IN const sk_t *sk) +{ + // Initialize the decode methods struct + decode_ctx ctx; + decode_ctx_init(&ctx); + + DEFER_CLEANUP(e_t black_e = {0}, e_cleanup); + DEFER_CLEANUP(e_t gray_e = {0}, e_cleanup); + + DEFER_CLEANUP(pad_r_t c0 = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t h0 = {0}, pad_r_cleanup); + pad_r_t pk = {0}; + + // Pad ciphertext (c0), secret key (h0), and public key (h) + c0.val = ct->c0; + h0.val = sk->bin[0]; + pk.val = sk->pk; + + DEFER_CLEANUP(syndrome_t s = {0}, syndrome_cleanup); + DMSG(" Computing s.\n"); + POSIX_GUARD(compute_syndrome(&s, &c0, &h0, &ctx)); + ctx.dup(&s); + + // Reset (init) the error because it is xored in the find_err functions. 
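+  // Note (reading aid, not part of the original comment): with BGF_DECODER,
+  // only the first iteration performs the two black/gray find_err2 passes;
+  // later iterations run find_err1 alone (see the continue statement below).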
+ bike_memset(e, 0, sizeof(*e)); + + for(uint32_t iter = 0; iter < MAX_IT; iter++) { + const uint8_t threshold = get_threshold(&s); + + DMSG(" Iteration: %d\n", iter); + DMSG(" Weight of e: %lu\n", + r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1])); + DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); + + find_err1(e, &black_e, &gray_e, &s, sk->wlist, threshold, &ctx); + POSIX_GUARD(recompute_syndrome(&s, &c0, &h0, &pk, e, &ctx)); +#if defined(BGF_DECODER) + if(iter >= 1) { + continue; + } +#endif + DMSG(" Weight of e: %lu\n", + r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1])); + DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); + + find_err2(e, &black_e, &s, sk->wlist, ((DV + 1) / 2) + 1, &ctx); + POSIX_GUARD(recompute_syndrome(&s, &c0, &h0, &pk, e, &ctx)); + + DMSG(" Weight of e: %lu\n", + r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1])); + DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); + + find_err2(e, &gray_e, &s, sk->wlist, ((DV + 1) / 2) + 1, &ctx); + POSIX_GUARD(recompute_syndrome(&s, &c0, &h0, &pk, e, &ctx)); + } + + if(r_bits_vector_weight((r_t *)s.qw) > 0) { + BIKE_ERROR(E_DECODING_FAILURE); + } + + return SUCCESS; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.h new file mode 100644 index 0000000000..8e405ea12e --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.h @@ -0,0 +1,12 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "types.h" + +ret_t decode(OUT e_t *e, IN const ct_t *ct, IN const sk_t *sk); diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx2.c new file mode 100644 index 0000000000..ea8b91a499 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx2.c @@ -0,0 +1,173 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * The rotate functions are based on the Barrel shifter described in [1] and + * some code snippets from [2]: + * + * [1] Chou, T.: QcBits: Constant-Time Small-Key Code-Based Cryptography. + * In: Gier-lichs, B., Poschmann, A.Y. (eds.) Cryptographic Hardware + * and Embedded Systems– CHES 2016. pp. 280–300. Springer Berlin Heidelberg, + * Berlin, Heidelberg (2016) + * + * [2] Guimarães, Antonio, Diego F Aranha, and Edson Borin. 2019. + * “Optimized Implementation of QC-MDPC Code-Based Cryptography.” + * Concurrency and Computation: Practice and Experience 31 (18): + * e5089. https://doi.org/10.1002/cpe.5089. 
+ */ + +#if defined(S2N_BIKE_R3_AVX2) + +#include "decode.h" +#include "decode_internal.h" +#include "utilities.h" + +#define AVX2_INTERNAL +#include "x86_64_intrinsic.h" + +#define R_YMM_HALF_LOG2 UPTOPOW2(R_YMM / 2) + +_INLINE_ void +rotate256_big(OUT syndrome_t *out, IN const syndrome_t *in, IN size_t ymm_num) +{ + // For preventing overflows (comparison in bytes) + bike_static_assert(sizeof(*out) > + (BYTES_IN_YMM * (R_YMM + (2 * R_YMM_HALF_LOG2))), + rotr_big_err); + + *out = *in; + + for(uint32_t idx = R_YMM_HALF_LOG2; idx >= 1; idx >>= 1) { + const uint8_t mask = secure_l32_mask(ymm_num, idx); + const __m256i blend_mask = SET1_I8(mask); + ymm_num = ymm_num - (idx & mask); + + for(size_t i = 0; i < (R_YMM + idx); i++) { + __m256i a = LOAD(&out->qw[4 * (i + idx)]); + __m256i b = LOAD(&out->qw[4 * i]); + b = BLENDV_I8(b, a, blend_mask); + STORE(&out->qw[4 * i], b); + } + } +} + +_INLINE_ void +rotate256_small(OUT syndrome_t *out, IN const syndrome_t *in, size_t count) +{ + __m256i carry_in = SET_ZERO; + const int count64 = (int)count & 0x3f; + const uint64_t count_mask = (count >> 5) & 0xe; + + __m256i idx = SET_I32(7, 6, 5, 4, 3, 2, 1, 0); + const __m256i zero_mask = SET_I64(-1, -1, -1, 0); + const __m256i count_vet = SET1_I8(count_mask); + + ALIGN(ALIGN_BYTES) + const uint8_t zero_mask2_buf[] = { + 0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x84, 0x84, 0x84, + 0x84, 0x84, 0x84, 0x84, 0x84, 0x82, 0x82, 0x82, 0x82, 0x82, 0x82, + 0x82, 0x82, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + __m256i zero_mask2 = LOAD(zero_mask2_buf); + + zero_mask2 = SUB_I8(zero_mask2, count_vet); + idx = ADD_I8(idx, count_vet); + + for(int i = R_YMM; i >= 0; i--) { + // Load the next 256 bits + __m256i in256 = LOAD(&in->qw[4 * i]); + + // Rotate the current and previous 256 registers so that their quadwords + // would be in the right positions. + __m256i carry_out = PERMVAR_I32(in256, idx); + in256 = BLENDV_I8(carry_in, carry_out, zero_mask2); + + // Shift less than 64 (quadwords internal) + __m256i inner_carry = BLENDV_I8(carry_in, in256, zero_mask); + inner_carry = PERM_I64(inner_carry, 0x39); + const __m256i out256 = + SRLI_I64(in256, count64) | SLLI_I64(inner_carry, (int)64 - count64); + + // Store the rotated value + STORE(&out->qw[4 * i], out256); + carry_in = carry_out; + } +} + +void rotate_right_avx2(OUT syndrome_t *out, + IN const syndrome_t *in, + IN const uint32_t bitscount) +{ + // 1) Rotate in granularity of 256 bits blocks, using YMMs + rotate256_big(out, in, (bitscount / BITS_IN_YMM)); + // 2) Rotate in smaller granularity (less than 256 bits), using YMMs + rotate256_small(out, out, (bitscount % BITS_IN_YMM)); +} + +// Duplicates the first R_BITS of the syndrome three times +// |------------------------------------------| +// | Third copy | Second copy | first R_BITS | +// |------------------------------------------| +// This is required by the rotate functions. +void dup_avx2(IN OUT syndrome_t *s) +{ + s->qw[R_QWORDS - 1] = + (s->qw[0] << LAST_R_QWORD_LEAD) | (s->qw[R_QWORDS - 1] & LAST_R_QWORD_MASK); + + for(size_t i = 0; i < (2 * R_QWORDS) - 1; i++) { + s->qw[R_QWORDS + i] = + (s->qw[i] >> LAST_R_QWORD_TRAIL) | (s->qw[i + 1] << LAST_R_QWORD_LEAD); + } +} + +// Use half-adder as described in [1]. 
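+// (Bit-sliced form: for every quadword, the new slice value is upc ^ syndrome
+// and the carry propagated to the next slice is upc & syndrome, i.e. one XOR
+// and one AND update 64 counters at a time.)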
+void bit_sliced_adder_avx2(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrome, + IN const size_t num_of_slices) +{ + // From cache-memory perspective this loop should be the outside loop + for(size_t j = 0; j < num_of_slices; j++) { + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t carry = (upc->slice[j].u.qw[i] & rotated_syndrome->qw[i]); + upc->slice[j].u.qw[i] ^= rotated_syndrome->qw[i]; + rotated_syndrome->qw[i] = carry; + } + } +} + +void bit_slice_full_subtract_avx2(OUT upc_t *upc, IN uint8_t val) +{ + // Borrow + uint64_t br[R_QWORDS] = {0}; + + for(size_t j = 0; j < SLICES; j++) { + + const uint64_t lsb_mask = 0 - (val & 0x1); + val >>= 1; + + // Perform a - b with c as the input/output carry + // br = 0 0 0 0 1 1 1 1 + // a = 0 0 1 1 0 0 1 1 + // b = 0 1 0 1 0 1 0 1 + // ------------------- + // o = 0 1 1 0 0 1 1 1 + // c = 0 1 0 0 1 1 0 1 + // + // o = a^b^c + // _ __ _ _ _ _ _ + // br = abc + abc + abc + abc = abc + ((a+b))c + + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t a = upc->slice[j].u.qw[i]; + const uint64_t b = lsb_mask; + const uint64_t tmp = ((~a) & b & (~br[i])) | ((((~a) | b) & br[i])); + upc->slice[j].u.qw[i] = a ^ b ^ br[i]; + br[i] = tmp; + } + } +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx512.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx512.c new file mode 100644 index 0000000000..ef7f6d29d5 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx512.c @@ -0,0 +1,167 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * The rotation functions are based on the Barrel shifter described in [1] + * and some modifed snippet from [2] + * [1] Chou, T.: QcBits: Constant-Time Small-Key Code-Based Cryptography. + * In: Gier-lichs, B., Poschmann, A.Y. (eds.) Cryptographic Hardware + * and Embedded Systems– CHES 2016. pp. 280–300. Springer Berlin Heidelberg, + * Berlin, Heidelberg (2016) + * + * [2] Guimarães, Antonio, Diego F Aranha, and Edson Borin. 2019. + * “Optimized Implementation of QC-MDPC Code-Based Cryptography.” + * Concurrency and Computation: Practice and Experience 31 (18): + * e5089. https://doi.org/10.1002/cpe.5089. 
+ */ + +#if defined(S2N_BIKE_R3_AVX512) + +#include "decode.h" +#include "decode_internal.h" +#include "utilities.h" + +#define AVX512_INTERNAL +#include "x86_64_intrinsic.h" + +#define R_ZMM_HALF_LOG2 UPTOPOW2(R_ZMM / 2) + +_INLINE_ void +rotate512_big(OUT syndrome_t *out, IN const syndrome_t *in, size_t zmm_num) +{ + // For preventing overflows (comparison in bytes) + bike_static_assert(sizeof(*out) > + (BYTES_IN_ZMM * (R_ZMM + (2 * R_ZMM_HALF_LOG2))), + rotr_big_err); + *out = *in; + + for(uint32_t idx = R_ZMM_HALF_LOG2; idx >= 1; idx >>= 1) { + const uint8_t mask = secure_l32_mask(zmm_num, idx); + zmm_num = zmm_num - (idx & mask); + + for(size_t i = 0; i < (R_ZMM + idx); i++) { + const __m512i a = LOAD(&out->qw[8 * (i + idx)]); + MSTORE(&out->qw[8 * i], mask, a); + } + } +} + +// The rotate512_small function is a derivative of the code described in [1] +_INLINE_ void +rotate512_small(OUT syndrome_t *out, IN const syndrome_t *in, size_t bitscount) +{ + __m512i previous = SET_ZERO; + const int count64 = (int)bitscount & 0x3f; + const __m512i count64_512 = SET1_I64(count64); + const __m512i count64_512r = SET1_I64((int)64 - count64); + + const __m512i num_full_qw = SET1_I64(bitscount >> 6); + const __m512i one = SET1_I64(1); + __m512i a0, a1; + + __m512i idx = SET_I64(7, 6, 5, 4, 3, 2, 1, 0); + + // Positions above 7 are taken from the second register in + // _mm512_permutex2var_epi64 + idx = ADD_I64(idx, num_full_qw); + __m512i idx1 = ADD_I64(idx, one); + + for(int i = R_ZMM; i >= 0; i--) { + // Load the next 512 bits + const __m512i in512 = LOAD(&in->qw[8 * i]); + + // Rotate the current and previous 512 registers so that their quadwords + // would be in the right positions. + a0 = PERMX2VAR_I64(in512, idx, previous); + a1 = PERMX2VAR_I64(in512, idx1, previous); + + a0 = SRLV_I64(a0, count64_512); + a1 = SLLV_I64(a1, count64_512r); + + // Shift less than 64 (quadwords internal) + const __m512i out512 = a0 | a1; + + // Store the rotated value + STORE(&out->qw[8 * i], out512); + previous = in512; + } +} + +void rotate_right_avx512(OUT syndrome_t *out, + IN const syndrome_t *in, + IN const uint32_t bitscount) +{ + // 1) Rotate in granularity of 512 bits blocks, using ZMMs + rotate512_big(out, in, (bitscount / BITS_IN_ZMM)); + // 2) Rotate in smaller granularity (less than 512 bits), using ZMMs + rotate512_small(out, out, (bitscount % BITS_IN_ZMM)); +} + +// Duplicates the first R_BITS of the syndrome three times +// |------------------------------------------| +// | Third copy | Second copy | first R_BITS | +// |------------------------------------------| +// This is required by the rotate functions. +void dup_avx512(IN OUT syndrome_t *s) +{ + s->qw[R_QWORDS - 1] = + (s->qw[0] << LAST_R_QWORD_LEAD) | (s->qw[R_QWORDS - 1] & LAST_R_QWORD_MASK); + + for(size_t i = 0; i < (2 * R_QWORDS) - 1; i++) { + s->qw[R_QWORDS + i] = + (s->qw[i] >> LAST_R_QWORD_TRAIL) | (s->qw[i + 1] << LAST_R_QWORD_LEAD); + } +} + +// Use half-adder as described in [1]. 
+void bit_sliced_adder_avx512(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrome, + IN const size_t num_of_slices) +{ + // From cache-memory perspective this loop should be the outside loop + for(size_t j = 0; j < num_of_slices; j++) { + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t carry = (upc->slice[j].u.qw[i] & rotated_syndrome->qw[i]); + upc->slice[j].u.qw[i] ^= rotated_syndrome->qw[i]; + rotated_syndrome->qw[i] = carry; + } + } +} + +void bit_slice_full_subtract_avx512(OUT upc_t *upc, IN uint8_t val) +{ + // Borrow + uint64_t br[R_QWORDS] = {0}; + + for(size_t j = 0; j < SLICES; j++) { + + const uint64_t lsb_mask = 0 - (val & 0x1); + val >>= 1; + + // Perform a - b with c as the input/output carry + // br = 0 0 0 0 1 1 1 1 + // a = 0 0 1 1 0 0 1 1 + // b = 0 1 0 1 0 1 0 1 + // ------------------- + // o = 0 1 1 0 0 1 1 1 + // c = 0 1 0 0 1 1 0 1 + // + // o = a^b^c + // _ __ _ _ _ _ _ + // br = abc + abc + abc + abc = abc + ((a+b))c + + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t a = upc->slice[j].u.qw[i]; + const uint64_t b = lsb_mask; + const uint64_t tmp = ((~a) & b & (~br[i])) | ((((~a) | b) & br[i])); + upc->slice[j].u.qw[i] = a ^ b ^ br[i]; + br[i] = tmp; + } + } +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_internal.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_internal.h new file mode 100644 index 0000000000..817cc4603a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_internal.h @@ -0,0 +1,86 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "pq-crypto/s2n_pq.h" +#include "defs.h" +#include "types.h" + +// Rotate right the first R_BITS of a syndrome. +// At input, the syndrome is stored as three R_BITS triplicate. +// (this makes rotation easier to implement) +// For the output: the output syndrome has only one R_BITS rotation, the remaining +// (2 * R_BITS) bits are undefined. 
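+// The triplicated input layout is produced by the dup_* functions declared below.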
+void rotate_right_port(OUT syndrome_t *out, + IN const syndrome_t *in, + IN uint32_t bitscount); +void dup_port(IN OUT syndrome_t *s); +void bit_sliced_adder_port(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrome, + IN const size_t num_of_slices); +void bit_slice_full_subtract_port(OUT upc_t *upc, IN uint8_t val); + +#if defined(S2N_BIKE_R3_AVX2) +void rotate_right_avx2(OUT syndrome_t *out, + IN const syndrome_t *in, + IN uint32_t bitscount); +void dup_avx2(IN OUT syndrome_t *s); +void bit_sliced_adder_avx2(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrome, + IN const size_t num_of_slices); +void bit_slice_full_subtract_avx2(OUT upc_t *upc, IN uint8_t val); +#endif + +#if defined(S2N_BIKE_R3_AVX512) +void rotate_right_avx512(OUT syndrome_t *out, + IN const syndrome_t *in, + IN uint32_t bitscount); +void dup_avx512(IN OUT syndrome_t *s); +void bit_sliced_adder_avx512(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrome, + IN const size_t num_of_slices); +void bit_slice_full_subtract_avx512(OUT upc_t *upc, IN uint8_t val); +#endif + +// Decode methods struct +typedef struct decode_ctx_st { + void (*rotate_right)(OUT syndrome_t *out, + IN const syndrome_t *in, + IN uint32_t bitscount); + void (*dup)(IN OUT syndrome_t *s); + void (*bit_sliced_adder)(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrom, + IN const size_t num_of_slices); + void (*bit_slice_full_subtract)(OUT upc_t *upc, IN uint8_t val); +} decode_ctx; + +_INLINE_ void decode_ctx_init(decode_ctx *ctx) +{ +#if defined(S2N_BIKE_R3_AVX512) + if(s2n_bike_r3_is_avx512_enabled()) { + ctx->rotate_right = rotate_right_avx512; + ctx->dup = dup_avx512; + ctx->bit_sliced_adder = bit_sliced_adder_avx512; + ctx->bit_slice_full_subtract = bit_slice_full_subtract_avx512; + } else +#endif +#if defined(S2N_BIKE_R3_AVX2) + if(s2n_bike_r3_is_avx2_enabled()) { + ctx->rotate_right = rotate_right_avx2; + ctx->dup = dup_avx2; + ctx->bit_sliced_adder = bit_sliced_adder_avx2; + ctx->bit_slice_full_subtract = bit_slice_full_subtract_avx2; + } else +#endif + { + ctx->rotate_right = rotate_right_port; + ctx->dup = dup_port; + ctx->bit_sliced_adder = bit_sliced_adder_port; + ctx->bit_slice_full_subtract = bit_slice_full_subtract_port; + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_portable.c new file mode 100644 index 0000000000..846818386d --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_portable.c @@ -0,0 +1,126 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include "decode.h" +#include "decode_internal.h" +#include "utilities.h" + +#define R_QWORDS_HALF_LOG2 UPTOPOW2(R_QWORDS / 2) + +_INLINE_ void +rotr_big(OUT syndrome_t *out, IN const syndrome_t *in, IN size_t qw_num) +{ + // For preventing overflows (comparison in bytes) + bike_static_assert(sizeof(*out) > 8 * (R_QWORDS + (2 * R_QWORDS_HALF_LOG2)), + rotr_big_err); + + *out = *in; + + for(uint32_t idx = R_QWORDS_HALF_LOG2; idx >= 1; idx >>= 1) { + // Convert 32 bit mask to 64 bit mask + const uint64_t mask = ((uint32_t)secure_l32_mask(qw_num, idx) + 1U) - 1ULL; + qw_num = qw_num - (idx & u64_barrier(mask)); + + // Rotate R_QWORDS quadwords and another idx quadwords, + // as needed by the next iteration. 
+ for(size_t i = 0; i < (R_QWORDS + idx); i++) { + out->qw[i] = (out->qw[i] & u64_barrier(~mask)) | + (out->qw[i + idx] & u64_barrier(mask)); + } + } +} + +_INLINE_ void +rotr_small(OUT syndrome_t *out, IN const syndrome_t *in, IN const size_t bits) +{ + bike_static_assert(bits < 64, rotr_small_err); + bike_static_assert(sizeof(*out) > (8 * R_QWORDS), rotr_small_qw_err); + + // Convert |bits| to 0/1 by using !!bits; then create a mask of 0 or + // 0xffffffffff Use high_shift to avoid undefined behaviour when doing x << 64; + const uint64_t mask = (0 - (!!bits)); + const uint64_t high_shift = (64 - bits) & u64_barrier(mask); + + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t low_part = in->qw[i] >> bits; + const uint64_t high_part = (in->qw[i + 1] << high_shift) & u64_barrier(mask); + out->qw[i] = low_part | high_part; + } +} + +void rotate_right_port(OUT syndrome_t *out, + IN const syndrome_t *in, + IN const uint32_t bitscount) +{ + // Rotate (64-bit) quad-words + rotr_big(out, in, (bitscount / 64)); + // Rotate bits (less than 64) + rotr_small(out, out, (bitscount % 64)); +} + +// Duplicates the first R_BITS of the syndrome three times +// |------------------------------------------| +// | Third copy | Second copy | first R_BITS | +// |------------------------------------------| +// This is required by the rotate functions. +void dup_port(IN OUT syndrome_t *s) +{ + s->qw[R_QWORDS - 1] = + (s->qw[0] << LAST_R_QWORD_LEAD) | (s->qw[R_QWORDS - 1] & LAST_R_QWORD_MASK); + + for(size_t i = 0; i < (2 * R_QWORDS) - 1; i++) { + s->qw[R_QWORDS + i] = + (s->qw[i] >> LAST_R_QWORD_TRAIL) | (s->qw[i + 1] << LAST_R_QWORD_LEAD); + } +} + +// Use half-adder as described in [1]. +void bit_sliced_adder_port(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrome, + IN const size_t num_of_slices) +{ + // From cache-memory perspective this loop should be the outside loop + for(size_t j = 0; j < num_of_slices; j++) { + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t carry = (upc->slice[j].u.qw[i] & rotated_syndrome->qw[i]); + upc->slice[j].u.qw[i] ^= rotated_syndrome->qw[i]; + rotated_syndrome->qw[i] = carry; + } + } +} + +void bit_slice_full_subtract_port(OUT upc_t *upc, IN uint8_t val) +{ + // Borrow + uint64_t br[R_QWORDS] = {0}; + + for(size_t j = 0; j < SLICES; j++) { + + const uint64_t lsb_mask = 0 - (val & 0x1); + val >>= 1; + + // Perform a - b with c as the input/output carry + // br = 0 0 0 0 1 1 1 1 + // a = 0 0 1 1 0 0 1 1 + // b = 0 1 0 1 0 1 0 1 + // ------------------- + // o = 0 1 1 0 0 1 1 1 + // c = 0 1 0 0 1 1 0 1 + // + // o = a^b^c + // _ __ _ _ _ _ _ + // br = abc + abc + abc + abc = abc + ((a+b))c + + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t a = upc->slice[j].u.qw[i]; + const uint64_t b = lsb_mask; + const uint64_t tmp = ((~a) & b & (~br[i])) | ((((~a) | b) & br[i])); + upc->slice[j].u.qw[i] = a ^ b ^ br[i]; + br[i] = tmp; + } + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/defs.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/defs.h new file mode 100644 index 0000000000..ab3f5c7a32 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/defs.h @@ -0,0 +1,107 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +//////////////////////////////////////////// +// Basic defs +/////////////////////////////////////////// + +// For code clarity. 
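+// IN and OUT expand to nothing; they only document parameter direction.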
+#define IN +#define OUT + +#define ALIGN(n) __attribute__((aligned(n))) +#define BIKE_UNUSED_ATT __attribute__((unused)) + +#define _INLINE_ static inline + +// In asm the symbols '==' and '?' are not allowed. Therefore, if using +// divide_and_ceil in asm files, we must ensure with static_assert its validity. +#if(__cplusplus >= 201103L) || defined(static_assert) +# define bike_static_assert(COND, MSG) static_assert(COND, "MSG") +#else +# define bike_static_assert(COND, MSG) \ + typedef char static_assertion_##MSG[(COND) ? 1 : -1] BIKE_UNUSED_ATT +#endif + +// Divide by the divider and round up to next integer +#define DIVIDE_AND_CEIL(x, divider) (((x) + (divider) - 1) / (divider)) + +// Bit manipulations +// Linux Assemblies, except for Ubuntu, cannot understand what ULL mean. +// Therefore, in that case len must be smaller than 31. +#define BIT(len) (1ULL << (len)) +#define MASK(len) (BIT(len) - 1) +#define SIZEOF_BITS(b) (sizeof(b) * 8) + +#define BYTES_IN_QWORD 0x8 +#define BYTES_IN_XMM 0x10 +#define BYTES_IN_YMM 0x20 +#define BYTES_IN_ZMM 0x40 + +#define BITS_IN_YMM (BYTES_IN_YMM * 8) +#define BITS_IN_ZMM (BYTES_IN_ZMM * 8) + +#define WORDS_IN_YMM (BYTES_IN_YMM / sizeof(uint16_t)) +#define WORDS_IN_ZMM (BYTES_IN_ZMM / sizeof(uint16_t)) + +#define QWORDS_IN_XMM (BYTES_IN_XMM / sizeof(uint64_t)) +#define QWORDS_IN_YMM (BYTES_IN_YMM / sizeof(uint64_t)) +#define QWORDS_IN_ZMM (BYTES_IN_ZMM / sizeof(uint64_t)) + +// Copied from (Kaz answer) +// https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2 +#define UPTOPOW2_0(v) ((v)-1) +#define UPTOPOW2_1(v) (UPTOPOW2_0(v) | (UPTOPOW2_0(v) >> 1)) +#define UPTOPOW2_2(v) (UPTOPOW2_1(v) | (UPTOPOW2_1(v) >> 2)) +#define UPTOPOW2_3(v) (UPTOPOW2_2(v) | (UPTOPOW2_2(v) >> 4)) +#define UPTOPOW2_4(v) (UPTOPOW2_3(v) | (UPTOPOW2_3(v) >> 8)) +#define UPTOPOW2_5(v) (UPTOPOW2_4(v) | (UPTOPOW2_4(v) >> 16)) + +#define UPTOPOW2(v) (UPTOPOW2_5(v) + 1) + +// Works only for 0 < v < 512 +#define LOG2_MSB(v) \ + ((v) == 0 \ + ? 0 \ + : ((v) < 2 \ + ? 1 \ + : ((v) < 4 \ + ? 2 \ + : ((v) < 8 \ + ? 3 \ + : ((v) < 16 \ + ? 4 \ + : ((v) < 32 \ + ? 5 \ + : ((v) < 64 \ + ? 6 \ + : ((v) < 128 ? 7 \ + : ((v) < 256 ? 8 : 9))))))))) + +//////////////////////////////////////////// +// Debug +/////////////////////////////////////////// + +#if defined(VERBOSE) +# include <stdio.h> + +# define DMSG(...) \ + { \ + printf(__VA_ARGS__); \ + } +#else +# define DMSG(...) +#endif + +//////////////////////////////////////////// +// Printing +/////////////////////////////////////////// +//#define PRINT_IN_BE +//#define NO_SPACE +//#define NO_NEWLINE diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.c new file mode 100644 index 0000000000..9f779b7df9 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.c @@ -0,0 +1,10 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include "error.h" + +__thread _bike_err_t bike_errno; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.h new file mode 100644 index 0000000000..b1b9db6d5e --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.h @@ -0,0 +1,33 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "utils/s2n_safety.h" + +#define SUCCESS 0 +#define FAIL (-1) + +#define ret_t int __attribute__((warn_unused_result)) + +enum _bike_err +{ + E_DECODING_FAILURE = 1, + E_AES_CTR_PRF_INIT_FAIL = 2, + E_AES_OVER_USED = 3, + EXTERNAL_LIB_ERROR_OPENSSL = 4, + E_FAIL_TO_GET_SEED = 5 +}; + +typedef enum _bike_err _bike_err_t; + +extern __thread _bike_err_t bike_errno; +#define BIKE_ERROR(x) \ + do { \ + bike_errno = (x); \ + return FAIL; \ + } while(0) diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x.h new file mode 100644 index 0000000000..f4cdb53a80 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x.h @@ -0,0 +1,29 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "types.h" + +// c = a+b mod (x^r - 1) +_INLINE_ void +gf2x_mod_add(OUT pad_r_t *c, IN const pad_r_t *a, IN const pad_r_t *b) +{ + const uint64_t *a_qwords = (const uint64_t *)a; + const uint64_t *b_qwords = (const uint64_t *)b; + uint64_t * c_qwords = (uint64_t *)c; + + for(size_t i = 0; i < R_PADDED_QWORDS; i++) { + c_qwords[i] = a_qwords[i] ^ b_qwords[i]; + } +} + +// c = a*b mod (x^r - 1) +void gf2x_mod_mul(OUT pad_r_t *c, IN const pad_r_t *a, IN const pad_r_t *b); + +// c = a^-1 mod (x^r - 1) +void gf2x_mod_inv(OUT pad_r_t *c, IN const pad_r_t *a); diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_internal.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_internal.h new file mode 100644 index 0000000000..a87478aba1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_internal.h @@ -0,0 +1,177 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +// For size_t +#include <stdlib.h> + +#include "pq-crypto/s2n_pq.h" +#include "types.h" + +// The size in quadwords of the operands in the gf2x_mul_base function +// for different implementations. +#define GF2X_PORT_BASE_QWORDS (1) +#define GF2X_PCLMUL_BASE_QWORDS (8) +#define GF2X_VPCLMUL_BASE_QWORDS (16) + +// ------------------ FUNCTIONS NEEDED FOR GF2X MULTIPLICATION ------------------ +// GF2X multiplication of a and b of size GF2X_BASE_QWORDS, c = a * b +void gf2x_mul_base_port(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b); +void karatzuba_add1_port(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len); +void karatzuba_add2_port(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len); +void karatzuba_add3_port(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len); + +// -------------------- FUNCTIONS NEEDED FOR GF2X INVERSION -------------------- +// c = a^2 +void gf2x_sqr_port(OUT dbl_pad_r_t *c, IN const pad_r_t *a); +// The k-squaring function computes c = a^(2^k) % (x^r - 1), +// It is required by inversion, where l_param is derived from k. 
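+// Specifically, l_param = (2^-k) % r (see the k_sqr_* implementations).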
+void k_sqr_port(OUT pad_r_t *c, IN const pad_r_t *a, IN size_t l_param); +// c = a mod (x^r - 1) +void gf2x_red_port(OUT pad_r_t *c, IN const dbl_pad_r_t *a); + +// AVX2 versions of the functions +#if defined(S2N_BIKE_R3_AVX2) +void karatzuba_add1_avx2(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len); +void karatzuba_add2_avx2(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len); +void karatzuba_add3_avx2(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len); +void k_sqr_avx2(OUT pad_r_t *c, IN const pad_r_t *a, IN size_t l_param); +void gf2x_red_avx2(OUT pad_r_t *c, IN const dbl_pad_r_t *a); +#endif + +// AVX512 versions of the functions +#if defined(S2N_BIKE_R3_AVX512) +void karatzuba_add1_avx512(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len); +void karatzuba_add2_avx512(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len); +void karatzuba_add3_avx512(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len); +void k_sqr_avx512(OUT pad_r_t *c, IN const pad_r_t *a, IN size_t l_param); +void gf2x_red_avx512(OUT pad_r_t *c, IN const dbl_pad_r_t *a); +#endif + +// PCLMUL based multiplication +#if defined(S2N_BIKE_R3_PCLMUL) +void gf2x_mul_base_pclmul(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b); +void gf2x_sqr_pclmul(OUT dbl_pad_r_t *c, IN const pad_r_t *a); +#endif + +// VPCLMUL based multiplication +#if defined(S2N_BIKE_R3_VPCLMUL) +void gf2x_mul_base_vpclmul(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b); +void gf2x_sqr_vpclmul(OUT dbl_pad_r_t *c, IN const pad_r_t *a); +#endif + +// GF2X methods struct +typedef struct gf2x_ctx_st { + size_t mul_base_qwords; + void (*mul_base)(OUT uint64_t *c, IN const uint64_t *a, IN const uint64_t *b); + void (*karatzuba_add1)(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len); + void (*karatzuba_add2)(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len); + void (*karatzuba_add3)(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len); + + void (*sqr)(OUT dbl_pad_r_t *c, IN const pad_r_t *a); + void (*k_sqr)(OUT pad_r_t *c, IN const pad_r_t *a, IN size_t l_param); + + void (*red)(OUT pad_r_t *c, IN const dbl_pad_r_t *a); +} gf2x_ctx; + +// Used in gf2x_inv.c to avoid initializing the context many times. 
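+// gf2x_mod_mul (gf2x_mul.c) is the public wrapper that initializes a fresh ctx
+// on every call.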
+void gf2x_mod_mul_with_ctx(OUT pad_r_t *c, + IN const pad_r_t *a, + IN const pad_r_t *b, + IN const gf2x_ctx *ctx); + +_INLINE_ void gf2x_ctx_init(gf2x_ctx *ctx) +{ +#if defined(S2N_BIKE_R3_AVX512) + if(s2n_bike_r3_is_avx512_enabled()) { + ctx->karatzuba_add1 = karatzuba_add1_avx512; + ctx->karatzuba_add2 = karatzuba_add2_avx512; + ctx->karatzuba_add3 = karatzuba_add3_avx512; + ctx->k_sqr = k_sqr_avx512; + ctx->red = gf2x_red_avx512; + } else +#endif +#if defined(S2N_BIKE_R3_AVX2) + if(s2n_bike_r3_is_avx2_enabled()) { + ctx->karatzuba_add1 = karatzuba_add1_avx2; + ctx->karatzuba_add2 = karatzuba_add2_avx2; + ctx->karatzuba_add3 = karatzuba_add3_avx2; + ctx->k_sqr = k_sqr_avx2; + ctx->red = gf2x_red_avx2; + } else +#endif + { + ctx->karatzuba_add1 = karatzuba_add1_port; + ctx->karatzuba_add2 = karatzuba_add2_port; + ctx->karatzuba_add3 = karatzuba_add3_port; + ctx->k_sqr = k_sqr_port; + ctx->red = gf2x_red_port; + } + +#if defined(S2N_BIKE_R3_VPCLMUL) + if(s2n_bike_r3_is_vpclmul_enabled()) { + ctx->mul_base_qwords = GF2X_VPCLMUL_BASE_QWORDS; + ctx->mul_base = gf2x_mul_base_vpclmul; + ctx->sqr = gf2x_sqr_vpclmul; + } else +#endif +#if defined(S2N_BIKE_R3_PCLMUL) + if(s2n_bike_r3_is_pclmul_enabled()) { + ctx->mul_base_qwords = GF2X_PCLMUL_BASE_QWORDS; + ctx->mul_base = gf2x_mul_base_pclmul; + ctx->sqr = gf2x_sqr_pclmul; + } else +#endif + { + ctx->mul_base_qwords = GF2X_PORT_BASE_QWORDS; + ctx->mul_base = gf2x_mul_base_port; + ctx->sqr = gf2x_sqr_port; + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_inv.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_inv.c new file mode 100644 index 0000000000..bea7ee84b1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_inv.c @@ -0,0 +1,156 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * The inversion algorithm in this file is based on: + * [1] Nir Drucker, Shay Gueron, and Dusan Kostic. 2020. "Fast polynomial + * inversion for post quantum QC-MDPC cryptography". Cryptology ePrint Archive, + * 2020. https://eprint.iacr.org/2020/298.pdf + */ + +#include "cleanup.h" +#include "gf2x.h" +#include "gf2x_internal.h" + +// a = a^2 mod (x^r - 1) +_INLINE_ void gf2x_mod_sqr_in_place(IN OUT pad_r_t *a, + OUT dbl_pad_r_t *secure_buffer, + IN const gf2x_ctx *ctx) +{ + ctx->sqr(secure_buffer, a); + ctx->red(a, secure_buffer); +} + +// c = a^2^2^num_sqrs +_INLINE_ void repeated_squaring(OUT pad_r_t *c, + IN pad_r_t * a, + IN const size_t num_sqrs, + OUT dbl_pad_r_t *sec_buf, + IN const gf2x_ctx *ctx) +{ + c->val = a->val; + + for(size_t i = 0; i < num_sqrs; i++) { + gf2x_mod_sqr_in_place(c, sec_buf, ctx); + } +} + +// The gf2x_mod_inv function implements inversion in F_2[x]/(x^R - 1) +// based on [1](Algorithm 2). + +// In every iteration, [1](Algorithm 2) performs two exponentiations: +// exponentiation 0 (exp0) and exponentiation 1 (exp1) of the form f^(2^k). +// These exponentiations are computed either by repeated squaring of f, k times, +// or by a single k-squaring of f. The method for a specific value of k +// is chosen based on the performance of squaring and k-squaring. +// +// Benchmarks on several platforms indicate that a good threshold +// for switching from repeated squaring to k-squaring is k = 64. 
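+// That is, f^(2^k) is computed with k consecutive squarings when k <= 64, and
+// with a single bit-permutation k-squaring otherwise (see gf2x_mod_inv below).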
+#define K_SQR_THR (64) + +// k-squaring is computed by a permutation of bits of the input polynomial, +// as defined in [1](Observation 1). The required parameter for the permutation +// is l = (2^k)^-1 % R. +// Therefore, there are two sets of parameters for every exponentiation: +// - exp0_k and exp1_k +// - exp0_l and exp1_l + +// Exponentiation 0 computes f^2^2^(i-1) for 0 < i < MAX_I. +// Exponentiation 1 computes f^2^((r-2) % 2^i) for 0 < i < MAX_I, +// only when the i-th bit of (r-2) is 1. Therefore, the value 0 in +// exp1_k[i] and exp1_l[i] means that exp1 is skipped in i-th iteration. + +// To quickly generate all the required parameters in Sage: +// r = DESIRED_R +// max_i = floor(log(r-2, 2)) + 1 +// exp0_k = [2^i for i in range(max_i)] +// exp0_l = [inverse_mod((2^k) % r, r) for k in exp0_k] +// exp1_k = [(r-2)%(2^i) if ((r-2) & (1<<i)) else 0 for i in range(max_i)] +// exp1_l = [inverse_mod((2^k) % r, r) if k != 0 else 0 for k in exp1_k] + +#if(LEVEL == 1) +// The parameters below are hard-coded for R=12323 +bike_static_assert((R_BITS == 12323), gf2x_inv_r_doesnt_match_parameters); + +// MAX_I = floor(log(r-2)) + 1 +# define MAX_I (14) +# define EXP0_K_VALS \ + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192 +# define EXP0_L_VALS \ + 6162, 3081, 3851, 5632, 22, 484, 119, 1838, 1742, 3106, 10650, 1608, 10157, \ + 8816 +# define EXP1_K_VALS 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 33, 4129 +# define EXP1_L_VALS 0, 0, 0, 0, 0, 6162, 0, 0, 0, 0, 0, 0, 242, 5717 + +#else +// The parameters below are hard-coded for R=24659 +bike_static_assert((R_BITS == 24659), gf2x_inv_r_doesnt_match_parameters); + +// MAX_I = floor(log(r-2)) + 1 +# define MAX_I (15) +# define EXP0_K_VALS \ + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384 +# define EXP0_L_VALS \ + 12330, 6165, 7706, 3564, 2711, 1139, 15053, 1258, 4388, 20524, 9538, 6393, \ + 10486, 1715, 6804 +# define EXP1_K_VALS 0, 0, 0, 0, 1, 0, 17, 0, 0, 0, 0, 0, 0, 81, 8273 +# define EXP1_L_VALS 0, 0, 0, 0, 12330, 0, 13685, 0, 0, 0, 0, 0, 0, 23678, 19056 + +#endif + +// Inversion in F_2[x]/(x^R - 1), [1](Algorithm 2). +// c = a^{-1} mod x^r-1 +void gf2x_mod_inv(OUT pad_r_t *c, IN const pad_r_t *a) +{ + // Initialize gf2x methods struct + gf2x_ctx ctx = {0}; + gf2x_ctx_init(&ctx); + + // Note that exp0/1_k/l are predefined constants that depend only on the value + // of R. This value is public. Therefore, branches in this function, which + // depends on R, are also "public". Code that releases these branches + // (taken/not-taken) does not leak secret information. 
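+  // Worked example (LEVEL 1, r = 12323): r - 2 = 12321 = 2^13 + 2^12 + 2^5 + 2^0,
+  // so exponentiation 1 runs only in iterations i = 5, 12 and 13 of the loop
+  // below (the non-zero entries of exp1_k).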
+ const size_t exp0_k[MAX_I] = {EXP0_K_VALS}; + const size_t exp0_l[MAX_I] = {EXP0_L_VALS}; + const size_t exp1_k[MAX_I] = {EXP1_K_VALS}; + const size_t exp1_l[MAX_I] = {EXP1_L_VALS}; + + DEFER_CLEANUP(pad_r_t f = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t g = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t t = {0}, pad_r_cleanup); + DEFER_CLEANUP(dbl_pad_r_t sec_buf = {0}, dbl_pad_r_cleanup); + + // Steps 2 and 3 in [1](Algorithm 2) + f.val = a->val; + t.val = a->val; + + for(size_t i = 1; i < MAX_I; i++) { + // Step 5 in [1](Algorithm 2), exponentiation 0: g = f^2^2^(i-1) + if(exp0_k[i - 1] <= K_SQR_THR) { + repeated_squaring(&g, &f, exp0_k[i - 1], &sec_buf, &ctx); + } else { + ctx.k_sqr(&g, &f, exp0_l[i - 1]); + } + + // Step 6, [1](Algorithm 2): f = f*g + gf2x_mod_mul_with_ctx(&f, &g, &f, &ctx); + + if(exp1_k[i] != 0) { + // Step 8, [1](Algorithm 2), exponentiation 1: g = f^2^((r-2) % 2^i) + if(exp1_k[i] <= K_SQR_THR) { + repeated_squaring(&g, &f, exp1_k[i], &sec_buf, &ctx); + } else { + ctx.k_sqr(&g, &f, exp1_l[i]); + } + + // Step 9, [1](Algorithm 2): t = t*g; + gf2x_mod_mul_with_ctx(&t, &g, &t, &ctx); + } + } + + // Step 10, [1](Algorithm 2): c = t^2 + gf2x_mod_sqr_in_place(&t, &sec_buf, &ctx); + c->val = t.val; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx2.c new file mode 100644 index 0000000000..91ed73d3f2 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx2.c @@ -0,0 +1,188 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * The k-squaring algorithm in this file is based on: + * [1] Nir Drucker, Shay Gueron, and Dusan Kostic. 2020. "Fast polynomial + * inversion for post quantum QC-MDPC cryptography". Cryptology ePrint Archive, + * 2020. https://eprint.iacr.org/2020/298.pdf + */ + +#if defined(S2N_BIKE_R3_AVX2) + +#include "cleanup.h" +#include "gf2x_internal.h" + +#define AVX2_INTERNAL +#include "x86_64_intrinsic.h" + +#define NUM_YMMS (2) +#define NUM_OF_VALS (NUM_YMMS * WORDS_IN_YMM) + +_INLINE_ void generate_map(OUT uint16_t *map, IN const uint16_t l_param) +{ + __m256i vmap[NUM_YMMS], vtmp[NUM_YMMS], vr, inc, zero; + + // The permutation map is generated in the following way: + // 1. for i = 0 to map size: + // 2. map[i] = (i * l_param) % r + // However, to avoid the expensive multiplication and modulo operations + // we modify the algorithm to: + // 1. map[0] = l_param + // 2. for i = 1 to map size: + // 3. map[i] = map[i - 1] + l_param + // 4. if map[i] >= r: + // 5. map[i] = map[i] - r + // This algorithm is parallelized with vector instructions by processing + // certain number of values (NUM_OF_VALS) in parallel. Therefore, + // in the beginning we need to initialize the first NUM_OF_VALS elements. + for(size_t i = 0; i < NUM_OF_VALS; i++) { + map[i] = (i * l_param) % R_BITS; + } + + vr = SET1_I16(R_BITS); + zero = SET_ZERO; + + // Set the increment vector such that adding it to vmap vectors + // gives the next NUM_OF_VALS elements of the map. AVX2 does not + // support comparison of vectors where vector elements are considered + // as unsigned integers. 
This is a problem when r > 2^14 because + // sum of two values can be greater than 2^15 which would make the it + // a negative number when considered as a signed 16-bit integer, + // and therefore, the condition in step 4 of the algorithm would be + // evaluated incorrectly. So, we use the following trick: + // we subtract R from the increment and modify the algorithm: + // 1. map[0] = l_param + // 2. for i = 1 to map size: + // 3. map[i] = map[i - 1] + (l_param - r) + // 4. if map[i] < 0: + // 5. map[i] = map[i] + r + inc = SET1_I16((l_param * NUM_OF_VALS) % R_BITS); + inc = SUB_I16(inc, vr); + + // Load the first NUM_OF_VALS elements in the vmap vectors + for(size_t i = 0; i < NUM_YMMS; i++) { + vmap[i] = LOAD(&map[i * WORDS_IN_YMM]); + } + + for(size_t i = NUM_YMMS; i < (R_PADDED / WORDS_IN_YMM); i += NUM_YMMS) { + for(size_t j = 0; j < NUM_YMMS; j++) { + vmap[j] = ADD_I16(vmap[j], inc); + vtmp[j] = CMPGT_I16(zero, vmap[j]); + vmap[j] = ADD_I16(vmap[j], vtmp[j] & vr); + + STORE(&map[(i + j) * WORDS_IN_YMM], vmap[j]); + } + } +} + +// Convert from bytes representation, where every byte holds a single bit, +// of the polynomial, to a binary representation where every byte +// holds 8 bits of the polynomial. +_INLINE_ void bytes_to_bin(OUT pad_r_t *bin_buf, IN const uint8_t *bytes_buf) +{ + uint32_t *bin32 = (uint32_t *)bin_buf; + + for(size_t i = 0; i < R_QWORDS * 2; i++) { + __m256i t = LOAD(&bytes_buf[i * BYTES_IN_YMM]); + bin32[i] = MOVEMASK(t); + } +} + +// Convert from binary representation where every byte holds 8 bits +// of the polynomial, to byte representation where +// every byte holds a single bit of the polynomial. +_INLINE_ void bin_to_bytes(OUT uint8_t *bytes_buf, IN const pad_r_t *bin_buf) +{ + // The algorithm works by taking every 32 bits of the input and converting + // them to 32 bytes where each byte holds one of the bits. The first step is + // to broadcast a 32-bit value (call it a) to all elements of vector t. + // Then t contains bytes of a in the following order: + // t = [ a3 a2 a1 a0 ... a3 a2 a1 a0 ] + // where a0 contains the first 8 bits of a, a1 the second 8 bits, etc. + // Let the output vector be [ out31 out30 ... out0 ]. We want to store + // bit 0 of a in out0 byte, bit 1 of a in out1 byte, ect. (note that + // we want to store the bit in the most significant position of a byte + // because this is required by MOVEMASK instruction used in bytes_to_bin.) + // + // Ideally, we would shuffle the bytes of t such that the byte in + // i-th position contains i-th bit of val, shift t appropriately and obtain + // the result. However, AVX2 doesn't support shift operation on bytes, only + // shifts of individual QWORDS (64 bit) and DWORDS (32 bit) are allowed. + // Consider the two least significant DWORDS of t: + // t = [ ... | a3 a2 a1 a0 | a3 a2 a1 a0 ] + // and shift them by 6 and 4 to the left, respectively, to obtain: + // t = [ ... | t7 t6 t5 t4 | t3 t2 t1 t0 ] + // where t3 = a3 << 6, t2 = a2 << 6, t1 = a1 << 6, t0 = a0 << 6, + // and t7 = a3 << 4, t6 = a2 << 4, t5 = a1 << 4, t4 = a0 << 4. + // Now we shuffle vector t to obtain vector p such that: + // p = [ ... | t12 t12 t8 t8 | t4 t4 t0 t0 ] + // Note that in every even position of the vector p we have the right byte + // of the input shifted by the required shift. The values in the odd + // positions contain the right bytes of the input but they need to be shifted + // one more time to the left by 1. By shifting each DWORD of p by 1 we get: + // q = [ ... 
| p7 p6 p5 p4 | p3 p2 p1 p0 ] + // where p1 = t0 << 1 = a0 << 7, p3 = t4 << 1 = 5, etc. Therefore, by + // blending p and q (taking even positions from p and odd positions from q) + // we obtain the desired result. + + __m256i t, p, q; + + const __m256i shift_mask = SET_I32(0, 2, 4, 6, 0, 2, 4, 6); + + const __m256i shuffle_mask = + SET_I8(15, 15, 11, 11, 7, 7, 3, 3, 14, 14, 10, 10, 6, 6, 2, 2, 13, 13, 9, 9, + 5, 5, 1, 1, 12, 12, 8, 8, 4, 4, 0, 0); + + const __m256i blend_mask = SET1_I16(0x00ff); + + const uint32_t *bin32 = (const uint32_t *)bin_buf; + + for(size_t i = 0; i < R_QWORDS * 2; i++) { + t = SET1_I32(bin32[i]); + t = SLLV_I32(t, shift_mask); + + p = SHUF_I8(t, shuffle_mask); + q = SLLI_I32(p, 1); + + STORE(&bytes_buf[i * 32], BLENDV_I8(p, q, blend_mask)); + } +} + +// The k-squaring function computes c = a^(2^k) % (x^r - 1). +// By [1](Observation 1), if +// a = sum_{j in supp(a)} x^j, +// then +// a^(2^k) % (x^r - 1) = sum_{j in supp(a)} x^((j * 2^k) % r). +// Therefore, k-squaring can be computed as permutation of the bits of "a": +// pi0 : j --> (j * 2^k) % r. +// For improved performance, we compute the result by inverted permutation pi1: +// pi1 : (j * 2^-k) % r --> j. +// Input argument l_param is defined as the value (2^-k) % r. +void k_sqr_avx2(OUT pad_r_t *c, IN const pad_r_t *a, IN const size_t l_param) +{ + ALIGN(ALIGN_BYTES) uint16_t map[R_PADDED]; + ALIGN(ALIGN_BYTES) uint8_t a_bytes[R_PADDED]; + ALIGN(ALIGN_BYTES) uint8_t c_bytes[R_PADDED] = {0}; + + // Generate the permutation map defined by pi1 and l_param. + generate_map(map, l_param); + + bin_to_bytes(a_bytes, a); + + // Permute "a" using the generated permutation map. + for(size_t i = 0; i < R_BITS; i++) { + c_bytes[i] = a_bytes[map[i]]; + } + + bytes_to_bin(c, c_bytes); + + secure_clean(a_bytes, sizeof(a_bytes)); + secure_clean(c_bytes, sizeof(c_bytes)); +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx512.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx512.c new file mode 100644 index 0000000000..af2c5738a8 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx512.c @@ -0,0 +1,135 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * The k-squaring algorithm in this file is based on: + * [1] Nir Drucker, Shay Gueron, and Dusan Kostic. 2020. "Fast polynomial + * inversion for post quantum QC-MDPC cryptography". Cryptology ePrint Archive, + * 2020. https://eprint.iacr.org/2020/298.pdf + */ + +#if defined(S2N_BIKE_R3_AVX512) + +#include "cleanup.h" +#include "gf2x_internal.h" + +#define AVX512_INTERNAL +#include "x86_64_intrinsic.h" + +#define NUM_ZMMS (2) +#define NUM_OF_VALS (NUM_ZMMS * WORDS_IN_ZMM) + +// clang-3.9 doesn't recognize these two macros +#if !defined(_MM_CMPINT_EQ) +# define _MM_CMPINT_EQ (0) +#endif + +#if !defined(_MM_CMPINT_NLT) +# define _MM_CMPINT_NLT (5) +#endif + +_INLINE_ void generate_map(OUT uint16_t *map, IN const size_t l_param) +{ + __m512i vmap[NUM_ZMMS], vr, inc; + __mmask32 mask[NUM_ZMMS]; + + // The permutation map is generated in the following way: + // 1. for i = 0 to map size: + // 2. map[i] = (i * l_param) % r + // However, to avoid the expensive multiplication and modulo operations + // we modify the algorithm to: + // 1. map[0] = l_param + // 2. 
for i = 1 to map size: + // 3. map[i] = map[i - 1] + l_param + // 4. if map[i] >= r: + // 5. map[i] = map[i] - r + // This algorithm is parallelized with vector instructions by processing + // certain number of values (NUM_OF_VALS) in parallel. Therefore, + // in the beginning we need to initialize the first NUM_OF_VALS elements. + for(size_t i = 0; i < NUM_OF_VALS; i++) { + map[i] = (i * l_param) % R_BITS; + } + + // Set the increment vector such that by adding it to vmap vectors + // we will obtain the next NUM_OF_VALS elements of the map. + inc = SET1_I16((l_param * NUM_OF_VALS) % R_BITS); + vr = SET1_I16(R_BITS); + + // Load the first NUM_OF_VALS elements in the vmap vectors + for(size_t i = 0; i < NUM_ZMMS; i++) { + vmap[i] = LOAD(&map[i * WORDS_IN_ZMM]); + } + + for(size_t i = NUM_ZMMS; i < (R_PADDED / WORDS_IN_ZMM); i += NUM_ZMMS) { + for(size_t j = 0; j < NUM_ZMMS; j++) { + vmap[j] = ADD_I16(vmap[j], inc); + mask[j] = CMPM_U16(vmap[j], vr, _MM_CMPINT_NLT); + vmap[j] = MSUB_I16(vmap[j], mask[j], vmap[j], vr); + + STORE(&map[(i + j) * WORDS_IN_ZMM], vmap[j]); + } + } +} + +// Convert from bytes representation where each byte holds a single bit +// to binary representation where each byte holds 8 bits of the polynomial +_INLINE_ void bytes_to_bin(OUT pad_r_t *bin_buf, IN const uint8_t *bytes_buf) +{ + uint64_t *bin64 = (uint64_t *)bin_buf; + + __m512i first_bit_mask = SET1_I8(1); + for(size_t i = 0; i < R_QWORDS; i++) { + __m512i t = LOAD(&bytes_buf[i * BYTES_IN_ZMM]); + bin64[i] = CMPM_U8(t, first_bit_mask, _MM_CMPINT_EQ); + } +} + +// Convert from binary representation where each byte holds 8 bits +// to byte representation where each byte holds a single bit of the polynomial +_INLINE_ void bin_to_bytes(OUT uint8_t *bytes_buf, IN const pad_r_t *bin_buf) +{ + const uint64_t *bin64 = (const uint64_t *)bin_buf; + + for(size_t i = 0; i < R_QWORDS; i++) { + __m512i t = SET1MZ_I8(bin64[i], 1); + STORE(&bytes_buf[i * BYTES_IN_ZMM], t); + } +} + +// The k-squaring function computes c = a^(2^k) % (x^r - 1), +// By [1](Observation 1), if +// a = sum_{j in supp(a)} x^j, +// then +// a^(2^k) % (x^r - 1) = sum_{j in supp(a)} x^((j * 2^k) % r). +// Therefore, k-squaring can be computed as permutation of the bits of "a": +// pi0 : j --> (j * 2^k) % r. +// For improved performance, we compute the result by inverted permutation pi1: +// pi1 : (j * 2^-k) % r --> j. +// Input argument l_param is defined as the value (2^-k) % r. +void k_sqr_avx512(OUT pad_r_t *c, IN const pad_r_t *a, IN const size_t l_param) +{ + ALIGN(ALIGN_BYTES) uint16_t map[R_PADDED]; + ALIGN(ALIGN_BYTES) uint8_t a_bytes[R_PADDED]; + ALIGN(ALIGN_BYTES) uint8_t c_bytes[R_PADDED] = {0}; + + // Generate the permutation map defined by pi1 and l_param. + generate_map(map, l_param); + + bin_to_bytes(a_bytes, a); + + // Permute "a" using the generated permutation map. + for(size_t i = 0; i < R_BITS; i++) { + c_bytes[i] = a_bytes[map[i]]; + } + + bytes_to_bin(c, c_bytes); + + secure_clean(a_bytes, sizeof(a_bytes)); + secure_clean(c_bytes, sizeof(c_bytes)); +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_portable.c new file mode 100644 index 0000000000..c757687f58 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_portable.c @@ -0,0 +1,48 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * The k-squaring algorithm in this file is based on: + * [1] Nir Drucker, Shay Gueron, and Dusan Kostic. 2020. "Fast polynomial + * inversion for post quantum QC-MDPC cryptography". Cryptology ePrint Archive, + * 2020. https://eprint.iacr.org/2020/298.pdf + */ + +#include "gf2x_internal.h" +#include "utilities.h" + +#define BITS_IN_BYTE (8) + +// The k-squaring function computes c = a^(2^k) % (x^r - 1), +// By [1](Observation 1), if +// a = sum_{j in supp(a)} x^j, +// then +// a^(2^k) % (x^r - 1) = sum_{j in supp(a)} x^((j * 2^k) % r). +// Therefore, k-squaring can be computed as permutation of the bits of "a": +// pi0 : j --> (j * 2^k) % r. +// For improved performance, we compute the result by inverted permutation pi1: +// pi1 : (j * 2^-k) % r --> j. +// Input argument l_param is defined as the value (2^-k) % r. +void k_sqr_port(OUT pad_r_t *c, IN const pad_r_t *a, IN const size_t l_param) +{ + bike_memset(c->val.raw, 0, sizeof(c->val)); + + // Compute the result byte by byte + size_t idx = 0; + for(size_t i = 0; i < R_BYTES; i++) { + for(size_t j = 0; j < BITS_IN_BYTE; j++, idx++) { + // Bit of "c" at position idx is set to the value of + // the bit of "a" at position pi1(idx) = (l_param * idx) % R_BITS. + size_t pos = (l_param * idx) % R_BITS; + + size_t pos_byte = pos >> 3; + size_t pos_bit = pos & 7; + uint8_t bit = (a->val.raw[pos_byte] >> pos_bit) & 1; + + c->val.raw[i] |= (bit << j); + } + } + c->val.raw[R_BYTES - 1] &= LAST_R_BYTE_MASK; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul.c new file mode 100644 index 0000000000..ae1d7a510a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul.c @@ -0,0 +1,113 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include <assert.h> + +#include "cleanup.h" +#include "gf2x.h" +#include "gf2x_internal.h" + +// The secure buffer size required for Karatsuba is computed by: +// size(n) = 3*n/2 + size(n/2) = 3*sum_{i}{n/2^i} < 3n +#define SECURE_BUFFER_QWORDS (3 * R_PADDED_QWORDS) + +// Karatsuba multiplication algorithm. +// Input arguments a and b are padded with zeros, here: +// - n: real number of digits in a and b (R_QWORDS) +// - n_padded: padded number of digits of a and b (assumed to be power of 2) +// A buffer sec_buf is used for storing temporary data between recursion calls. +// It might contain secrets, and therefore should be securely cleaned after +// completion. 
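// [Editor's note: illustrative arithmetic only, not part of the upstream
// sources.] Unrolling the size recurrence above for a hypothetical padded
// length of n = 16 qwords shows why 3*n is a safe bound:
//   size(16) = 24 + size(8) = 24 + 12 + size(4) = 24 + 12 + 6 + size(2)
//            = 24 + 12 + 6 + 3 + size(1) = 45 + size(1) < 48 = 3 * 16.
// Each recursion level needs three scratch blocks of half the current
// length (alah, blbh and tmp in the function below), which is the 3*n/2 term.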
+_INLINE_ void karatzuba(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len, + IN const size_t qwords_len_pad, + uint64_t * sec_buf, + IN const gf2x_ctx *ctx) +{ + if(qwords_len <= ctx->mul_base_qwords) { + ctx->mul_base(c, a, b); + return; + } + + const size_t half_qw_len = qwords_len_pad >> 1; + + // Split a and b into low and high parts of size n_padded/2 + const uint64_t *a_lo = a; + const uint64_t *b_lo = b; + const uint64_t *a_hi = &a[half_qw_len]; + const uint64_t *b_hi = &b[half_qw_len]; + + // Split c into 4 parts of size n_padded/2 (the last ptr is not needed) + uint64_t *c0 = c; + uint64_t *c1 = &c[half_qw_len]; + uint64_t *c2 = &c[half_qw_len * 2]; + + // Allocate 3 ptrs of size n_padded/2 on sec_buf + uint64_t *alah = sec_buf; + uint64_t *blbh = &sec_buf[half_qw_len]; + uint64_t *tmp = &sec_buf[half_qw_len * 2]; + + // Move sec_buf ptr to the first free location for the next recursion call + sec_buf = &sec_buf[half_qw_len * 3]; + + // Compute a_lo*b_lo and store the result in (c1|c0) + karatzuba(c0, a_lo, b_lo, half_qw_len, half_qw_len, sec_buf, ctx); + + // If the real number of digits n is less or equal to n_padded/2 then: + // a_hi = 0 and b_hi = 0 + // and + // (a_hi|a_lo)*(b_hi|b_lo) = a_lo*b_lo + // so we can skip the remaining two multiplications + if(qwords_len > half_qw_len) { + // Compute a_hi*b_hi and store the result in (c3|c2) + karatzuba(c2, a_hi, b_hi, qwords_len - half_qw_len, half_qw_len, sec_buf, + ctx); + + // Compute alah = (a_lo + a_hi) and blbh = (b_lo + b_hi) + ctx->karatzuba_add1(alah, blbh, a, b, half_qw_len); + + // Compute (c1 + c2) and store the result in tmp + ctx->karatzuba_add2(tmp, c1, c2, half_qw_len); + + // Compute alah*blbh and store the result in (c2|c1) + karatzuba(c1, alah, blbh, half_qw_len, half_qw_len, sec_buf, ctx); + + // Add (tmp|tmp) and (c3|c0) to (c2|c1) + ctx->karatzuba_add3(c0, tmp, half_qw_len); + } +} + +void gf2x_mod_mul_with_ctx(OUT pad_r_t *c, + IN const pad_r_t *a, + IN const pad_r_t *b, + IN const gf2x_ctx *ctx) +{ + bike_static_assert((R_PADDED_BYTES % 2 == 0), karatzuba_n_is_odd); + + DEFER_CLEANUP(dbl_pad_r_t t = {0}, dbl_pad_r_cleanup); + ALIGN(ALIGN_BYTES) uint64_t secure_buffer[SECURE_BUFFER_QWORDS]; + + karatzuba((uint64_t *)&t, (const uint64_t *)a, (const uint64_t *)b, R_QWORDS, + R_PADDED_QWORDS, secure_buffer, ctx); + + ctx->red(c, &t); + + secure_clean((uint8_t *)secure_buffer, sizeof(secure_buffer)); +} + +void gf2x_mod_mul(OUT pad_r_t *c, IN const pad_r_t *a, IN const pad_r_t *b) +{ + bike_static_assert((R_PADDED_BYTES % 2 == 0), karatzuba_n_is_odd); + + // Initialize gf2x methods struct + gf2x_ctx ctx = {0}; + gf2x_ctx_init(&ctx); + + gf2x_mod_mul_with_ctx(c, a, b, &ctx); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx2.c new file mode 100644 index 0000000000..8f9c17dc09 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx2.c @@ -0,0 +1,109 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. 
+ */ + +#if defined(S2N_BIKE_R3_AVX2) + +#include <assert.h> + +#include "cleanup.h" +#include "gf2x_internal.h" + +#define AVX2_INTERNAL +#include "x86_64_intrinsic.h" + +void karatzuba_add1_avx2(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T va0, va1, vb0, vb1; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + va0 = LOAD(&a[i]); + va1 = LOAD(&a[i + qwords_len]); + vb0 = LOAD(&b[i]); + vb1 = LOAD(&b[i + qwords_len]); + + STORE(&alah[i], va0 ^ va1); + STORE(&blbh[i], vb0 ^ vb1); + } +} + +void karatzuba_add2_avx2(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T vx, vy; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + vx = LOAD(&x[i]); + vy = LOAD(&y[i]); + + STORE(&z[i], vx ^ vy); + } +} + +void karatzuba_add3_avx2(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T vr0, vr1, vr2, vr3, vt; + + uint64_t *c0 = c; + uint64_t *c1 = &c[qwords_len]; + uint64_t *c2 = &c[2 * qwords_len]; + uint64_t *c3 = &c[3 * qwords_len]; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + vr0 = LOAD(&c0[i]); + vr1 = LOAD(&c1[i]); + vr2 = LOAD(&c2[i]); + vr3 = LOAD(&c3[i]); + vt = LOAD(&mid[i]); + + STORE(&c1[i], vt ^ vr0 ^ vr1); + STORE(&c2[i], vt ^ vr2 ^ vr3); + } +} + +// c = a mod (x^r - 1) +void gf2x_red_avx2(OUT pad_r_t *c, IN const dbl_pad_r_t *a) +{ + const uint64_t *a64 = (const uint64_t *)a; + uint64_t * c64 = (uint64_t *)c; + + for(size_t i = 0; i < R_QWORDS; i += REG_QWORDS) { + REG_T vt0 = LOAD(&a64[i]); + REG_T vt1 = LOAD(&a64[i + R_QWORDS]); + REG_T vt2 = LOAD(&a64[i + R_QWORDS - 1]); + + vt1 = SLLI_I64(vt1, LAST_R_QWORD_TRAIL); + vt2 = SRLI_I64(vt2, LAST_R_QWORD_LEAD); + + vt0 ^= (vt1 | vt2); + + STORE(&c64[i], vt0); + } + + c64[R_QWORDS - 1] &= LAST_R_QWORD_MASK; + + // Clean the secrets from the upper part of c + secure_clean((uint8_t *)&c64[R_QWORDS], + (R_PADDED_QWORDS - R_QWORDS) * sizeof(uint64_t)); +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx512.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx512.c new file mode 100644 index 0000000000..78ce9683ad --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx512.c @@ -0,0 +1,109 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. 
+ */ + +#if defined(S2N_BIKE_R3_AVX512) + +#include <assert.h> + +#include "cleanup.h" +#include "gf2x_internal.h" + +#define AVX512_INTERNAL +#include "x86_64_intrinsic.h" + +void karatzuba_add1_avx512(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T va0, va1, vb0, vb1; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + va0 = LOAD(&a[i]); + va1 = LOAD(&a[i + qwords_len]); + vb0 = LOAD(&b[i]); + vb1 = LOAD(&b[i + qwords_len]); + + STORE(&alah[i], va0 ^ va1); + STORE(&blbh[i], vb0 ^ vb1); + } +} + +void karatzuba_add2_avx512(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T vx, vy; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + vx = LOAD(&x[i]); + vy = LOAD(&y[i]); + + STORE(&z[i], vx ^ vy); + } +} + +void karatzuba_add3_avx512(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T vr0, vr1, vr2, vr3, vt; + + uint64_t *c0 = c; + uint64_t *c1 = &c[qwords_len]; + uint64_t *c2 = &c[2 * qwords_len]; + uint64_t *c3 = &c[3 * qwords_len]; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + vr0 = LOAD(&c0[i]); + vr1 = LOAD(&c1[i]); + vr2 = LOAD(&c2[i]); + vr3 = LOAD(&c3[i]); + vt = LOAD(&mid[i]); + + STORE(&c1[i], vt ^ vr0 ^ vr1); + STORE(&c2[i], vt ^ vr2 ^ vr3); + } +} + +// c = a mod (x^r - 1) +void gf2x_red_avx512(OUT pad_r_t *c, IN const dbl_pad_r_t *a) +{ + const uint64_t *a64 = (const uint64_t *)a; + uint64_t * c64 = (uint64_t *)c; + + for(size_t i = 0; i < R_QWORDS; i += REG_QWORDS) { + REG_T vt0 = LOAD(&a64[i]); + REG_T vt1 = LOAD(&a64[i + R_QWORDS]); + REG_T vt2 = LOAD(&a64[i + R_QWORDS - 1]); + + vt1 = SLLI_I64(vt1, LAST_R_QWORD_TRAIL); + vt2 = SRLI_I64(vt2, LAST_R_QWORD_LEAD); + + vt0 ^= (vt1 | vt2); + + STORE(&c64[i], vt0); + } + + c64[R_QWORDS - 1] &= LAST_R_QWORD_MASK; + + // Clean the secrets from the upper part of c + secure_clean((uint8_t *)&c64[R_QWORDS], + (R_PADDED_QWORDS - R_QWORDS) * sizeof(uint64_t)); +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_pclmul.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_pclmul.c new file mode 100644 index 0000000000..1d4553997c --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_pclmul.c @@ -0,0 +1,155 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. 
+ */ + +#if defined(S2N_BIKE_R3_PCLMUL) + +#include <immintrin.h> + +#include "gf2x_internal.h" + +#define LOAD128(mem) _mm_loadu_si128((const void *)(mem)) +#define STORE128(mem, reg) _mm_storeu_si128((void *)(mem), (reg)) +#define UNPACKLO(x, y) _mm_unpacklo_epi64((x), (y)) +#define UNPACKHI(x, y) _mm_unpackhi_epi64((x), (y)) +#define CLMUL(x, y, imm) _mm_clmulepi64_si128((x), (y), (imm)) +#define BSRLI(x, imm) _mm_srli_si128((x), (imm)) +#define BSLLI(x, imm) _mm_slli_si128((x), (imm)) + +// 4x4 Karatsuba multiplication +_INLINE_ void gf2x_mul4_int(OUT __m128i c[4], + IN const __m128i a_lo, + IN const __m128i a_hi, + IN const __m128i b_lo, + IN const __m128i b_hi) +{ + // a_lo = [a1 | a0]; a_hi = [a3 | a2]; + // b_lo = [b1 | b0]; b_hi = [b3 | b2]; + // 4x4 Karatsuba requires three 2x2 multiplications: + // (1) a_lo * b_lo + // (2) a_hi * b_hi + // (3) aa * bb = (a_lo + a_hi) * (b_lo + b_hi) + // Each of the three 2x2 multiplications requires three 1x1 multiplications: + // (1) is computed by a0*b0, a1*b1, (a0+a1)*(b0+b1) + // (2) is computed by a2*b2, a3*b3, (a2+a3)*(b2+b3) + // (3) is computed by aa0*bb0, aa1*bb1, (aa0+aa1)*(bb0+bb1) + // All the required additions are performed in the end. + + __m128i aa, bb; + __m128i xx, yy, uu, vv, m; + __m128i lo[2], hi[2], mi[2]; + __m128i t[9]; + + aa = a_lo ^ a_hi; + bb = b_lo ^ b_hi; + + // xx <-- [(a2+a3) | (a0+a1)] + // yy <-- [(b2+b3) | (b0+b1)] + xx = UNPACKLO(a_lo, a_hi); + yy = UNPACKLO(b_lo, b_hi); + xx = xx ^ UNPACKHI(a_lo, a_hi); + yy = yy ^ UNPACKHI(b_lo, b_hi); + + // uu <-- [ 0 | (aa0+aa1)] + // vv <-- [ 0 | (bb0+bb1)] + uu = aa ^ BSRLI(aa, 8); + vv = bb ^ BSRLI(bb, 8); + + // 9 multiplications + t[0] = CLMUL(a_lo, b_lo, 0x00); + t[1] = CLMUL(a_lo, b_lo, 0x11); + t[2] = CLMUL(a_hi, b_hi, 0x00); + t[3] = CLMUL(a_hi, b_hi, 0x11); + t[4] = CLMUL(xx, yy, 0x00); + t[5] = CLMUL(xx, yy, 0x11); + t[6] = CLMUL(aa, bb, 0x00); + t[7] = CLMUL(aa, bb, 0x11); + t[8] = CLMUL(uu, vv, 0x00); + + t[4] ^= (t[0] ^ t[1]); + t[5] ^= (t[2] ^ t[3]); + t[8] ^= (t[6] ^ t[7]); + + lo[0] = t[0] ^ BSLLI(t[4], 8); + lo[1] = t[1] ^ BSRLI(t[4], 8); + hi[0] = t[2] ^ BSLLI(t[5], 8); + hi[1] = t[3] ^ BSRLI(t[5], 8); + mi[0] = t[6] ^ BSLLI(t[8], 8); + mi[1] = t[7] ^ BSRLI(t[8], 8); + + m = lo[1] ^ hi[0]; + + c[0] = lo[0]; + c[1] = lo[0] ^ mi[0] ^ m; + c[2] = hi[1] ^ mi[1] ^ m; + c[3] = hi[1]; +} + +// 512x512bit multiplication performed by Karatsuba algorithm +// where a and b are considered as having 8 digits of size 64 bits. 
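// [Editor's note: clarifying annotation, not part of the upstream sources.]
// Writing a = a_lo + a_hi*x^256 and b = b_lo + b_hi*x^256, the function below
// applies the one-level Karatsuba identity over GF(2)[x], where addition is XOR:
//   a*b = lo ^ (lo ^ hi ^ mi)*x^256 ^ hi*x^512,
// with lo = a_lo*b_lo, hi = a_hi*b_hi and mi = (a_lo^a_hi)*(b_lo^b_hi),
// so three 256x256-bit products replace the four of the schoolbook method.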
+void gf2x_mul_base_pclmul(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b) +{ + __m128i va[4], vb[4]; + __m128i aa[2], bb[2]; + __m128i lo[4], hi[4], mi[4], m[2]; + + for(size_t i = 0; i < 4; i++) { + va[i] = LOAD128(&a[QWORDS_IN_XMM * i]); + vb[i] = LOAD128(&b[QWORDS_IN_XMM * i]); + } + + // Multiply the low and the high halves of a and b + // lo <-- a_lo * b_lo + // hi <-- a_hi * b_hi + gf2x_mul4_int(lo, va[0], va[1], vb[0], vb[1]); + gf2x_mul4_int(hi, va[2], va[3], vb[2], vb[3]); + + // Compute the middle multiplication + // aa <-- a_lo + a_hi + // bb <-- b_lo + b_hi + // mi <-- aa * bb + aa[0] = va[0] ^ va[2]; + aa[1] = va[1] ^ va[3]; + bb[0] = vb[0] ^ vb[2]; + bb[1] = vb[1] ^ vb[3]; + gf2x_mul4_int(mi, aa[0], aa[1], bb[0], bb[1]); + + m[0] = lo[2] ^ hi[0]; + m[1] = lo[3] ^ hi[1]; + + STORE128(&c[0 * QWORDS_IN_XMM], lo[0]); + STORE128(&c[1 * QWORDS_IN_XMM], lo[1]); + STORE128(&c[2 * QWORDS_IN_XMM], mi[0] ^ lo[0] ^ m[0]); + STORE128(&c[3 * QWORDS_IN_XMM], mi[1] ^ lo[1] ^ m[1]); + STORE128(&c[4 * QWORDS_IN_XMM], mi[2] ^ hi[2] ^ m[0]); + STORE128(&c[5 * QWORDS_IN_XMM], mi[3] ^ hi[3] ^ m[1]); + STORE128(&c[6 * QWORDS_IN_XMM], hi[2]); + STORE128(&c[7 * QWORDS_IN_XMM], hi[3]); +} + +void gf2x_sqr_pclmul(OUT dbl_pad_r_t *c, IN const pad_r_t *a) +{ + __m128i va, vr0, vr1; + + const uint64_t *a64 = (const uint64_t *)a; + uint64_t * c64 = (uint64_t *)c; + + for(size_t i = 0; i < (R_XMM * QWORDS_IN_XMM); i += QWORDS_IN_XMM) { + va = LOAD128(&a64[i]); + + vr0 = CLMUL(va, va, 0x00); + vr1 = CLMUL(va, va, 0x11); + + STORE128(&c64[i * 2], vr0); + STORE128(&c64[i * 2 + QWORDS_IN_XMM], vr1); + } +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_portable.c new file mode 100644 index 0000000000..86c21a1e28 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_portable.c @@ -0,0 +1,77 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include "gf2x_internal.h" +#include "utilities.h" + +#define LSB3(x) ((x)&7) + +// 64x64 bit multiplication +// The algorithm is based on the windowing method, for example as in: +// Brent, R. P., Gaudry, P., Thomé, E., & Zimmermann, P. (2008, May), "Faster +// multiplication in GF (2)[x]". In: International Algorithmic Number Theory +// Symposium (pp. 153-166). Springer, Berlin, Heidelberg. In this implementation, +// the last three bits are multiplied using a schoolbook multiplication. +void gf2x_mul_base_port(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b) +{ + uint64_t h = 0, l = 0, g1, g2, u[8]; + const uint64_t w = 64; + const uint64_t s = 3; + const uint64_t a0 = a[0]; + const uint64_t b0 = b[0]; + + // Multiplying 64 bits by 7 can results in an overflow of 3 bits. + // Therefore, these bits are masked out, and are treated in step 3. + const uint64_t b0m = b0 & MASK(61); + + // Step 1: Calculate a multiplication table with 8 entries. 
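  // [Editor's note: illustrative, not part of the upstream sources.]
  // Each entry u[i] is the carry-less (GF(2)[x]) product of the 3-bit
  // polynomial i and b0m, e.g. u[3] = b0m ^ (b0m << 1) and
  // u[5] = b0m ^ (b0m << 2), so Step 2 can process three bits of a0 per
  // table lookup instead of one bit at a time.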
+ u[0] = 0; + u[1] = b0m; + u[2] = u[1] << 1; + u[3] = u[2] ^ b0m; + u[4] = u[2] << 1; + u[5] = u[4] ^ b0m; + u[6] = u[3] << 1; + u[7] = u[6] ^ b0m; + + // Step 2: Multiply two elements in parallel in positions i, i+s + l = u[LSB3(a0)] ^ (u[LSB3(a0 >> 3)] << 3); + h = (u[LSB3(a0 >> 3)] >> 61); + + for(size_t i = (2 * s); i < w; i += (2 * s)) { + const size_t i2 = (i + s); + + g1 = u[LSB3(a0 >> i)]; + g2 = u[LSB3(a0 >> i2)]; + + l ^= (g1 << i) ^ (g2 << i2); + h ^= (g1 >> (w - i)) ^ (g2 >> (w - i2)); + } + + // Step 3: Multiply the last three bits. + for(size_t i = 61; i < 64; i++) { + uint64_t mask = (-((b0 >> i) & 1)); + l ^= ((a0 << i) & mask); + h ^= ((a0 >> (w - i)) & mask); + } + + c[0] = l; + c[1] = h; +} + +// c = a^2 +void gf2x_sqr_port(OUT dbl_pad_r_t *c, IN const pad_r_t *a) +{ + const uint64_t *a64 = (const uint64_t *)a; + uint64_t * c64 = (uint64_t *)c; + + for(size_t i = 0; i < R_QWORDS; i++) { + gf2x_mul_base_port(&c64[2 * i], &a64[i], &a64[i]); + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_vpclmul.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_vpclmul.c new file mode 100644 index 0000000000..c321bf355f --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_vpclmul.c @@ -0,0 +1,135 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#if defined(S2N_BIKE_R3_VPCLMUL) + +#include "gf2x_internal.h" + +#define AVX512_INTERNAL +#include "x86_64_intrinsic.h" + +#define CLMUL(x, y, imm) _mm512_clmulepi64_epi128((x), (y), (imm)) + +_INLINE_ void +mul2_512(OUT __m512i *h, OUT __m512i *l, IN const __m512i a, IN const __m512i b) +{ + const __m512i mask_abq = SET_I64(6, 7, 4, 5, 2, 3, 0, 1); + const __m512i s1 = a ^ PERMX_I64(a, _MM_SHUFFLE(2, 3, 0, 1)); + const __m512i s2 = b ^ PERMX_I64(b, _MM_SHUFFLE(2, 3, 0, 1)); + + __m512i lq = CLMUL(a, b, 0x00); + __m512i hq = CLMUL(a, b, 0x11); + __m512i abq = lq ^ hq ^ CLMUL(s1, s2, 0x00); + abq = PERMXVAR_I64(mask_abq, abq); + *l = MXOR_I64(lq, 0xaa, lq, abq); + *h = MXOR_I64(hq, 0x55, hq, abq); +} + +// 8x8 Karatsuba multiplication +_INLINE_ void gf2x_mul8_512_int(OUT __m512i *zh, + OUT __m512i * zl, + IN const __m512i a, + IN const __m512i b) +{ + const __m512i mask0 = SET_I64(13, 12, 5, 4, 9, 8, 1, 0); + const __m512i mask1 = SET_I64(15, 14, 7, 6, 11, 10, 3, 2); + const __m512i mask2 = SET_I64(3, 2, 1, 0, 7, 6, 5, 4); + const __m512i mask3 = SET_I64(11, 10, 9, 8, 3, 2, 1, 0); + const __m512i mask4 = SET_I64(15, 14, 13, 12, 7, 6, 5, 4); + const __m512i mask_s1 = SET_I64(7, 6, 5, 4, 1, 0, 3, 2); + const __m512i mask_s2 = SET_I64(3, 2, 7, 6, 5, 4, 1, 0); + + __m512i xl, xh, xabl, xabh, xab, xab1, xab2; + __m512i yl, yh, yabl, yabh, yab; + __m512i t[4]; + + // Calculate: + // AX1^AX3|| AX2^AX3 || AX0^AX2 || AX0^AX1 + // BX1^BX3|| BX2^BX3 || BX0^BX2 || BX0^BX1 + // Where (AX1^AX3 || AX0^AX2) stands for (AX1 || AX0)^(AX3 || AX2) = AY0^AY1 + t[0] = PERMXVAR_I64(mask_s1, a) ^ PERMXVAR_I64(mask_s2, a); + t[1] = PERMXVAR_I64(mask_s1, b) ^ PERMXVAR_I64(mask_s2, b); + + // Calculate: + // Don't care || AX1^AX3^AX0^AX2 + // Don't care || BX1^BX3^BX0^BX2 + t[2] = t[0] ^ VALIGN(t[0], t[0], 4); + t[3] = t[1] ^ VALIGN(t[1], t[1], 4); + + mul2_512(&xh, &xl, a, b); + mul2_512(&xabh, &xabl, t[0], t[1]); + mul2_512(&yabh, &yabl, t[2], t[3]); + + xab = xl ^ xh ^ PERMX2VAR_I64(xabl, mask0, xabh); + yl = PERMX2VAR_I64(xl, 
mask3, xh); + yh = PERMX2VAR_I64(xl, mask4, xh); + xab1 = VALIGN(xab, xab, 6); + xab2 = VALIGN(xab, xab, 2); + yl = MXOR_I64(yl, 0x3c, yl, xab1); + yh = MXOR_I64(yh, 0x3c, yh, xab2); + + __m512i oxh = PERMX2VAR_I64(xabl, mask1, xabh); + __m512i oxl = VALIGN(oxh, oxh, 4); + yab = oxl ^ oxh ^ PERMX2VAR_I64(yabl, mask0, yabh); + yab = MXOR_I64(oxh, 0x3c, oxh, VALIGN(yab, yab, 2)); + yab ^= yl ^ yh; + + // Z0 (yl) + Z1 (yab) + Z2 (yh) + yab = PERMXVAR_I64(mask2, yab); + *zl = MXOR_I64(yl, 0xf0, yl, yab); + *zh = MXOR_I64(yh, 0x0f, yh, yab); +} + +// 1024x1024 bit multiplication performed by Karatsuba algorithm. +// Here, a and b are considered as having 16 digits of size 64 bits. +void gf2x_mul_base_vpclmul(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b) +{ + const __m512i a0 = LOAD(a); + const __m512i a1 = LOAD(&a[QWORDS_IN_ZMM]); + const __m512i b0 = LOAD(b); + const __m512i b1 = LOAD(&b[QWORDS_IN_ZMM]); + + __m512i hi[2], lo[2], mi[2]; + + gf2x_mul8_512_int(&lo[1], &lo[0], a0, b0); + gf2x_mul8_512_int(&hi[1], &hi[0], a1, b1); + gf2x_mul8_512_int(&mi[1], &mi[0], a0 ^ a1, b0 ^ b1); + + __m512i m = lo[1] ^ hi[0]; + + STORE(&c[0 * QWORDS_IN_ZMM], lo[0]); + STORE(&c[1 * QWORDS_IN_ZMM], mi[0] ^ lo[0] ^ m); + STORE(&c[2 * QWORDS_IN_ZMM], mi[1] ^ hi[1] ^ m); + STORE(&c[3 * QWORDS_IN_ZMM], hi[1]); +} + +void gf2x_sqr_vpclmul(OUT dbl_pad_r_t *c, IN const pad_r_t *a) +{ + __m512i va, vm, vr0, vr1; + + const uint64_t *a64 = (const uint64_t *)a; + uint64_t * c64 = (uint64_t *)c; + + vm = SET_I64(7, 3, 6, 2, 5, 1, 4, 0); + + for(size_t i = 0; i < (R_ZMM * QWORDS_IN_ZMM); i += QWORDS_IN_ZMM) { + va = LOAD(&a64[i]); + va = PERMXVAR_I64(vm, va); + + vr0 = CLMUL(va, va, 0x00); + vr1 = CLMUL(va, va, 0x11); + + STORE(&c64[i * 2], vr0); + STORE(&c64[i * 2 + QWORDS_IN_ZMM], vr1); + } +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_portable.c new file mode 100644 index 0000000000..187042d44c --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_portable.c @@ -0,0 +1,103 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. 
+ */ + +#include <assert.h> + +#include "cleanup.h" +#include "gf2x_internal.h" + +#define PORTABLE_INTERNAL +#include "x86_64_intrinsic.h" + +void karatzuba_add1_port(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T va0, va1, vb0, vb1; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + va0 = LOAD(&a[i]); + va1 = LOAD(&a[i + qwords_len]); + vb0 = LOAD(&b[i]); + vb1 = LOAD(&b[i + qwords_len]); + + STORE(&alah[i], va0 ^ va1); + STORE(&blbh[i], vb0 ^ vb1); + } +} + +void karatzuba_add2_port(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T vx, vy; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + vx = LOAD(&x[i]); + vy = LOAD(&y[i]); + + STORE(&z[i], vx ^ vy); + } +} + +void karatzuba_add3_port(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T vr0, vr1, vr2, vr3, vt; + + uint64_t *c0 = c; + uint64_t *c1 = &c[qwords_len]; + uint64_t *c2 = &c[2 * qwords_len]; + uint64_t *c3 = &c[3 * qwords_len]; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + vr0 = LOAD(&c0[i]); + vr1 = LOAD(&c1[i]); + vr2 = LOAD(&c2[i]); + vr3 = LOAD(&c3[i]); + vt = LOAD(&mid[i]); + + STORE(&c1[i], vt ^ vr0 ^ vr1); + STORE(&c2[i], vt ^ vr2 ^ vr3); + } +} + +// c = a mod (x^r - 1) +void gf2x_red_port(OUT pad_r_t *c, IN const dbl_pad_r_t *a) +{ + const uint64_t *a64 = (const uint64_t *)a; + uint64_t * c64 = (uint64_t *)c; + + for(size_t i = 0; i < R_QWORDS; i += REG_QWORDS) { + REG_T vt0 = LOAD(&a64[i]); + REG_T vt1 = LOAD(&a64[i + R_QWORDS]); + REG_T vt2 = LOAD(&a64[i + R_QWORDS - 1]); + + vt1 = SLLI_I64(vt1, LAST_R_QWORD_TRAIL); + vt2 = SRLI_I64(vt2, LAST_R_QWORD_LEAD); + + vt0 ^= (vt1 | vt2); + + STORE(&c64[i], vt0); + } + + c64[R_QWORDS - 1] &= LAST_R_QWORD_MASK; + + // Clean the secrets from the upper part of c + secure_clean((uint8_t *)&c64[R_QWORDS], + (R_PADDED_QWORDS - R_QWORDS) * sizeof(uint64_t)); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.c new file mode 100644 index 0000000000..a76a31ef87 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.c @@ -0,0 +1,170 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include <assert.h> + +#include "sampling.h" +#include "sampling_internal.h" + +// SIMD implementation of is_new function requires the size of wlist +// to be a multiple of the number of DWORDS in a SIMD register (REG_DWORDS). +// The function is used both for generating DV and T1 random numbers so we define +// two separate macros. 
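// [Editor's note: worked example with an assumed parameter value, not part of
// the upstream sources.] With AVX512_REG_DWORDS = 16, a weight of, say,
// DV = 71 would be padded to WLIST_SIZE_ADJUSTED_D = 16 * ceil(71/16) = 80,
// so the temporary wlist buffers can always be scanned in whole SIMD registers.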
+#define AVX512_REG_DWORDS (16) +#define WLIST_SIZE_ADJUSTED_D \ + (AVX512_REG_DWORDS * DIVIDE_AND_CEIL(DV, AVX512_REG_DWORDS)) +#define WLIST_SIZE_ADJUSTED_T \ + (AVX512_REG_DWORDS * DIVIDE_AND_CEIL(T1, AVX512_REG_DWORDS)) + +// BSR returns ceil(log2(val)) +_INLINE_ uint8_t bit_scan_reverse_vartime(IN uint64_t val) +{ + // index is always smaller than 64 + uint8_t index = 0; + + while(val != 0) { + val >>= 1; + index++; + } + + return index; +} + +_INLINE_ ret_t get_rand_mod_len(OUT uint32_t * rand_pos, + IN const uint32_t len, + IN OUT aes_ctr_prf_state_t *prf_state) +{ + const uint64_t mask = MASK(bit_scan_reverse_vartime(len)); + + do { + // Generate a 32 bits (pseudo) random value. + // This can be optimized to take only 16 bits. + POSIX_GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos))); + + // Mask relevant bits only + (*rand_pos) &= mask; + + // Break if a number that is smaller than len is found + if((*rand_pos) < len) { + break; + } + + } while(1 == 1); + + return SUCCESS; +} + +_INLINE_ void make_odd_weight(IN OUT r_t *r) +{ + if(((r_bits_vector_weight(r) % 2) == 1)) { + // Already odd + return; + } + + r->raw[0] ^= 1; +} + +// Returns an array of r pseudorandom bits. +// No restrictions exist for the top or bottom bits. +// If the generation requires an odd number, then set must_be_odd=1. +// The function uses the provided prf context. +ret_t sample_uniform_r_bits_with_fixed_prf_context( + OUT r_t *r, + IN OUT aes_ctr_prf_state_t *prf_state, + IN const must_be_odd_t must_be_odd) +{ + // Generate random data + POSIX_GUARD(aes_ctr_prf(r->raw, prf_state, R_BYTES)); + + // Mask upper bits of the MSByte + r->raw[R_BYTES - 1] &= MASK(R_BITS + 8 - (R_BYTES * 8)); + + if(must_be_odd == MUST_BE_ODD) { + make_odd_weight(r); + } + + return SUCCESS; +} + +ret_t generate_indices_mod_z(OUT idx_t * out, + IN const size_t num_indices, + IN const size_t z, + IN OUT aes_ctr_prf_state_t *prf_state, + IN const sampling_ctx *ctx) +{ + size_t ctr = 0; + + // Generate num_indices unique (pseudo) random numbers modulo z + do { + POSIX_GUARD(get_rand_mod_len(&out[ctr], z, prf_state)); + ctr += ctx->is_new(out, ctr); + } while(ctr < num_indices); + + return SUCCESS; +} + +// Returns an array of r pseudorandom bits. +// No restrictions exist for the top or bottom bits. 
+// If the generation requires an odd number, then set must_be_odd = MUST_BE_ODD +ret_t sample_uniform_r_bits(OUT r_t *r, + IN const seed_t * seed, + IN const must_be_odd_t must_be_odd) +{ + // For the seedexpander + DEFER_CLEANUP(aes_ctr_prf_state_t prf_state = {0}, aes_ctr_prf_state_cleanup); + + POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed)); + + POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd)); + + return SUCCESS; +} + +ret_t generate_sparse_rep(OUT pad_r_t *r, + OUT idx_t *wlist, + IN OUT aes_ctr_prf_state_t *prf_state) +{ + + // Initialize the sampling context + sampling_ctx ctx; + sampling_ctx_init(&ctx); + + idx_t wlist_temp[WLIST_SIZE_ADJUSTED_D] = {0}; + + POSIX_GUARD(generate_indices_mod_z(wlist_temp, DV, R_BITS, prf_state, &ctx)); + + bike_memcpy(wlist, wlist_temp, DV * sizeof(idx_t)); + ctx.secure_set_bits(r, 0, wlist, DV); + + return SUCCESS; +} + +ret_t generate_error_vector(OUT pad_e_t *e, IN const seed_t *seed) +{ + DEFER_CLEANUP(aes_ctr_prf_state_t prf_state = {0}, aes_ctr_prf_state_cleanup); + + POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed)); + + // Initialize the sampling context + sampling_ctx ctx; + sampling_ctx_init(&ctx); + + idx_t wlist[WLIST_SIZE_ADJUSTED_T] = {0}; + POSIX_GUARD(generate_indices_mod_z(wlist, T1, N_BITS, &prf_state, &ctx)); + + // (e0, e1) hold bits 0..R_BITS-1 and R_BITS..2*R_BITS-1 of the error, resp. + ctx.secure_set_bits(&e->val[0], 0, wlist, T1); + ctx.secure_set_bits(&e->val[1], R_BITS, wlist, T1); + + // Clean the padding of the elements + PE0_RAW(e)[R_BYTES - 1] &= LAST_R_BYTE_MASK; + PE1_RAW(e)[R_BYTES - 1] &= LAST_R_BYTE_MASK; + bike_memset(&PE0_RAW(e)[R_BYTES], 0, R_PADDED_BYTES - R_BYTES); + bike_memset(&PE1_RAW(e)[R_BYTES], 0, R_PADDED_BYTES - R_BYTES); + + return SUCCESS; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.h new file mode 100644 index 0000000000..a9d50c9bc2 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.h @@ -0,0 +1,40 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include <stdlib.h> +#include "aes_ctr_prf.h" +#include "pq-crypto/s2n_pq_random.h" +#include "utils/s2n_result.h" +#include "utilities.h" + +typedef enum +{ + NO_RESTRICTION = 0, + MUST_BE_ODD = 1 +} must_be_odd_t; + +_INLINE_ ret_t get_seeds(OUT seeds_t *seeds) { + if(s2n_result_is_ok(s2n_get_random_bytes(seeds->seed[0].raw, sizeof(seeds_t)))) { + return SUCCESS; + } else { + BIKE_ERROR(E_FAIL_TO_GET_SEED); + } +} + +// Returns an array of r pseudorandom bits. If an odd +// weight of r is required, set must_be_odd to MUST_BE_ODD. +ret_t sample_uniform_r_bits(OUT r_t *r, + IN const seed_t *seed, + IN must_be_odd_t must_be_odd); + +ret_t generate_sparse_rep(OUT pad_r_t *r, + OUT idx_t *wlist, + IN OUT aes_ctr_prf_state_t *prf_state); + +ret_t generate_error_vector(OUT pad_e_t *e, IN const seed_t *seed); diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx2.c new file mode 100644 index 0000000000..c23be2e86e --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx2.c @@ -0,0 +1,123 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#if defined(S2N_BIKE_R3_AVX2) + +#include <assert.h> + +#include "sampling_internal.h" + +#define AVX2_INTERNAL +#include "x86_64_intrinsic.h" + +// For improved performance, we process NUM_YMMS amount of data in parallel. +#define NUM_YMMS (4) +#define YMMS_QWORDS (QWORDS_IN_YMM * NUM_YMMS) + +void secure_set_bits_avx2(OUT pad_r_t * r, + IN const size_t first_pos, + IN const idx_t *wlist, + IN const size_t w_size) +{ + // The function assumes that the size of r is a multiple + // of the cumulative size of used YMM registers. + assert((sizeof(*r) / sizeof(uint64_t)) % YMMS_QWORDS == 0); + + // va vectors hold the bits of the output array "r" + // va_pos_qw vectors hold the qw position indices of "r" + // The algorithm works as follows: + // 1. Initialize va_pos_qw with starting positions of qw's of "r" + // va_pos_qw = (3, 2, 1, 0); + // 2. While the size of "r" is not exceeded: + // 3. For each w in wlist: + // 4. Compare the pos_qw of w with positions in va_pos_qw + // and for the position which is equal set the appropriate + // bit in va vector. + // 5. Set va_pos_qw to the next qw positions of "r" + __m256i va[NUM_YMMS], va_pos_qw[NUM_YMMS], va_mask; + __m256i w_pos_qw, w_pos_bit; + __m256i one, inc; + + uint64_t *r64 = (uint64_t *)r; + + one = SET1_I64(1); + inc = SET1_I64(QWORDS_IN_YMM); + + // 1. Initialize + va_pos_qw[0] = SET_I64(3, 2, 1, 0); + for(size_t i = 1; i < NUM_YMMS; i++) { + va_pos_qw[i] = ADD_I64(va_pos_qw[i - 1], inc); + } + + // va_pos_qw vectors hold qw positions 0 .. (NUM_YMMS * QWORDS_IN_YMM - 1) + // Therefore, we set the increment vector inc such that by adding it to + // va_pos_qw vectors, they hold the next YMM_QWORDS qw positions. + inc = SET1_I64(YMMS_QWORDS); + + for(size_t i = 0; i < (sizeof(*r) / sizeof(uint64_t)); i += YMMS_QWORDS) { + for(size_t va_iter = 0; va_iter < NUM_YMMS; va_iter++) { + va[va_iter] = SET_ZERO; + } + + for(size_t w_iter = 0; w_iter < w_size; w_iter++) { + int32_t w = wlist[w_iter] - first_pos; + w_pos_qw = SET1_I64(w >> 6); + w_pos_bit = SLLI_I64(one, w & MASK(6)); + + // 4. Compare the positions in va_pos_qw with w_pos_qw + // and set the appropriate bit in va + for(size_t va_iter = 0; va_iter < NUM_YMMS; va_iter++) { + va_mask = CMPEQ_I64(va_pos_qw[va_iter], w_pos_qw); + va[va_iter] |= (va_mask & w_pos_bit); + } + } + + // 5. Set the va_pos_qw to the next qw positions of r + // and store the previously computed data in r + for(size_t va_iter = 0; va_iter < NUM_YMMS; va_iter++) { + STORE(&r64[i + (va_iter * QWORDS_IN_YMM)], va[va_iter]); + va_pos_qw[va_iter] = ADD_I64(va_pos_qw[va_iter], inc); + } + } +} + +int is_new_avx2(IN const idx_t *wlist, IN const size_t ctr) +{ + bike_static_assert((sizeof(idx_t) == sizeof(uint32_t)), idx_t_is_not_uint32_t); + + REG_T idx_ctr = SET1_I32(wlist[ctr]); + + for(size_t i = 0; i < ctr; i += REG_DWORDS) { + // Comparisons are done with SIMD instructions with each SIMD register + // containing REG_DWORDS elements. We compare registers element-wise: + // idx_ctr = {8 repetitions of wlist[ctr]}, with + // idx_cur = {8 consecutive elements from wlist}. + // In the last iteration we consider wlist elements only up to ctr. + + REG_T idx_cur = LOAD(&wlist[i]); + REG_T cmp_res = CMPEQ_I32(idx_ctr, idx_cur); + uint32_t check = MOVEMASK(cmp_res); + + // Handle the last iteration by appropriate masking. 
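    // [Editor's note: worked example, not part of the upstream sources.]
    // E.g. with REG_DWORDS = 8, if only ctr - i = 3 candidates remain, the
    // byte-granular MOVEMASK result is masked with
    // MASK(3 * sizeof(idx_t)) = MASK(12) = 0xfff, keeping exactly the bytes
    // that belong to the 3 valid 32-bit comparisons.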
+ if(ctr < (i + REG_DWORDS)) { + // MOVEMASK instruction in AVX2 compares corresponding bytes from + // two given vector registers and produces a 32-bit mask. On the other hand, + // we compare idx_t elements, not bytes, so we multiply by sizeof(idx_t). + check &= MASK((ctr - i) * sizeof(idx_t)); + } + + if(check != 0) { + return 0; + } + } + + return 1; +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx512.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx512.c new file mode 100644 index 0000000000..6cab4cffea --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx512.c @@ -0,0 +1,123 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#if defined(S2N_BIKE_R3_AVX512) + +#include <assert.h> + +#include "sampling_internal.h" + +#define AVX512_INTERNAL +#include "x86_64_intrinsic.h" + +// For improved performance, we process NUM_ZMMS amount of data in parallel. +#define NUM_ZMMS (8) +#define ZMMS_QWORDS (QWORDS_IN_ZMM * NUM_ZMMS) + +void secure_set_bits_avx512(OUT pad_r_t * r, + IN const size_t first_pos, + IN const idx_t *wlist, + IN const size_t w_size) +{ + // The function assumes that the size of r is a multiple + // of the cumulative size of used ZMM registers. + assert((sizeof(*r) / sizeof(uint64_t)) % ZMMS_QWORDS == 0); + + // va vectors hold the bits of the output array "r" + // va_pos_qw vectors hold the qw position indices of "r" + // The algorithm works as follows: + // 1. Initialize va_pos_qw with starting positions of qw's of "r" + // va_pos_qw = (7, 6, 5, 4, 3, 2, 1, 0); + // 2. While the size of "r" is not exceeded: + // 3. For each w in wlist: + // 4. Compare the pos_qw of w with positions in va_pos_qw + // and for the position which is equal set the appropriate + // bit in va vector. + // 5. Set va_pos_qw to the next qw positions of "r" + __m512i va[NUM_ZMMS], va_pos_qw[NUM_ZMMS]; + __m512i w_pos_qw, w_pos_bit, one, inc; + __mmask8 va_mask; + + uint64_t *r64 = (uint64_t *)r; + + one = SET1_I64(1); + inc = SET1_I64(QWORDS_IN_ZMM); + + // 1. Initialize + va_pos_qw[0] = SET_I64(7, 6, 5, 4, 3, 2, 1, 0); + for(size_t i = 1; i < NUM_ZMMS; i++) { + va_pos_qw[i] = ADD_I64(va_pos_qw[i - 1], inc); + } + + // va_pos_qw vectors hold qw positions 0 .. (NUM_ZMMS * QWORDS_IN_ZMM - 1) + // Therefore, we set the increment vector inc such that by adding it to + // va_pos_qw vectors they hold the next ZMMS_QWORDS qw positions. + inc = SET1_I64(ZMMS_QWORDS); + + for(size_t i = 0; i < (sizeof(*r) / sizeof(uint64_t)); i += ZMMS_QWORDS) { + for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) { + va[va_iter] = SET_ZERO; + } + + for(size_t w_iter = 0; w_iter < w_size; w_iter++) { + int32_t w = wlist[w_iter] - first_pos; + w_pos_qw = SET1_I64(w >> 6); +#if (defined(__GNUC__) && ((__GNUC__ == 6) || (__GNUC__ == 5)) && !defined(__clang__)) || (defined(__clang__) && __clang_major__ == 3 && __clang_minor__ == 9) + // Workaround for gcc-6, gcc-5, and clang < 3.9, which do not allowing the second + // argument of SLLI to be non-immediate value. + __m512i temp = SET1_I64(w & MASK(6)); + w_pos_bit = SLLV_I64(one, temp); +#else + w_pos_bit = SLLI_I64(one, w & MASK(6)); +#endif + + // 4. 
Compare the positions in va_pos_qw with w_pos_qw + // and set the appropriate bit in va + for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) { + va_mask = CMPMEQ_I64(va_pos_qw[va_iter], w_pos_qw); + va[va_iter] = MOR_I64(va[va_iter], va_mask, va[va_iter], w_pos_bit); + } + } + + // 5. Set the va_pos_qw to the next qw positions of r + // and store the previously computed data in r + for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) { + STORE(&r64[i + (va_iter * QWORDS_IN_ZMM)], va[va_iter]); + va_pos_qw[va_iter] = ADD_I64(va_pos_qw[va_iter], inc); + } + } +} + +int is_new_avx512(IN const idx_t *wlist, IN const size_t ctr) +{ + bike_static_assert((sizeof(idx_t) == sizeof(uint32_t)), idx_t_is_not_uint32_t); + + REG_T idx_ctr = SET1_I32(wlist[ctr]); + + for(size_t i = 0; i < ctr; i += REG_DWORDS) { + // Comparisons are done with SIMD instructions with each SIMD register + // containing REG_DWORDS elements. We compare registers element-wise: + // idx_ctr = {8 repetitions of wlist[ctr]}, with + // idx_cur = {8 consecutive elements from wlist}. + // In the last iteration we consider wlist elements only up to ctr. + + REG_T idx_cur = LOAD(&wlist[i]); + + uint16_t mask = (ctr < (i + REG_DWORDS)) ? MASK(ctr - i) : 0xffff; + uint16_t check = MCMPMEQ_I32(mask, idx_ctr, idx_cur); + + if(check != 0) { + return 0; + } + } + + return 1; +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_internal.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_internal.h new file mode 100644 index 0000000000..3fd68354f2 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_internal.h @@ -0,0 +1,66 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "pq-crypto/s2n_pq.h" +#include "defs.h" +#include "types.h" + +void secure_set_bits_port(OUT pad_r_t *r, + IN size_t first_pos, + IN const idx_t *wlist, + IN size_t w_size); + +// Compares wlist[ctr] to w[i] for all i < ctr. +// Returns 0 if wlist[ctr] is contained in wlist, returns 1 otherwise. 
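// [Editor's note: illustrative example, not part of the upstream sources.]
// E.g. for wlist = {5, 9, 5} and ctr = 2, is_new returns 0 because wlist[2]
// repeats an earlier index, so generate_indices_mod_z keeps ctr unchanged and
// overwrites the slot with a fresh draw; for wlist = {5, 9, 7} it returns 1
// and the counter advances.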
+int is_new_port(IN const idx_t *wlist, IN const size_t ctr); + +#if defined(S2N_BIKE_R3_AVX2) +void secure_set_bits_avx2(OUT pad_r_t *r, + IN size_t first_pos, + IN const idx_t *wlist, + IN size_t w_size); + +int is_new_avx2(IN const idx_t *wlist, IN const size_t ctr); +#endif + +#if defined(S2N_BIKE_R3_AVX512) +void secure_set_bits_avx512(OUT pad_r_t *r, + IN size_t first_pos, + IN const idx_t *wlist, + IN size_t w_size); +int is_new_avx512(IN const idx_t *wlist, IN const size_t ctr); +#endif + +typedef struct sampling_ctx_st { + void (*secure_set_bits)(OUT pad_r_t *r, + IN size_t first_pos, + IN const idx_t *wlist, + IN size_t w_size); + int (*is_new)(IN const idx_t *wlist, IN const size_t ctr); +} sampling_ctx; + +_INLINE_ void sampling_ctx_init(sampling_ctx *ctx) +{ +#if defined(S2N_BIKE_R3_AVX512) + if(s2n_bike_r3_is_avx512_enabled()) { + ctx->secure_set_bits = secure_set_bits_avx512; + ctx->is_new = is_new_avx512; + } else +#endif +#if defined(S2N_BIKE_R3_AVX2) + if(s2n_bike_r3_is_avx2_enabled()) { + ctx->secure_set_bits = secure_set_bits_avx2; + ctx->is_new = is_new_avx2; + } else +#endif + { + ctx->secure_set_bits = secure_set_bits_port; + ctx->is_new = is_new_port; + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_portable.c new file mode 100644 index 0000000000..b670730f0a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_portable.c @@ -0,0 +1,60 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include <assert.h> + +#include "sampling_internal.h" +#include "utilities.h" + +#define MAX_WLIST_SIZE (T1 > DV ? T1 : DV) + +void secure_set_bits_port(OUT pad_r_t * r, + IN const size_t first_pos, + IN const idx_t *wlist, + IN const size_t w_size) +{ + assert(w_size <= MAX_WLIST_SIZE); + + // Ideally we would like to cast r.val but it is not guaranteed to be aligned + // as the entire pad_r_t structure. Thus, we assert that the position of val + // is at the beginning of r. + bike_static_assert(offsetof(pad_r_t, val) == 0, val_wrong_pos_in_pad_r_t); + uint64_t *a64 = (uint64_t *)r; + uint64_t val, mask; + + // The size of wlist can be either DV or T. So, we set it to max(D, T) + size_t pos_qw[MAX_WLIST_SIZE]; + size_t pos_bit[MAX_WLIST_SIZE]; + + // Identify the QW position of every value, and the bit position inside this QW. + for(size_t i = 0; i < w_size; i++) { + int32_t w = wlist[i] - first_pos; + pos_qw[i] = w >> 6; + pos_bit[i] = BIT(w & MASK(6)); + } + + // Fill each QW in constant time + for(size_t i = 0; i < (sizeof(*r) / sizeof(uint64_t)); i++) { + val = 0; + for(size_t j = 0; j < w_size; j++) { + mask = (-1ULL) + (!secure_cmp32(pos_qw[j], i)); + val |= (pos_bit[j] & mask); + } + a64[i] = val; + } +} + +int is_new_port(IN const idx_t *wlist, IN const size_t ctr) +{ + for(size_t i = 0; i < ctr; i++) { + if(wlist[i] == wlist[ctr]) { + return 0; + } + } + + return 1; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sha.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sha.h new file mode 100644 index 0000000000..1857d6e638 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sha.h @@ -0,0 +1,43 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "cleanup.h" +#include "error.h" +#include "types.h" +#include "utilities.h" + +#include <openssl/sha.h> + +#define SHA384_DGST_BYTES 48ULL +#define SHA384_DGST_QWORDS (SHA384_DGST_BYTES / 8) + +#define SHA512_DGST_BYTES 64ULL +#define SHA512_DGST_QWORDS (SHA512_DGST_BYTES / 8) + +typedef struct sha384_dgst_s { + union { + uint8_t raw[SHA384_DGST_BYTES]; + uint64_t qw[SHA384_DGST_QWORDS]; + } u; +} sha384_dgst_t; +bike_static_assert(sizeof(sha384_dgst_t) == SHA384_DGST_BYTES, sha384_dgst_size); + +typedef sha384_dgst_t sha_dgst_t; +CLEANUP_FUNC(sha_dgst, sha_dgst_t) + +_INLINE_ ret_t sha(OUT sha_dgst_t * dgst, + IN const uint32_t byte_len, + IN const uint8_t *msg) +{ + if(SHA384(msg, byte_len, dgst->u.raw) != NULL) { + return SUCCESS; + } + + return FAIL; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/types.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/types.h new file mode 100644 index 0000000000..436a584f3e --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/types.h @@ -0,0 +1,120 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include <stddef.h> +#include <stdint.h> + +#include "bike_defs.h" +#include "error.h" + +typedef struct uint128_s { + union { + uint8_t bytes[16]; // NOLINT + uint32_t dw[4]; // NOLINT + uint64_t qw[2]; // NOLINT + } u; +} uint128_t; + +// Make sure no compiler optimizations. +#pragma pack(push, 1) + +typedef struct seed_s { + uint8_t raw[SEED_BYTES]; +} seed_t; + +typedef struct seeds_s { + seed_t seed[NUM_OF_SEEDS]; +} seeds_t; + +typedef struct r_s { + uint8_t raw[R_BYTES]; +} r_t; + +typedef struct m_s { + uint8_t raw[M_BYTES]; +} m_t; + +typedef struct e_s { + r_t val[N0]; +} e_t; + +#define E0_RAW(e) ((e)->val[0].raw) +#define E1_RAW(e) ((e)->val[1].raw) + +typedef struct ct_s { + r_t c0; + m_t c1; +} ct_t; + +typedef r_t pk_t; + +typedef struct ss_st { + uint8_t raw[SS_BYTES]; +} ss_t; + +typedef uint32_t idx_t; + +typedef struct compressed_idx_d_s { + idx_t val[DV]; +} compressed_idx_d_t; + +typedef compressed_idx_d_t compressed_idx_d_ar_t[N0]; + +// The secret key holds both representations, to avoid +// the compression in Decaps. 
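// [Editor's note: illustrative example, not part of the upstream sources.]
// The two representations of the same sparse polynomial are the index list of
// its non-zero coefficients (wlist) and the dense bit-array (bin). E.g. a toy
// polynomial x + x^5 would be stored as wlist = {1, 5} and a first bin byte
// of 0x22 (bits 1 and 5 set, LSB-first).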
+typedef struct sk_s { + compressed_idx_d_ar_t wlist; + r_t bin[N0]; + pk_t pk; + m_t sigma; +} sk_t; + +typedef ALIGN(sizeof(idx_t)) sk_t aligned_sk_t; + +// Pad r to the next Block +typedef struct pad_r_s { + r_t val; + uint8_t pad[R_PADDED_BYTES - sizeof(r_t)]; +} ALIGN(ALIGN_BYTES) pad_r_t; + +// Double padded r, required for multiplication and squaring +typedef struct dbl_pad_r_s { + uint8_t raw[2 * R_PADDED_BYTES]; +} ALIGN(ALIGN_BYTES) dbl_pad_r_t; + +typedef struct pad_e_s { + pad_r_t val[N0]; +} ALIGN(ALIGN_BYTES) pad_e_t; + +#define PE0_RAW(e) ((e)->val[0].val.raw) +#define PE1_RAW(e) ((e)->val[1].val.raw) + +typedef struct func_k_s { + m_t m; + r_t c0; + m_t c1; +} func_k_t; + +// For a faster rotate we triplicate the syndrome (into 3 copies) +typedef struct syndrome_s { + uint64_t qw[3 * R_QWORDS]; +} ALIGN(ALIGN_BYTES) syndrome_t; + +typedef struct upc_slice_s { + union { + pad_r_t r; + uint64_t qw[sizeof(pad_r_t) / sizeof(uint64_t)]; + } ALIGN(ALIGN_BYTES) u; +} ALIGN(ALIGN_BYTES) upc_slice_t; + +typedef struct upc_s { + upc_slice_t slice[SLICES]; +} upc_t; + +#pragma pack(pop) diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.c new file mode 100644 index 0000000000..0c6ad3ea0f --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.c @@ -0,0 +1,24 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include <inttypes.h> + +#include "utilities.h" + +#define BITS_IN_QWORD 64ULL +#define BITS_IN_BYTE 8ULL + +uint64_t r_bits_vector_weight(IN const r_t *in) +{ + uint64_t acc = 0; + for(size_t i = 0; i < (R_BYTES - 1); i++) { + acc += __builtin_popcount(in->raw[i]); + } + + acc += __builtin_popcount(in->raw[R_BYTES - 1] & LAST_R_BYTE_MASK); + return acc; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.h new file mode 100644 index 0000000000..f544990a1a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.h @@ -0,0 +1,139 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +// For memset +#include <string.h> + +#include "types.h" + +uint64_t r_bits_vector_weight(IN const r_t *in); + +// "VALUE_BARRIER returns |a|, but prevents GCC and Clang from reasoning about +// the returned value. This is used to mitigate compilers undoing constant-time +// code, until we can express our requirements directly in the language. +// Note the compiler is aware that |VALUE_BARRIER| has no side effects and +// always has the same output for a given input. This allows it to eliminate +// dead code, move computations across loops, and vectorize." 
+// See: +// https://github.com/google/boringssl/commit/92b7c89e6e8ba82924b57153bea68241cc45f658 +#if(defined(__GNUC__) || defined(__clang__)) +# define VALUE_BARRIER(name, type) \ + _INLINE_ type name##_barrier(type a) \ + { \ + __asm__("" : "+r"(a) : /* no inputs */); \ + return a; \ + } +#else +# define VALUE_BARRIER(name, type) \ + _INLINE_ type name##_barrier(type a) { return a; } +#endif + +VALUE_BARRIER(u8, uint8_t) +VALUE_BARRIER(u32, uint32_t) +VALUE_BARRIER(u64, uint64_t) + +// Comparing value in a constant time manner +_INLINE_ uint32_t secure_cmp(IN const uint8_t *a, + IN const uint8_t *b, + IN const uint32_t size) +{ + volatile uint8_t res = 0; + + for(uint32_t i = 0; i < size; ++i) { + res |= (a[i] ^ b[i]); + } + + return (0 == res); +} + +// Return 1 if the arguments are equal to each other. Return 0 otherwise. +_INLINE_ uint32_t secure_cmp32(IN const uint32_t v1, IN const uint32_t v2) +{ +#if defined(__aarch64__) + uint32_t res; + __asm__ __volatile__("cmp %w[V1], %w[V2]; \n " + "cset %w[RES], EQ; \n" + : [RES] "=r"(res) + : [V1] "r"(v1), [V2] "r"(v2) + : "cc" /*The condition code flag*/); + return res; +#elif defined(__x86_64__) || defined(__i386__) + uint32_t res; + __asm__ __volatile__("xor %%edx, %%edx; \n" + "cmp %1, %2; \n " + "sete %%dl; \n" + "mov %%edx, %0; \n" + : "=r"(res) + : "r"(v1), "r"(v2) + : "rdx"); + return res; +#else + // Insecure comparison: The main purpose of secure_cmp32 is to avoid + // branches to prevent potential side channel leaks. To do that, + // we normally leverage some special CPU instructions such as "sete" + // (for __x86_64__) and "cset" (for __aarch64__). When dealing with general + // CPU architectures, the interpretation of the line below is left for the + // compiler. It could lead to an "insecure" branch. This case needs to be + // checked individually on such platforms + // (e.g., by checking the compiler-generated assembly). + return (v1 == v2 ? 1 : 0); +#endif +} + +// Return 0 if v1 < v2, (-1) otherwise +_INLINE_ uint32_t secure_l32_mask(IN const uint32_t v1, IN const uint32_t v2) +{ +#if defined(__aarch64__) + uint32_t res; + __asm__ __volatile__("cmp %w[V2], %w[V1]; \n " + "cset %w[RES], HI; \n" + : [RES] "=r"(res) + : [V1] "r"(v1), [V2] "r"(v2) + : "cc" /*The condition code flag*/); + return (res - 1); +#elif defined(__x86_64__) || defined(__i386__) + uint32_t res; + __asm__ __volatile__("xor %%edx, %%edx; \n" + "cmp %1, %2; \n " + "setl %%dl; \n" + "dec %%edx; \n" + "mov %%edx, %0; \n" + + : "=r"(res) + : "r"(v2), "r"(v1) + : "rdx"); + + return res; +#else + // If v1 >= v2 then the subtraction result is 0^32||(v1-v2). + // else it is 1^32||(v2-v1+1). Subsequently, negating the upper + // 32 bits gives 0 if v1 < v2 and otherwise (-1). 
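  // [Editor's note: worked example, not part of the upstream sources.]
  // E.g. v1 = 3, v2 = 5: the 64-bit subtraction gives 0xfffffffffffffffe,
  // its upper 32 bits are 0xffffffff, and negation yields 0 (v1 < v2).
  // For v1 = 5, v2 = 3 the upper 32 bits are 0 and negation yields
  // 0xffffffff, i.e. (-1).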
+ return ~((uint32_t)(((uint64_t)v1 - (uint64_t)v2) >> 32)); +#endif +} + +// bike_memcpy avoids the undefined behaviour of memcpy when byte_len=0 +_INLINE_ void *bike_memcpy(void *dst, const void *src, size_t byte_len) +{ + if(byte_len == 0) { + return dst; + } + + return memcpy(dst, src, byte_len); +} + +// bike_memset avoids the undefined behaviour of memset when byte_len=0 +_INLINE_ void *bike_memset(void *dst, const int ch, size_t byte_len) +{ + if(byte_len == 0) { + return dst; + } + + return memset(dst, ch, byte_len); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/x86_64_intrinsic.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/x86_64_intrinsic.h new file mode 100644 index 0000000000..b5c1e989bd --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/x86_64_intrinsic.h @@ -0,0 +1,132 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +// This file contains definitions of macros for SIMD intrinsic functions, used +// throughout the code package. Where necessary, we add a suffix to a macro, +// and denote the type of the elements (operateds). For example, +// - I16 denotes 16-bit wide integers, +// - U64 denotes 64-bit wide unsigned integers. + +#pragma once + +#if defined(S2N_BIKE_R3_AVX2) || defined(S2N_BIKE_R3_AVX512) +# include <immintrin.h> +#endif + +// clang 3.9 doesn't recognize this macro +#if !defined(_MM_CMPINT_EQ) +# define _MM_CMPINT_EQ (0) +#endif + +// For functions in gf2x_mul.c we use exactly the same code for +// PORTABLE, AVX2, AVX512 implementations. Based on the implementation, +// we define macros for the different data types (uint64_t, __m256i, __m512i), +// and all the required operations (LOAD, STORE, >>, <<) on these types. +#if defined(AVX2_INTERNAL) + +# define REG_T __m256i + +# define LOAD(mem) _mm256_loadu_si256((const void *)(mem)) +# define STORE(mem, reg) _mm256_storeu_si256((void *)(mem), (reg)) + +# define SLLI_I64(a, imm) _mm256_slli_epi64(a, imm) +# define SRLI_I64(a, imm) _mm256_srli_epi64(a, imm) + +#elif defined(AVX512_INTERNAL) + +# define REG_T __m512i + +# define LOAD(mem) _mm512_loadu_si512((mem)) +# define STORE(mem, reg) _mm512_storeu_si512((mem), (reg)) + +# define SLLI_I64(a, imm) _mm512_slli_epi64(a, imm) +# define SRLI_I64(a, imm) _mm512_srli_epi64(a, imm) + +#elif defined(PORTABLE_INTERNAL) + +# define REG_T uint64_t + +# define LOAD(mem) (mem)[0] +# define STORE(mem, val) (mem)[0] = val + +# define SLLI_I64(a, imm) ((a) << (imm)) +# define SRLI_I64(a, imm) ((a) >> (imm)) + +#endif + +// NOLINT is used to avoid the sizeof(T)/sizeof(T) warning when REG_T is defined +// to be uint64_t +#define REG_QWORDS (sizeof(REG_T) / sizeof(uint64_t)) // NOLINT +#define REG_DWORDS (sizeof(REG_T) / sizeof(uint32_t)) // NOLINT + +// The rest of the SIMD macros that are +// required for AVX2 and AVX512 implementation. +#if defined(AVX2_INTERNAL) + +# define SET_I8(...) _mm256_set_epi8(__VA_ARGS__) +# define SET_I32(...) _mm256_set_epi32(__VA_ARGS__) +# define SET_I64(...) 
_mm256_set_epi64x(__VA_ARGS__) +# define SET1_I8(a) _mm256_set1_epi8(a) +# define SET1_I16(a) _mm256_set1_epi16(a) +# define SET1_I32(a) _mm256_set1_epi32(a) +# define SET1_I64(a) _mm256_set1_epi64x(a) +# define SET_ZERO _mm256_setzero_si256() + +# define ADD_I8(a, b) _mm256_add_epi8(a, b) +# define SUB_I8(a, b) _mm256_sub_epi8(a, b) +# define ADD_I16(a, b) _mm256_add_epi16(a, b) +# define SUB_I16(a, b) _mm256_sub_epi16(a, b) +# define ADD_I64(a, b) _mm256_add_epi64(a, b) +# define SRLI_I16(a, imm) _mm256_srli_epi16(a, imm) +# define SLLI_I32(a, imm) _mm256_slli_epi32(a, imm) +# define SLLV_I32(a, b) _mm256_sllv_epi32(a, b) + +# define CMPGT_I16(a, b) _mm256_cmpgt_epi16(a, b) +# define CMPEQ_I16(a, b) _mm256_cmpeq_epi16(a, b) +# define CMPEQ_I32(a, b) _mm256_cmpeq_epi32(a, b) +# define CMPEQ_I64(a, b) _mm256_cmpeq_epi64(a, b) + +# define SHUF_I8(a, b) _mm256_shuffle_epi8(a, b) +# define BLENDV_I8(a, b, mask) _mm256_blendv_epi8(a, b, mask) +# define PERMVAR_I32(a, idx) _mm256_permutevar8x32_epi32(a, idx) +# define PERM_I64(a, imm) _mm256_permute4x64_epi64(a, imm) + +# define MOVEMASK(a) _mm256_movemask_epi8(a) + +#elif defined(AVX512_INTERNAL) + +# define MSTORE(mem, mask, reg) _mm512_mask_store_epi64((mem), (mask), (reg)) + +# define SET1_I8(a) _mm512_set1_epi8(a) +# define SET1_I32(a) _mm512_set1_epi32(a) +# define SET1_I64(a) _mm512_set1_epi64(a) +# define SET1MZ_I8(mask, a) _mm512_maskz_set1_epi8(mask, a) +# define SET1_I16(a) _mm512_set1_epi16(a) +# define SET_I64(...) _mm512_set_epi64(__VA_ARGS__) +# define SET_ZERO _mm512_setzero_si512() + +# define ADD_I16(a, b) _mm512_add_epi16(a, b) +# define ADD_I64(a, b) _mm512_add_epi64(a, b) +# define MSUB_I16(src, k, a, b) _mm512_mask_sub_epi16(src, k, a, b) +# define SRLI_I16(a, imm) _mm512_srli_epi16(a, imm) +# define SRLV_I64(a, cnt) _mm512_srlv_epi64(a, cnt) +# define SLLV_I64(a, cnt) _mm512_sllv_epi64(a, cnt) +# define MOR_I64(src, mask, a, b) _mm512_mask_or_epi64(src, mask, a, b) +# define MXOR_I64(src, mask, a, b) _mm512_mask_xor_epi64(src, mask, a, b) +# define VALIGN(a, b, count) _mm512_alignr_epi64(a, b, count) + +# define CMPM_U8(a, b, cmp_op) _mm512_cmp_epu8_mask(a, b, cmp_op) +# define CMPM_U16(a, b, cmp_op) _mm512_cmp_epu16_mask(a, b, cmp_op) +# define CMPMEQ_I64(a, b) _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_EQ) +# define MCMPMEQ_I32(mask, a, b) \ + _mm512_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ) + +# define PERMX_I64(a, imm) _mm512_permutex_epi64(a, imm) +# define PERMX2VAR_I64(a, idx, b) _mm512_permutex2var_epi64(a, idx, b) +# define PERMXVAR_I64(idx, a) _mm512_permutexvar_epi64(idx, a) + +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/indcpa.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/indcpa.c index c37548326d..4c520b693f 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/indcpa.c +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/indcpa.c @@ -188,7 +188,7 @@ int PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { uint8_t *noiseseed = buf + KYBER_SYMBYTES; uint8_t nonce = 0; - GUARD_AS_POSIX(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/kyber_90s_r2_kem.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/kyber_90s_r2_kem.c index 9de3c1daef..5b4c088b11 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/kyber_90s_r2_kem.c +++ 
b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/kyber_90s_r2_kem.c @@ -22,14 +22,14 @@ * Returns 0 (success) **************************************************/ int kyber_512_90s_r2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); size_t i; PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - GUARD_AS_POSIX(s2n_get_random_bytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES)); /* Value z for pseudo-random output on reject */ + POSIX_GUARD_RESULT(s2n_get_random_bytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES)); /* Value z for pseudo-random output on reject */ return 0; } @@ -46,11 +46,11 @@ int kyber_512_90s_r2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { * Returns 0 (success) **************************************************/ int kyber_512_90s_r2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t buf[2 * KYBER_SYMBYTES]; - GUARD_AS_POSIX(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ @@ -78,7 +78,7 @@ int kyber_512_90s_r2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) * On failure, ss will contain a pseudo-random value. 
**************************************************/ int kyber_512_90s_r2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); size_t i; uint8_t fail; uint8_t cmp[KYBER_CIPHERTEXTBYTES]; diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/ntt.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/ntt.h index 720bee975a..66fc5a9484 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/ntt.h +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/ntt.h @@ -6,8 +6,8 @@ extern const int16_t PQCLEAN_KYBER51290S_CLEAN_zetas[128]; extern const int16_t PQCLEAN_KYBER51290S_CLEAN_zetasinv[128]; -void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t *poly); -void PQCLEAN_KYBER51290S_CLEAN_invntt(int16_t *poly); +void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t poly[256]); +void PQCLEAN_KYBER51290S_CLEAN_invntt(int16_t poly[256]); void PQCLEAN_KYBER51290S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/indcpa.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/indcpa.c index 233b5d8515..1b76bb9b0c 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/indcpa.c +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/indcpa.c @@ -188,7 +188,7 @@ int PQCLEAN_KYBER512_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { uint8_t *noiseseed = buf + KYBER_SYMBYTES; uint8_t nonce = 0; - GUARD_AS_POSIX(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/kyber_r2_kem.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/kyber_r2_kem.c index 9871084bb4..140ec352d4 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/kyber_r2_kem.c +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/kyber_r2_kem.c @@ -22,14 +22,14 @@ * Returns 0 (success) **************************************************/ int kyber_512_r2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); size_t i; PQCLEAN_KYBER512_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - GUARD_AS_POSIX(s2n_get_random_bytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES)); /* Value z for pseudo-random output on reject */ + POSIX_GUARD_RESULT(s2n_get_random_bytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES)); /* Value z for pseudo-random output on reject */ return 0; } @@ -46,11 +46,11 @@ int kyber_512_r2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { * Returns 0 (success) **************************************************/ int kyber_512_r2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t buf[2 * KYBER_SYMBYTES]; - GUARD_AS_POSIX(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget 
countermeasure for coins + contributory KEM */ @@ -78,7 +78,7 @@ int kyber_512_r2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { * On failure, ss will contain a pseudo-random value. **************************************************/ int kyber_512_r2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); size_t i; uint8_t fail; uint8_t cmp[KYBER_CIPHERTEXTBYTES]; diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/ntt.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/ntt.h index 13e976f7d0..7885e7cdc6 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/ntt.h +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/ntt.h @@ -6,8 +6,8 @@ extern const int16_t PQCLEAN_KYBER512_CLEAN_zetas[128]; extern const int16_t PQCLEAN_KYBER512_CLEAN_zetasinv[128]; -void PQCLEAN_KYBER512_CLEAN_ntt(int16_t *poly); -void PQCLEAN_KYBER512_CLEAN_invntt(int16_t *poly); +void PQCLEAN_KYBER512_CLEAN_ntt(int16_t poly[256]); +void PQCLEAN_KYBER512_CLEAN_invntt(int16_t poly[256]); void PQCLEAN_KYBER512_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SIMD256_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SIMD256_avx2.c new file mode 100644 index 0000000000..349442f65c --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SIMD256_avx2.c @@ -0,0 +1,1284 @@ +/* +Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, +Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby +denoted as "the implementer". + +For more information, feedback or questions, please refer to our websites: +http://keccak.noekeon.org/ +http://keyak.noekeon.org/ +http://ketje.noekeon.org/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +// extra headers are removed: smmintrin.h, wmmintrin.h and emmintrin.h + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> +#include "KeccakP-align_avx2.h" +#include "KeccakP-1600-times4-SnP_avx2.h" +#include "KeccakP-SIMD256-config_avx2.h" + +#include "KeccakP-brg_endian_avx2.h" +#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) +#error Expecting a little-endian platform +#endif + +typedef unsigned char UINT8; +typedef unsigned long long int UINT64; +typedef __m128i V128; +typedef __m256i V256; + +#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex) + +#if defined(KeccakP1600times4_useAVX2) + #define ANDnu256(a, b) _mm256_andnot_si256(a, b) + // correcting cast-align error + // old version: #define CONST256(a) _mm256_load_si256((const V256 *)&(a)) + #define CONST256(a) _mm256_load_si256((const void *)&(a)) + #define CONST256_64(a) (V256)_mm256_broadcast_sd((const double*)(&a)) + #define LOAD256(a) _mm256_load_si256((const V256 *)&(a)) + // correcting cast-align error + // old version: #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a)) + #define LOAD256u(a) _mm256_loadu_si256((const void *)&(a)) + #define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d)) + #define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o))) + #define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8)) + #define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56)) +static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F}; +static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19}; + #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b) + // correcting cast-align error + // old version: #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b) + #define STORE256u(a, b) _mm256_storeu_si256((void *)&(a), b) + #define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v) + #define XOR256(a, b) _mm256_xor_si256(a, b) + #define XOReq256(a, b) a = _mm256_xor_si256(a, b) + #define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) + #define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) + #define PERM128( a, b, c ) (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c) + #define SHUFFLE64( a, b, c ) (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c) + + #define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \ + lanesH01 = UNPACKH( lanes0, lanes1 ), \ + lanesL23 = UNPACKL( lanes2, lanes3 ), \ + lanesH23 = UNPACKH( lanes2, lanes3 ), \ + lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \ + lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \ + lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \ + lanes3 = PERM128( lanesH01, lanesH23, 0x31 ) + + #define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \ + lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \ + lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \ + lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \ + lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \ + lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \ + lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \ + lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F ) + +#endif + +#define SnP_laneLengthInBytes 8 + +void KeccakP1600times4_InitializeAll(void *states) +{ + memset(states, 0, KeccakP1600times4_statesSizeInBytes); +} + +void 
KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length) +{ + unsigned int sizeLeft = length; + unsigned int lanePosition = offset/SnP_laneLengthInBytes; + unsigned int offsetInLane = offset%SnP_laneLengthInBytes; + const unsigned char *curData = data; + UINT64 *statesAsLanes = (UINT64 *)states; + + if ((sizeLeft > 0) && (offsetInLane != 0)) { + unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; + UINT64 lane = 0; + if (bytesInLane > sizeLeft) + bytesInLane = sizeLeft; + memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane); + statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; + sizeLeft -= bytesInLane; + lanePosition++; + curData += bytesInLane; + } + + while(sizeLeft >= SnP_laneLengthInBytes) { + // correcting cast-align error + // old version: UINT64 lane = *((const UINT64*)curData); + UINT64 lane = *((const UINT64*)(const void *)curData); + statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; + sizeLeft -= SnP_laneLengthInBytes; + lanePosition++; + curData += SnP_laneLengthInBytes; + } + + if (sizeLeft > 0) { + UINT64 lane = 0; + memcpy(&lane, curData, sizeLeft); + statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; + } +} + +void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset) +{ + V256 *stateAsLanes = (V256 *)states; + unsigned int i; + // correcting cast-align errors + // old version: const UINT64 *curData0 = (const UINT64 *)data; + const UINT64 *curData0 = (const void *)data; + // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes); + const UINT64 *curData1 = (const void *)(data+laneOffset*SnP_laneLengthInBytes); + // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); + const UINT64 *curData2 = (const void *)(data+laneOffset*2*SnP_laneLengthInBytes); + // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); + const UINT64 *curData3 = (const void *)(data+laneOffset*3*SnP_laneLengthInBytes); + V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; + + #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) + + #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\ + lanes1 = LOAD256u( curData1[argIndex]),\ + lanes2 = LOAD256u( curData2[argIndex]),\ + lanes3 = LOAD256u( curData3[argIndex]),\ + INTLEAVE(),\ + XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ + XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ + XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ + XOReq256( stateAsLanes[argIndex+3], lanes3 ) + + if ( laneCount >= 16 ) { + Xor_In4( 0 ); + Xor_In4( 4 ); + Xor_In4( 8 ); + Xor_In4( 12 ); + if ( laneCount >= 20 ) { + Xor_In4( 16 ); + for(i=20; i<laneCount; i++) + Xor_In( i ); + } + else { + for(i=16; i<laneCount; i++) + Xor_In( i ); + } + } + else { + for(i=0; i<laneCount; i++) + Xor_In( i ); + } + #undef Xor_In + #undef Xor_In4 +} + +void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length) +{ + unsigned int sizeLeft = length; + unsigned int lanePosition = offset/SnP_laneLengthInBytes; + unsigned int offsetInLane = offset%SnP_laneLengthInBytes; + const unsigned char *curData = data; + UINT64 *statesAsLanes = (UINT64 *)states; + 
+ if ((sizeLeft > 0) && (offsetInLane != 0)) { + unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; + if (bytesInLane > sizeLeft) + bytesInLane = sizeLeft; + memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane); + sizeLeft -= bytesInLane; + lanePosition++; + curData += bytesInLane; + } + + while(sizeLeft >= SnP_laneLengthInBytes) { + // correcting cast-align error + // old version: UINT64 lane = *((const UINT64*)curData); + UINT64 lane = *((const UINT64*)(const void*)curData); + statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane; + sizeLeft -= SnP_laneLengthInBytes; + lanePosition++; + curData += SnP_laneLengthInBytes; + } + + if (sizeLeft > 0) { + memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft); + } +} + +void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset) +{ + V256 *stateAsLanes = (V256 *)states; + unsigned int i; + // correcting cast-align errors + // old version: const UINT64 *curData0 = (const UINT64 *)data; + const UINT64 *curData0 = (const void *)data; + // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes); + const UINT64 *curData1 = (const void *)(data+laneOffset*SnP_laneLengthInBytes); + // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); + const UINT64 *curData2 = (const void *)(data+laneOffset*2*SnP_laneLengthInBytes); + // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); + const UINT64 *curData3 = (const void *)(data+laneOffset*3*SnP_laneLengthInBytes); + V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; + + #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) + + #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\ + lanes1 = LOAD256u( curData1[argIndex]),\ + lanes2 = LOAD256u( curData2[argIndex]),\ + lanes3 = LOAD256u( curData3[argIndex]),\ + INTLEAVE(),\ + STORE256( stateAsLanes[argIndex+0], lanes0 ),\ + STORE256( stateAsLanes[argIndex+1], lanes1 ),\ + STORE256( stateAsLanes[argIndex+2], lanes2 ),\ + STORE256( stateAsLanes[argIndex+3], lanes3 ) + + if ( laneCount >= 16 ) { + OverWr4( 0 ); + OverWr4( 4 ); + OverWr4( 8 ); + OverWr4( 12 ); + if ( laneCount >= 20 ) { + OverWr4( 16 ); + for(i=20; i<laneCount; i++) + OverWr( i ); + } + else { + for(i=16; i<laneCount; i++) + OverWr( i ); + } + } + else { + for(i=0; i<laneCount; i++) + OverWr( i ); + } + #undef OverWr + #undef OverWr4 +} + +void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount) +{ + unsigned int sizeLeft = byteCount; + unsigned int lanePosition = 0; + UINT64 *statesAsLanes = (UINT64 *)states; + + while(sizeLeft >= SnP_laneLengthInBytes) { + statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0; + sizeLeft -= SnP_laneLengthInBytes; + lanePosition++; + } + + if (sizeLeft > 0) { + memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft); + } +} + +void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length) +{ + unsigned int sizeLeft = length; + unsigned int lanePosition = offset/SnP_laneLengthInBytes; + unsigned int offsetInLane = offset%SnP_laneLengthInBytes; + unsigned char 
*curData = data; + const UINT64 *statesAsLanes = (const UINT64 *)states; + + if ((sizeLeft > 0) && (offsetInLane != 0)) { + unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; + if (bytesInLane > sizeLeft) + bytesInLane = sizeLeft; + // correcting cast-qual error + // old version: memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane); + memcpy( curData, ((const unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane); + sizeLeft -= bytesInLane; + lanePosition++; + curData += bytesInLane; + } + + while(sizeLeft >= SnP_laneLengthInBytes) { + // correcting cast-align error + // old version: *(UINT64*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; + *(UINT64*)(void*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; + sizeLeft -= SnP_laneLengthInBytes; + lanePosition++; + curData += SnP_laneLengthInBytes; + } + + if (sizeLeft > 0) { + memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft); + } +} + +void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset) +{ + // correcting cast-align errors + // old version: UINT64 *curData0 = (UINT64 *)data; + UINT64 *curData0 = (void *)data; + // old version: UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes); + UINT64 *curData1 = (void *)(data+laneOffset*1*SnP_laneLengthInBytes); + // old version: UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); + UINT64 *curData2 = (void *)(data+laneOffset*2*SnP_laneLengthInBytes); + // old version: UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); + UINT64 *curData3 = (void *)(data+laneOffset*3*SnP_laneLengthInBytes); + + const V256 *stateAsLanes = (const V256 *)states; + const UINT64 *stateAsLanes64 = (const UINT64*)states; + V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; + unsigned int i; + + #define Extr( argIndex ) curData0[argIndex] = stateAsLanes64[4*(argIndex)], \ + curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \ + curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \ + curData3[argIndex] = stateAsLanes64[4*(argIndex)+3] + + #define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \ + lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \ + lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \ + lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \ + UNINTLEAVE(), \ + STORE256u( curData0[argIndex], lanes0 ), \ + STORE256u( curData1[argIndex], lanes1 ), \ + STORE256u( curData2[argIndex], lanes2 ), \ + STORE256u( curData3[argIndex], lanes3 ) + + if ( laneCount >= 16 ) { + Extr4( 0 ); + Extr4( 4 ); + Extr4( 8 ); + Extr4( 12 ); + if ( laneCount >= 20 ) { + Extr4( 16 ); + for(i=20; i<laneCount; i++) + Extr( i ); + } + else { + for(i=16; i<laneCount; i++) + Extr( i ); + } + } + else { + for(i=0; i<laneCount; i++) + Extr( i ); + } + #undef Extr + #undef Extr4 +} + +void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length) +{ + unsigned int sizeLeft = length; + unsigned int lanePosition = offset/SnP_laneLengthInBytes; + unsigned int offsetInLane = offset%SnP_laneLengthInBytes; + const unsigned char *curInput = input; + unsigned char *curOutput = output; + const UINT64 *statesAsLanes = (const UINT64 *)states; + + if ((sizeLeft > 0) && (offsetInLane != 
0)) { + unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; + UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane); + if (bytesInLane > sizeLeft) + bytesInLane = sizeLeft; + sizeLeft -= bytesInLane; + do { + *(curOutput++) = *(curInput++) ^ (unsigned char)lane; + lane >>= 8; + } while ( --bytesInLane != 0); + lanePosition++; + } + + while(sizeLeft >= SnP_laneLengthInBytes) { + // correcting cast-align and cast-qual errors + // old version: *((UINT64*)curOutput) = *((UINT64*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)]; + *((UINT64*)(void*)curOutput) = *((const UINT64*)(const void*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)]; + sizeLeft -= SnP_laneLengthInBytes; + lanePosition++; + curInput += SnP_laneLengthInBytes; + curOutput += SnP_laneLengthInBytes; + } + + if (sizeLeft != 0) { + UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; + do { + *(curOutput++) = *(curInput++) ^ (unsigned char)lane; + lane >>= 8; + } while ( --sizeLeft != 0); + } +} + +void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset) +{ + // correcting cast-align and cast-qual errors + // old version: const UINT64 *curInput0 = (UINT64 *)input; + const UINT64 *curInput0 = (const void *)input; + // old version: const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes); + const UINT64 *curInput1 = (const void *)(input+laneOffset*1*SnP_laneLengthInBytes); + // old version: const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes); + const UINT64 *curInput2 = (const void *)(input+laneOffset*2*SnP_laneLengthInBytes); + // old version: const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes); + const UINT64 *curInput3 = (const void *)(input+laneOffset*3*SnP_laneLengthInBytes); + // correcting cast-align errors + // old version: UINT64 *curOutput0 = (UINT64 *)output; + UINT64 *curOutput0 = (void *)output; + // old version: UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes); + UINT64 *curOutput1 = (void *)(output+laneOffset*1*SnP_laneLengthInBytes); + // old version: UUINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes); + UINT64 *curOutput2 = (void *)(output+laneOffset*2*SnP_laneLengthInBytes); + // old version: UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes); + UINT64 *curOutput3 = (void *)(output+laneOffset*3*SnP_laneLengthInBytes); + + const V256 *stateAsLanes = (const V256 *)states; + const UINT64 *stateAsLanes64 = (const UINT64*)states; + V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; + unsigned int i; + + #define ExtrXor( argIndex ) \ + curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\ + curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\ + curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\ + curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3] + + #define ExtrXor4( argIndex ) \ + lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\ + lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\ + lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\ + lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\ + UNINTLEAVE(),\ + lanesL01 = LOAD256u( curInput0[argIndex]),\ + lanesH01 = LOAD256u( curInput1[argIndex]),\ + lanesL23 = LOAD256u( curInput2[argIndex]),\ + lanesH23 = 
LOAD256u( curInput3[argIndex]),\ + XOReq256( lanes0, lanesL01 ),\ + XOReq256( lanes1, lanesH01 ),\ + XOReq256( lanes2, lanesL23 ),\ + XOReq256( lanes3, lanesH23 ),\ + STORE256u( curOutput0[argIndex], lanes0 ),\ + STORE256u( curOutput1[argIndex], lanes1 ),\ + STORE256u( curOutput2[argIndex], lanes2 ),\ + STORE256u( curOutput3[argIndex], lanes3 ) + + if ( laneCount >= 16 ) { + ExtrXor4( 0 ); + ExtrXor4( 4 ); + ExtrXor4( 8 ); + ExtrXor4( 12 ); + if ( laneCount >= 20 ) { + ExtrXor4( 16 ); + for(i=20; i<laneCount; i++) + ExtrXor( i ); + } + else { + for(i=16; i<laneCount; i++) + ExtrXor( i ); + } + } + else { + for(i=0; i<laneCount; i++) + ExtrXor( i ); + } + #undef ExtrXor + #undef ExtrXor4 +} + +#define declareABCDE \ + V256 Aba, Abe, Abi, Abo, Abu; \ + V256 Aga, Age, Agi, Ago, Agu; \ + V256 Aka, Ake, Aki, Ako, Aku; \ + V256 Ama, Ame, Ami, Amo, Amu; \ + V256 Asa, Ase, Asi, Aso, Asu; \ + V256 Bba, Bbe, Bbi, Bbo, Bbu; \ + V256 Bga, Bge, Bgi, Bgo, Bgu; \ + V256 Bka, Bke, Bki, Bko, Bku; \ + V256 Bma, Bme, Bmi, Bmo, Bmu; \ + V256 Bsa, Bse, Bsi, Bso, Bsu; \ + V256 Ca, Ce, Ci, Co, Cu; \ + V256 Ca1, Ce1, Ci1, Co1, Cu1; \ + V256 Da, De, Di, Do, Du; \ + V256 Eba, Ebe, Ebi, Ebo, Ebu; \ + V256 Ega, Ege, Egi, Ego, Egu; \ + V256 Eka, Eke, Eki, Eko, Eku; \ + V256 Ema, Eme, Emi, Emo, Emu; \ + V256 Esa, Ese, Esi, Eso, Esu; \ + +#define prepareTheta \ + Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \ + Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \ + Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \ + Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \ + Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \ + +/* --- Theta Rho Pi Chi Iota Prepare-theta */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + ROL64in256(Ce1, Ce, 1); \ + Da = XOR256(Cu, Ce1); \ + ROL64in256(Ci1, Ci, 1); \ + De = XOR256(Ca, Ci1); \ + ROL64in256(Co1, Co, 1); \ + Di = XOR256(Ce, Co1); \ + ROL64in256(Cu1, Cu, 1); \ + Do = XOR256(Ci, Cu1); \ + ROL64in256(Ca1, Ca, 1); \ + Du = XOR256(Co, Ca1); \ +\ + XOReq256(A##ba, Da); \ + Bba = A##ba; \ + XOReq256(A##ge, De); \ + ROL64in256(Bbe, A##ge, 44); \ + XOReq256(A##ki, Di); \ + ROL64in256(Bbi, A##ki, 43); \ + E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ + XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \ + Ca = E##ba; \ + XOReq256(A##mo, Do); \ + ROL64in256(Bbo, A##mo, 21); \ + E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ + Ce = E##be; \ + XOReq256(A##su, Du); \ + ROL64in256(Bbu, A##su, 14); \ + E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ + Ci = E##bi; \ + E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ + Co = E##bo; \ + E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ + Cu = E##bu; \ +\ + XOReq256(A##bo, Do); \ + ROL64in256(Bga, A##bo, 28); \ + XOReq256(A##gu, Du); \ + ROL64in256(Bge, A##gu, 20); \ + XOReq256(A##ka, Da); \ + ROL64in256(Bgi, A##ka, 3); \ + E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ + XOReq256(Ca, E##ga); \ + XOReq256(A##me, De); \ + ROL64in256(Bgo, A##me, 45); \ + E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ + XOReq256(Ce, E##ge); \ + XOReq256(A##si, Di); \ + ROL64in256(Bgu, A##si, 61); \ + E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ + XOReq256(Ci, E##gi); \ + E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ + XOReq256(Co, E##go); \ + E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ + XOReq256(Cu, E##gu); \ +\ + XOReq256(A##be, De); \ + ROL64in256(Bka, A##be, 1); \ + XOReq256(A##gi, Di); \ + ROL64in256(Bke, A##gi, 6); \ + XOReq256(A##ko, Do); \ + ROL64in256(Bki, A##ko, 25); \ + 
E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ + XOReq256(Ca, E##ka); \ + XOReq256(A##mu, Du); \ + ROL64in256_8(Bko, A##mu); \ + E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ + XOReq256(Ce, E##ke); \ + XOReq256(A##sa, Da); \ + ROL64in256(Bku, A##sa, 18); \ + E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ + XOReq256(Ci, E##ki); \ + E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ + XOReq256(Co, E##ko); \ + E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ + XOReq256(Cu, E##ku); \ +\ + XOReq256(A##bu, Du); \ + ROL64in256(Bma, A##bu, 27); \ + XOReq256(A##ga, Da); \ + ROL64in256(Bme, A##ga, 36); \ + XOReq256(A##ke, De); \ + ROL64in256(Bmi, A##ke, 10); \ + E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ + XOReq256(Ca, E##ma); \ + XOReq256(A##mi, Di); \ + ROL64in256(Bmo, A##mi, 15); \ + E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ + XOReq256(Ce, E##me); \ + XOReq256(A##so, Do); \ + ROL64in256_56(Bmu, A##so); \ + E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ + XOReq256(Ci, E##mi); \ + E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ + XOReq256(Co, E##mo); \ + E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ + XOReq256(Cu, E##mu); \ +\ + XOReq256(A##bi, Di); \ + ROL64in256(Bsa, A##bi, 62); \ + XOReq256(A##go, Do); \ + ROL64in256(Bse, A##go, 55); \ + XOReq256(A##ku, Du); \ + ROL64in256(Bsi, A##ku, 39); \ + E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ + XOReq256(Ca, E##sa); \ + XOReq256(A##ma, Da); \ + ROL64in256(Bso, A##ma, 41); \ + E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ + XOReq256(Ce, E##se); \ + XOReq256(A##se, De); \ + ROL64in256(Bsu, A##se, 2); \ + E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ + XOReq256(Ci, E##si); \ + E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ + XOReq256(Co, E##so); \ + E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ + XOReq256(Cu, E##su); \ +\ + +/* --- Theta Rho Pi Chi Iota */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + ROL64in256(Ce1, Ce, 1); \ + Da = XOR256(Cu, Ce1); \ + ROL64in256(Ci1, Ci, 1); \ + De = XOR256(Ca, Ci1); \ + ROL64in256(Co1, Co, 1); \ + Di = XOR256(Ce, Co1); \ + ROL64in256(Cu1, Cu, 1); \ + Do = XOR256(Ci, Cu1); \ + ROL64in256(Ca1, Ca, 1); \ + Du = XOR256(Co, Ca1); \ +\ + XOReq256(A##ba, Da); \ + Bba = A##ba; \ + XOReq256(A##ge, De); \ + ROL64in256(Bbe, A##ge, 44); \ + XOReq256(A##ki, Di); \ + ROL64in256(Bbi, A##ki, 43); \ + E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ + XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \ + XOReq256(A##mo, Do); \ + ROL64in256(Bbo, A##mo, 21); \ + E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ + XOReq256(A##su, Du); \ + ROL64in256(Bbu, A##su, 14); \ + E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ + E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ + E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ +\ + XOReq256(A##bo, Do); \ + ROL64in256(Bga, A##bo, 28); \ + XOReq256(A##gu, Du); \ + ROL64in256(Bge, A##gu, 20); \ + XOReq256(A##ka, Da); \ + ROL64in256(Bgi, A##ka, 3); \ + E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ + XOReq256(A##me, De); \ + ROL64in256(Bgo, A##me, 45); \ + E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ + XOReq256(A##si, Di); \ + ROL64in256(Bgu, A##si, 61); \ + E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ + E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ + E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ +\ + XOReq256(A##be, De); \ + ROL64in256(Bka, A##be, 1); \ + XOReq256(A##gi, Di); \ + ROL64in256(Bke, A##gi, 6); \ + XOReq256(A##ko, Do); \ + ROL64in256(Bki, A##ko, 25); \ + E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ + XOReq256(A##mu, Du); \ + ROL64in256_8(Bko, A##mu); \ + E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ + XOReq256(A##sa, Da); \ + 
ROL64in256(Bku, A##sa, 18); \ + E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ + E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ + E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ +\ + XOReq256(A##bu, Du); \ + ROL64in256(Bma, A##bu, 27); \ + XOReq256(A##ga, Da); \ + ROL64in256(Bme, A##ga, 36); \ + XOReq256(A##ke, De); \ + ROL64in256(Bmi, A##ke, 10); \ + E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ + XOReq256(A##mi, Di); \ + ROL64in256(Bmo, A##mi, 15); \ + E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ + XOReq256(A##so, Do); \ + ROL64in256_56(Bmu, A##so); \ + E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ + E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ + E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ +\ + XOReq256(A##bi, Di); \ + ROL64in256(Bsa, A##bi, 62); \ + XOReq256(A##go, Do); \ + ROL64in256(Bse, A##go, 55); \ + XOReq256(A##ku, Du); \ + ROL64in256(Bsi, A##ku, 39); \ + E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ + XOReq256(A##ma, Da); \ + ROL64in256(Bso, A##ma, 41); \ + E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ + XOReq256(A##se, De); \ + ROL64in256(Bsu, A##se, 2); \ + E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ + E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ + E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ +\ + +static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL}; + +#define copyFromState(X, state) \ + X##ba = LOAD256(state[ 0]); \ + X##be = LOAD256(state[ 1]); \ + X##bi = LOAD256(state[ 2]); \ + X##bo = LOAD256(state[ 3]); \ + X##bu = LOAD256(state[ 4]); \ + X##ga = LOAD256(state[ 5]); \ + X##ge = LOAD256(state[ 6]); \ + X##gi = LOAD256(state[ 7]); \ + X##go = LOAD256(state[ 8]); \ + X##gu = LOAD256(state[ 9]); \ + X##ka = LOAD256(state[10]); \ + X##ke = LOAD256(state[11]); \ + X##ki = LOAD256(state[12]); \ + X##ko = LOAD256(state[13]); \ + X##ku = LOAD256(state[14]); \ + X##ma = LOAD256(state[15]); \ + X##me = LOAD256(state[16]); \ + X##mi = LOAD256(state[17]); \ + X##mo = LOAD256(state[18]); \ + X##mu = LOAD256(state[19]); \ + X##sa = LOAD256(state[20]); \ + X##se = LOAD256(state[21]); \ + X##si = LOAD256(state[22]); \ + X##so = LOAD256(state[23]); \ + X##su = LOAD256(state[24]); \ + +#define copyToState(state, X) \ + STORE256(state[ 0], X##ba); \ + STORE256(state[ 1], X##be); \ + STORE256(state[ 2], X##bi); \ + STORE256(state[ 3], X##bo); \ + STORE256(state[ 4], X##bu); \ + STORE256(state[ 5], X##ga); \ + STORE256(state[ 6], X##ge); \ + STORE256(state[ 7], X##gi); \ + STORE256(state[ 8], X##go); \ + STORE256(state[ 9], X##gu); \ + STORE256(state[10], X##ka); \ + STORE256(state[11], X##ke); \ + STORE256(state[12], X##ki); \ + STORE256(state[13], X##ko); \ + STORE256(state[14], X##ku); \ + STORE256(state[15], X##ma); \ + STORE256(state[16], X##me); \ + STORE256(state[17], X##mi); \ + STORE256(state[18], X##mo); \ + STORE256(state[19], X##mu); \ + STORE256(state[20], X##sa); \ + STORE256(state[21], X##se); \ + STORE256(state[22], X##si); \ + STORE256(state[23], X##so); \ + STORE256(state[24], 
X##su); \ + +#define copyStateVariables(X, Y) \ + X##ba = Y##ba; \ + X##be = Y##be; \ + X##bi = Y##bi; \ + X##bo = Y##bo; \ + X##bu = Y##bu; \ + X##ga = Y##ga; \ + X##ge = Y##ge; \ + X##gi = Y##gi; \ + X##go = Y##go; \ + X##gu = Y##gu; \ + X##ka = Y##ka; \ + X##ke = Y##ke; \ + X##ki = Y##ki; \ + X##ko = Y##ko; \ + X##ku = Y##ku; \ + X##ma = Y##ma; \ + X##me = Y##me; \ + X##mi = Y##mi; \ + X##mo = Y##mo; \ + X##mu = Y##mu; \ + X##sa = Y##sa; \ + X##se = Y##se; \ + X##si = Y##si; \ + X##so = Y##so; \ + X##su = Y##su; \ + + #ifdef KeccakP1600times4_fullUnrolling +#define FullUnrolling +#else +#define Unrolling KeccakP1600times4_unrolling +#endif +// The macro file is combined with source file directly +/*****#include "KeccakP-1600-unrolling_avx2.macros"*****/ +/*******************************************************/ +/* +Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, +Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby +denoted as "the implementer". + +For more information, feedback or questions, please refer to our websites: +http://keccak.noekeon.org/ +http://keyak.noekeon.org/ +http://ketje.noekeon.org/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#if (defined(FullUnrolling)) +#define rounds24 \ + prepareTheta \ + thetaRhoPiChiIotaPrepareTheta( 0, A, E) \ + thetaRhoPiChiIotaPrepareTheta( 1, E, A) \ + thetaRhoPiChiIotaPrepareTheta( 2, A, E) \ + thetaRhoPiChiIotaPrepareTheta( 3, E, A) \ + thetaRhoPiChiIotaPrepareTheta( 4, A, E) \ + thetaRhoPiChiIotaPrepareTheta( 5, E, A) \ + thetaRhoPiChiIotaPrepareTheta( 6, A, E) \ + thetaRhoPiChiIotaPrepareTheta( 7, E, A) \ + thetaRhoPiChiIotaPrepareTheta( 8, A, E) \ + thetaRhoPiChiIotaPrepareTheta( 9, E, A) \ + thetaRhoPiChiIotaPrepareTheta(10, A, E) \ + thetaRhoPiChiIotaPrepareTheta(11, E, A) \ + thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) \ + +#define rounds12 \ + prepareTheta \ + thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) \ + +#elif (Unrolling == 12) +#define rounds24 \ + prepareTheta \ + for(i=0; i<24; i+=12) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \ + 
thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \ + } \ + +#define rounds12 \ + prepareTheta \ + thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) \ + +#elif (Unrolling == 6) +#define rounds24 \ + prepareTheta \ + for(i=0; i<24; i+=6) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ + } \ + +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=6) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ + } \ + +#elif (Unrolling == 4) +#define rounds24 \ + prepareTheta \ + for(i=0; i<24; i+=4) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + } \ + +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=4) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + } \ + +#elif (Unrolling == 3) +#define rounds24 \ + prepareTheta \ + for(i=0; i<24; i+=3) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + copyStateVariables(A, E) \ + } \ + +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=3) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + copyStateVariables(A, E) \ + } \ + +#elif (Unrolling == 2) +#define rounds24 \ + prepareTheta \ + for(i=0; i<24; i+=2) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + } \ + +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=2) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + } \ + +#elif (Unrolling == 1) +#define rounds24 \ + prepareTheta \ + for(i=0; i<24; i++) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + copyStateVariables(A, E) \ + } \ + +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i++) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + copyStateVariables(A, E) \ + } \ + +#else +#error "Unrolling is not correctly specified!" 
+#endif + +#define roundsN(__nrounds) \ + prepareTheta \ + i = 24 - (__nrounds); \ + if ((i&1) != 0) { \ + thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + copyStateVariables(A, E) \ + ++i; \ + } \ + for( /* empty */; i<24; i+=2) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + } + +/*******************************************************/ + +void KeccakP1600times4_PermuteAll_24rounds(void *states) +{ + V256 *statesAsLanes = (V256 *)states; + declareABCDE + #ifndef KeccakP1600times4_fullUnrolling + unsigned int i; + #endif + + copyFromState(A, statesAsLanes) + rounds24 + copyToState(statesAsLanes, A) +} + +void KeccakP1600times4_PermuteAll_12rounds(void *states) +{ + V256 *statesAsLanes = (V256 *)states; + declareABCDE + #ifndef KeccakP1600times4_fullUnrolling + unsigned int i; + #endif + + copyFromState(A, statesAsLanes) + rounds12 + copyToState(statesAsLanes, A) +} + +size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen) +{ + if (laneCount == 21) { +#if 0 + const unsigned char *dataStart = data; + const UINT64 *curData0 = (const UINT64 *)data; + const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); + const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); + const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); + + while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { + V256 *stateAsLanes = (V256 *)states; + V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; + #define Xor_In( argIndex ) \ + XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) + #define Xor_In4( argIndex ) \ + lanes0 = LOAD256u( curData0[argIndex]),\ + lanes1 = LOAD256u( curData1[argIndex]),\ + lanes2 = LOAD256u( curData2[argIndex]),\ + lanes3 = LOAD256u( curData3[argIndex]),\ + INTLEAVE(),\ + XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ + XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ + XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ + XOReq256( stateAsLanes[argIndex+3], lanes3 ) + Xor_In4( 0 ); + Xor_In4( 4 ); + Xor_In4( 8 ); + Xor_In4( 12 ); + Xor_In4( 16 ); + Xor_In( 20 ); + #undef Xor_In + #undef Xor_In4 + KeccakP1600times4_PermuteAll_24rounds(states); + curData0 += laneOffsetSerial; + curData1 += laneOffsetSerial; + curData2 += laneOffsetSerial; + curData3 += laneOffsetSerial; + dataByteLen -= laneOffsetSerial*8; + } + return (const unsigned char *)curData0 - dataStart; +#else +// unsigned int i; + const unsigned char *dataStart = data; + // correcting cast-align errors + // old version: const UINT64 *curData0 = (const UINT64 *)data; + const UINT64 *curData0 = (const void *)data; + // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); + const UINT64 *curData1 = (const void *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); + // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); + const UINT64 *curData2 = (const void *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); + // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); + const UINT64 *curData3 = (const void *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); + V256 *statesAsLanes = (V256 *)states; + declareABCDE + + 
copyFromState(A, statesAsLanes) + while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { + #define XOR_In( Xxx, argIndex ) \ + XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) + XOR_In( Aba, 0 ); + XOR_In( Abe, 1 ); + XOR_In( Abi, 2 ); + XOR_In( Abo, 3 ); + XOR_In( Abu, 4 ); + XOR_In( Aga, 5 ); + XOR_In( Age, 6 ); + XOR_In( Agi, 7 ); + XOR_In( Ago, 8 ); + XOR_In( Agu, 9 ); + XOR_In( Aka, 10 ); + XOR_In( Ake, 11 ); + XOR_In( Aki, 12 ); + XOR_In( Ako, 13 ); + XOR_In( Aku, 14 ); + XOR_In( Ama, 15 ); + XOR_In( Ame, 16 ); + XOR_In( Ami, 17 ); + XOR_In( Amo, 18 ); + XOR_In( Amu, 19 ); + XOR_In( Asa, 20 ); + #undef XOR_In + rounds24 + curData0 += laneOffsetSerial; + curData1 += laneOffsetSerial; + curData2 += laneOffsetSerial; + curData3 += laneOffsetSerial; + dataByteLen -= laneOffsetSerial*8; + } + copyToState(statesAsLanes, A) + return (const unsigned char *)curData0 - dataStart; +#endif + } + else { +// unsigned int i; + const unsigned char *dataStart = data; + + while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { + KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel); + KeccakP1600times4_PermuteAll_24rounds(states); + data += laneOffsetSerial*8; + dataByteLen -= laneOffsetSerial*8; + } + return data - dataStart; + } +} + +size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen) +{ + if (laneCount == 21) { +#if 0 + const unsigned char *dataStart = data; + const UINT64 *curData0 = (const UINT64 *)data; + const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); + const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); + const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); + + while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { + V256 *stateAsLanes = states; + V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; + #define Xor_In( argIndex ) \ + XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) + #define Xor_In4( argIndex ) \ + lanes0 = LOAD256u( curData0[argIndex]),\ + lanes1 = LOAD256u( curData1[argIndex]),\ + lanes2 = LOAD256u( curData2[argIndex]),\ + lanes3 = LOAD256u( curData3[argIndex]),\ + INTLEAVE(),\ + XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ + XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ + XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ + XOReq256( stateAsLanes[argIndex+3], lanes3 ) + Xor_In4( 0 ); + Xor_In4( 4 ); + Xor_In4( 8 ); + Xor_In4( 12 ); + Xor_In4( 16 ); + Xor_In( 20 ); + #undef Xor_In + #undef Xor_In4 + KeccakP1600times4_PermuteAll_12rounds(states); + curData0 += laneOffsetSerial; + curData1 += laneOffsetSerial; + curData2 += laneOffsetSerial; + curData3 += laneOffsetSerial; + dataByteLen -= laneOffsetSerial*8; + } + return (const unsigned char *)curData0 - dataStart; +#else +// unsigned int i; + const unsigned char *dataStart = data; + // correcting cast-align errors + // old version: const UINT64 *curData0 = (const UINT64 *)data; + const UINT64 *curData0 = (const void *)data; + // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); + const UINT64 *curData1 = (const void *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); + // old version: const UINT64 *curData2 = (const 
UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); + const UINT64 *curData2 = (const void *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); + // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); + const UINT64 *curData3 = (const void *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); + V256 *statesAsLanes = states; + declareABCDE + + copyFromState(A, statesAsLanes) + while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { + #define XOR_In( Xxx, argIndex ) \ + XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) + XOR_In( Aba, 0 ); + XOR_In( Abe, 1 ); + XOR_In( Abi, 2 ); + XOR_In( Abo, 3 ); + XOR_In( Abu, 4 ); + XOR_In( Aga, 5 ); + XOR_In( Age, 6 ); + XOR_In( Agi, 7 ); + XOR_In( Ago, 8 ); + XOR_In( Agu, 9 ); + XOR_In( Aka, 10 ); + XOR_In( Ake, 11 ); + XOR_In( Aki, 12 ); + XOR_In( Ako, 13 ); + XOR_In( Aku, 14 ); + XOR_In( Ama, 15 ); + XOR_In( Ame, 16 ); + XOR_In( Ami, 17 ); + XOR_In( Amo, 18 ); + XOR_In( Amu, 19 ); + XOR_In( Asa, 20 ); + #undef XOR_In + rounds12 + curData0 += laneOffsetSerial; + curData1 += laneOffsetSerial; + curData2 += laneOffsetSerial; + curData3 += laneOffsetSerial; + dataByteLen -= laneOffsetSerial*8; + } + copyToState(statesAsLanes, A) + return (const unsigned char *)curData0 - dataStart; +#endif + } + else { +// unsigned int i; + const unsigned char *dataStart = data; + + while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { + KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel); + KeccakP1600times4_PermuteAll_12rounds(states); + data += laneOffsetSerial*8; + dataByteLen -= laneOffsetSerial*8; + } + return data - dataStart; + } +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SnP_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SnP_avx2.h new file mode 100644 index 0000000000..2640191779 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SnP_avx2.h @@ -0,0 +1,63 @@ +/* +Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, +Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby +denoted as "the implementer". + +For more information, feedback or questions, please refer to our websites: +http://keccak.noekeon.org/ +http://keyak.noekeon.org/ +http://ketje.noekeon.org/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#pragma once + +/** For the documentation, see PlSnP-documentation.h. 
+ */ + +#include "KeccakP-SIMD256-config_avx2.h" +#include "kyber512r3_params.h" +#include "kyber512r3_fips202x4_avx2.h" + +#define KeccakP1600times4_implementation "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")" +#define KeccakP1600times4_statesSizeInBytes 800 +#define KeccakP1600times4_statesAlignment 32 +#define KeccakF1600times4_FastLoop_supported +#define KeccakP1600times4_12rounds_FastLoop_supported + +#include <stddef.h> + +#define KeccakP1600times4_StaticInitialize() +#define KeccakP1600times4_InitializeAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_InitializeAll) +void KeccakP1600times4_InitializeAll(void *states); +#define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \ + ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte) +#define KeccakP1600times4_AddBytes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_AddBytes) +void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); +#define KeccakP1600times4_AddLanesAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_AddLanesAll) +void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); +#define KeccakP1600times4_OverwriteBytes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_OverwriteBytes) +void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); +#define KeccakP1600times4_OverwriteLanesAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_OverwriteLanesAll) +void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); +#define KeccakP1600times4_OverwriteWithZeroes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_OverwriteWithZeroes) +void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount); +#define KeccakP1600times4_PermuteAll_12rounds S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_PermuteAll_12rounds) +void KeccakP1600times4_PermuteAll_12rounds(void *states); +#define KeccakP1600times4_PermuteAll_24rounds S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds) +void KeccakP1600times4_PermuteAll_24rounds(void *states); +#define KeccakP1600times4_ExtractBytes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_ExtractBytes) +void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length); +#define KeccakP1600times4_ExtractLanesAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_ExtractLanesAll) +void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset); +#define KeccakP1600times4_ExtractAndAddBytes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_ExtractAndAddBytes) +void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); +#define KeccakP1600times4_ExtractAndAddLanesAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_ExtractAndAddLanesAll) +void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset); +#define KeccakF1600times4_FastLoop_Absorb S2N_KYBER_512_R3_NAMESPACE(KeccakF1600times4_FastLoop_Absorb) +size_t 
KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); +#define KeccakP1600times4_12rounds_FastLoop_Absorb S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_12rounds_FastLoop_Absorb) +size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-SIMD256-config_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-SIMD256-config_avx2.h new file mode 100644 index 0000000000..1c65fe29b4 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-SIMD256-config_avx2.h @@ -0,0 +1,3 @@ +#define KeccakP1600times4_implementation_config "AVX2, all rounds unrolled" +#define KeccakP1600times4_fullUnrolling +#define KeccakP1600times4_useAVX2 diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-align_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-align_avx2.h new file mode 100644 index 0000000000..be08e84af2 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-align_avx2.h @@ -0,0 +1,31 @@ +/* +Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, +Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby +denoted as "the implementer". + +For more information, feedback or questions, please refer to our websites: +http://keccak.noekeon.org/ +http://keyak.noekeon.org/ +http://ketje.noekeon.org/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#pragma once + +/* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */ +#ifdef ALIGN +#undef ALIGN +#endif + +#if defined(__GNUC__) +#define ALIGN(x) __attribute__ ((aligned(x))) +#elif defined(_MSC_VER) +#define ALIGN(x) __declspec(align(x)) +#elif defined(__ARMCC_VERSION) +#define ALIGN(x) __align(x) +#else +#define ALIGN(x) +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-brg_endian_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-brg_endian_avx2.h new file mode 100644 index 0000000000..8e8b73cf2a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-brg_endian_avx2.h @@ -0,0 +1,139 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ --------------------------------------------------------------------------- + Issue Date: 20/12/2007 + Changes for ARM 9/9/2010 +*/ + +#pragma once + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +#if 0 +/* Include files where endian defines and byteswap functions may reside */ +#if defined( __sun ) +# include <sys/isa_defs.h> +#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include <sys/endian.h> +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include <machine/endian.h> +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) && !defined( _AIX ) +# include <endian.h> +# if !defined( __BEOS__ ) +# include <byteswap.h> +# endif +# endif +#endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) +# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + 
defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif defined(__arm__) +# ifdef __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# else +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif 1 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order +#endif + +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_align_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_align_avx2.h new file mode 100644 index 0000000000..79e6d9ec0c --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_align_avx2.h @@ -0,0 +1,19 @@ +#pragma once + +#include <stdint.h> + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[N]; \ + __m256i vec[(N+31)/32]; \ + } + +#define ALIGNED_INT16(N) \ + union { \ + int16_t coeffs[N]; \ + __m256i vec[(N+15)/16]; \ + } +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_basemul_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_basemul_avx2.S new file mode 100644 index 0000000000..ed2a65be20 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_basemul_avx2.S @@ -0,0 +1,105 @@ +#include "kyber512r3_consts_avx2.h" + +.macro schoolbook off +vmovdqa _16XQINV*2(%rcx),%ymm0 +vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0 +vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0 +vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1 +vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1 + +vpmullw %ymm0,%ymm1,%ymm9 # a0.lo +vpmullw %ymm0,%ymm2,%ymm10 # b0.lo +vpmullw %ymm0,%ymm3,%ymm11 # a1.lo +vpmullw %ymm0,%ymm4,%ymm12 # b1.lo + +vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0 +vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0 + +vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi +vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi +vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi +vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi + +vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1 +vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1 + +vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi +vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi +vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi +vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi + +vmovdqa %ymm13,(%rsp) + +vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo +vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo +vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo +vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo + +vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo +vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo +vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo +vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo + +vmovdqa _16XQ*2(%rcx),%ymm8 +vpmulhw %ymm8,%ymm13,%ymm13 +vpmulhw %ymm8,%ymm9,%ymm9 +vpmulhw %ymm8,%ymm5,%ymm5 +vpmulhw %ymm8,%ymm10,%ymm10 +vpmulhw %ymm8,%ymm6,%ymm6 +vpmulhw %ymm8,%ymm11,%ymm11 +vpmulhw %ymm8,%ymm7,%ymm7 +vpmulhw %ymm8,%ymm12,%ymm12 + +vpsubw (%rsp),%ymm13,%ymm13 # -a0c0 +vpsubw %ymm9,%ymm1,%ymm9 # a0d0 +vpsubw %ymm5,%ymm14,%ymm5 # b0c0 +vpsubw %ymm10,%ymm2,%ymm10 # b0d0 + +vpsubw %ymm6,%ymm15,%ymm6 # a1c1 +vpsubw %ymm11,%ymm3,%ymm11 # a1d1 +vpsubw %ymm7,%ymm0,%ymm7 # b1c1 
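/* Illustrative sketch (not from the patched sources): the schoolbook macro
 * above vectorizes Kyber's base multiplication of degree-1 pairs modulo
 * X^2 - zeta, 16 pairs per invocation. The scalar equivalent is sketched
 * below following the Kyber round-3 reference code; fqmul/montgomery_reduce
 * are reproduced here as an assumption, they are not part of this hunk. */
#include <stdint.h>

#define KYBER_Q    3329
#define KYBER_QINV (-3327)              /* q^-1 mod 2^16, cf. kyber512r3_consts_avx2.c */

static int16_t montgomery_reduce(int32_t a) {
    int16_t t = (int16_t)a * KYBER_QINV;                 /* a * q^-1 mod 2^16 */
    return (int16_t)((a - (int32_t)t * KYBER_Q) >> 16);
}

static int16_t fqmul(int16_t a, int16_t b) {             /* Montgomery multiplication */
    return montgomery_reduce((int32_t)a * b);
}

static void basemul_ref(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) {
    r[0]  = fqmul(a[1], b[1]);
    r[0]  = fqmul(r[0], zeta);          /* zeta * a1 * b1 */
    r[0] += fqmul(a[0], b[0]);          /* + a0 * b0      */
    r[1]  = fqmul(a[0], b[1]);
    r[1] += fqmul(a[1], b[0]);          /* a0*b1 + a1*b0  */
}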
+vpsubw %ymm12,%ymm4,%ymm12 # b1d1 + +vmovdqa (%r9),%ymm0 +vmovdqa 32(%r9),%ymm1 +vpmullw %ymm0,%ymm10,%ymm2 +vpmullw %ymm0,%ymm12,%ymm3 +vpmulhw %ymm1,%ymm10,%ymm10 +vpmulhw %ymm1,%ymm12,%ymm12 +vpmulhw %ymm8,%ymm2,%ymm2 +vpmulhw %ymm8,%ymm3,%ymm3 +vpsubw %ymm2,%ymm10,%ymm10 # rb0d0 +vpsubw %ymm3,%ymm12,%ymm12 # rb1d1 + +vpaddw %ymm5,%ymm9,%ymm9 +vpaddw %ymm7,%ymm11,%ymm11 +vpsubw %ymm13,%ymm10,%ymm13 +vpsubw %ymm12,%ymm6,%ymm6 + +vmovdqa %ymm13,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(64*\off+16)*2(%rdi) +vmovdqa %ymm6,(64*\off+32)*2(%rdi) +vmovdqa %ymm11,(64*\off+48)*2(%rdi) +.endm + +.text +.global cdecl(basemul_avx2_asm) +cdecl(basemul_avx2_asm): +mov %rsp,%r8 +and $-32,%rsp +sub $32,%rsp + +lea (_ZETAS_EXP+176)*2(%rcx),%r9 +schoolbook 0 + +add $32*2,%r9 +schoolbook 1 + +add $192*2,%r9 +schoolbook 2 + +add $32*2,%r9 +schoolbook 3 + +mov %r8,%rsp +ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.c new file mode 100644 index 0000000000..ef0bb87946 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.c @@ -0,0 +1,104 @@ +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_cbd.h" + +/************************************************* +* Name: load32_littleendian +* +* Description: load 4 bytes into a 32-bit integer +* in little-endian order +* +* Arguments: - const uint8_t *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x +**************************************************/ +static uint32_t load32_littleendian(const uint8_t x[4]) { + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +/************************************************* +* Name: load24_littleendian +* +* Description: load 3 bytes into a 32-bit integer +* in little-endian order +* This function is only needed for Kyber-512 +* +* Arguments: - const uint8_t *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x (most significant byte is zero) +**************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) { + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + + +/************************************************* +* Name: cbd2 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *buf: pointer to input byte array +**************************************************/ +static void cbd2(poly *r, const uint8_t buf[2 * S2N_KYBER_512_R3_N / 4]) { + unsigned int i, j; + + for (i = 0; i < S2N_KYBER_512_R3_N / 8; i++) { + uint32_t t = load32_littleendian(buf + 4 * i); + uint32_t d = t & 0x55555555; + d += (t >> 1) & 0x55555555; + + for (j = 0; j < 8; j++) { + int16_t a = (d >> (4 * j + 0)) & 0x3; + int16_t b = (d >> (4 * j + 2)) & 0x3; + r->coeffs[8 * i + j] = a - b; + } + } +} + +/************************************************* +* Name: cbd3 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=3 +* This function is only needed for Kyber-512 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *buf: 
pointer to input byte array +**************************************************/ +static void cbd3(poly *r, const uint8_t buf[3 * S2N_KYBER_512_R3_N / 4]) { + unsigned int i, j; + + for (i = 0; i < S2N_KYBER_512_R3_N / 4; i++) { + uint32_t t = load24_littleendian(buf + 3 * i); + uint32_t d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) { + int16_t a = (d >> (6 * j + 0)) & 0x7; + int16_t b = (d >> (6 * j + 3)) & 0x7; + r->coeffs[4 * i + j] = a - b; + } + } +} + +void cbd_eta1(poly *r, const uint8_t buf[S2N_KYBER_512_R3_ETA1 * S2N_KYBER_512_R3_N / 4]) { + cbd3(r, buf); +} + +void cbd_eta2(poly *r, const uint8_t buf[S2N_KYBER_512_R3_ETA2 * S2N_KYBER_512_R3_N / 4]) { + cbd2(r, buf); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.h new file mode 100644 index 0000000000..631821956c --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.h @@ -0,0 +1,11 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_poly.h" + +#define cbd_eta1 S2N_KYBER_512_R3_NAMESPACE(cbd_eta1) +void cbd_eta1(poly *r, const uint8_t buf[S2N_KYBER_512_R3_ETA1 * S2N_KYBER_512_R3_N / 4]); + +#define cbd_eta2 S2N_KYBER_512_R3_NAMESPACE(cbd_eta2) +void cbd_eta2(poly *r, const uint8_t buf[S2N_KYBER_512_R3_ETA2 * S2N_KYBER_512_R3_N / 4]); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.c new file mode 100644 index 0000000000..a922bd220f --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.c @@ -0,0 +1,137 @@ +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_cbd_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +/************************************************* +* Name: cbd2 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const __m256i *buf: pointer to aligned input byte array +**************************************************/ +static void cbd2(poly * restrict r, const __m256i buf[2*S2N_KYBER_512_R3_N/128]) +{ + unsigned int i; + __m256i f0, f1, f2, f3; + const __m256i mask55 = _mm256_set1_epi32(0x55555555); + const __m256i mask33 = _mm256_set1_epi32(0x33333333); + const __m256i mask03 = _mm256_set1_epi32(0x03030303); + const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F); + + for(i = 0; i < S2N_KYBER_512_R3_N/64; i++) { + f0 = _mm256_load_si256(&buf[i]); + + f1 = _mm256_srli_epi16(f0, 1); + f0 = _mm256_and_si256(mask55, f0); + f1 = _mm256_and_si256(mask55, f1); + f0 = _mm256_add_epi8(f0, f1); + + f1 = _mm256_srli_epi16(f0, 2); + f0 = _mm256_and_si256(mask33, f0); + f1 = _mm256_and_si256(mask33, f1); + f0 = _mm256_add_epi8(f0, mask33); + f0 = _mm256_sub_epi8(f0, f1); + + f1 = _mm256_srli_epi16(f0, 4); + f0 = _mm256_and_si256(mask0F, f0); + f1 = _mm256_and_si256(mask0F, f1); + f0 = _mm256_sub_epi8(f0, mask03); + f1 = _mm256_sub_epi8(f1, mask03); + + f2 = _mm256_unpacklo_epi8(f0, f1); + f3 = _mm256_unpackhi_epi8(f0, f1); + + f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2)); + f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2,1)); + f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3)); + f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3,1)); + + 
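/* Illustrative sketch (not from the patched sources): the cbd2 routines
 * above (scalar and AVX2) rely on the masked-add trick
 * d = (t & 0x55555555) + ((t >> 1) & 0x55555555), which sums each pair of
 * adjacent bits in place; a coefficient is the difference of two such
 * 2-bit sums. Self-check against a naive per-bit count: */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static int16_t cbd2_trick(uint32_t t, unsigned j) {
    uint32_t d = (t & 0x55555555u) + ((t >> 1) & 0x55555555u);
    int16_t a = (int16_t)((d >> (4 * j + 0)) & 0x3);
    int16_t b = (int16_t)((d >> (4 * j + 2)) & 0x3);
    return (int16_t)(a - b);
}

static int16_t cbd2_naive(uint32_t t, unsigned j) {
    int a = (int)((t >> (4 * j + 0)) & 1) + (int)((t >> (4 * j + 1)) & 1);
    int b = (int)((t >> (4 * j + 2)) & 1) + (int)((t >> (4 * j + 3)) & 1);
    return (int16_t)(a - b);
}

int main(void) {
    uint32_t t = 0xDEADBEEFu;                 /* stand-in for 4 uniformly random bytes */
    for (unsigned j = 0; j < 8; j++) {
        assert(cbd2_trick(t, j) == cbd2_naive(t, j));
        printf("coeff[%u] = %d\n", j, cbd2_trick(t, j));
    }
    return 0;
}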
_mm256_store_si256(&r->vec[4*i+0], f0); + _mm256_store_si256(&r->vec[4*i+1], f2); + _mm256_store_si256(&r->vec[4*i+2], f1); + _mm256_store_si256(&r->vec[4*i+3], f3); + } +} + +/************************************************* +* Name: cbd3 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=3 +* This function is only needed for Kyber-512 +* +* Arguments: - poly *r: pointer to output polynomial +* - const __m256i *buf: pointer to aligned input byte array +**************************************************/ +static void cbd3(poly * restrict r, const uint8_t buf[3*S2N_KYBER_512_R3_N/4+8]) +{ + unsigned int i; + __m256i f0, f1, f2, f3; + const __m256i mask249 = _mm256_set1_epi32(0x249249); + const __m256i mask6DB = _mm256_set1_epi32(0x6DB6DB); + const __m256i mask07 = _mm256_set1_epi32(7); + const __m256i mask70 = _mm256_set1_epi32(7 << 16); + const __m256i mask3 = _mm256_set1_epi16(3); + const __m256i shufbidx = _mm256_set_epi8(-1,15,14,13,-1,12,11,10,-1, 9, 8, 7,-1, 6, 5, 4, + -1,11,10, 9,-1, 8, 7, 6,-1, 5, 4, 3,-1, 2, 1, 0); + + for(i = 0; i < S2N_KYBER_512_R3_N/32; i++) { + // correcting cast-align and cast-qual errors + // old version: f0 = _mm256_loadu_si256((__m256i *)&buf[24*i]); + f0 = _mm256_loadu_si256((const void *)&buf[24*i]); + f0 = _mm256_permute4x64_epi64(f0,0x94); + f0 = _mm256_shuffle_epi8(f0,shufbidx); + + f1 = _mm256_srli_epi32(f0,1); + f2 = _mm256_srli_epi32(f0,2); + f0 = _mm256_and_si256(mask249,f0); + f1 = _mm256_and_si256(mask249,f1); + f2 = _mm256_and_si256(mask249,f2); + f0 = _mm256_add_epi32(f0,f1); + f0 = _mm256_add_epi32(f0,f2); + + f1 = _mm256_srli_epi32(f0,3); + f0 = _mm256_add_epi32(f0,mask6DB); + f0 = _mm256_sub_epi32(f0,f1); + + f1 = _mm256_slli_epi32(f0,10); + f2 = _mm256_srli_epi32(f0,12); + f3 = _mm256_srli_epi32(f0, 2); + f0 = _mm256_and_si256(f0,mask07); + f1 = _mm256_and_si256(f1,mask70); + f2 = _mm256_and_si256(f2,mask07); + f3 = _mm256_and_si256(f3,mask70); + f0 = _mm256_add_epi16(f0,f1); + f1 = _mm256_add_epi16(f2,f3); + f0 = _mm256_sub_epi16(f0,mask3); + f1 = _mm256_sub_epi16(f1,mask3); + + f2 = _mm256_unpacklo_epi32(f0,f1); + f3 = _mm256_unpackhi_epi32(f0,f1); + + f0 = _mm256_permute2x128_si256(f2,f3,0x20); + f1 = _mm256_permute2x128_si256(f2,f3,0x31); + + _mm256_store_si256(&r->vec[2*i+0], f0); + _mm256_store_si256(&r->vec[2*i+1], f1); + } +} + +/* buf 32 bytes longer for cbd3 */ +void poly_cbd_eta1_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/128+1]) +{ + // correcting cast-align and cast-qual errors + // old version: cbd3(r, (uint8_t *)buf); + cbd3(r, (const void *)buf); +} + +void poly_cbd_eta2_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA2*S2N_KYBER_512_R3_N/128]) +{ + cbd2(r, buf); +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.h new file mode 100644 index 0000000000..972c71fbf5 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.h @@ -0,0 +1,15 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_poly_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +#define poly_cbd_eta1_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_cbd_eta1_avx2) +void poly_cbd_eta1_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/128+1]); + +#define poly_cbd_eta2_avx2 
S2N_KYBER_512_R3_NAMESPACE(poly_cbd_eta2_avx2) +void poly_cbd_eta2_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA2*S2N_KYBER_512_R3_N/128]); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.c new file mode 100644 index 0000000000..cdc0b817df --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.c @@ -0,0 +1,122 @@ +#include "kyber512r3_align_avx2.h" +#include "kyber512r3_consts_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#define Q S2N_KYBER_512_R3_Q +#define MONT -1044 // 2^16 mod q +#define QINV -3327 // q^-1 mod 2^16 +#define V 20159 // floor(2^26/q + 0.5) +#define FHI 1441 // mont^2/128 +#define FLO -10079 // qinv*FHI +#define MONTSQHI 1353 // mont^2 +#define MONTSQLO 20553 // qinv*MONTSQHI +#define MASK 4095 +#define SHIFT 32 + +const qdata_t qdata = {{ +#define _16XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, + +#define _16XQINV 16 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +#define _16XV 32 + V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, + +#define _16XFLO 48 + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + +#define _16XFHI 64 + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + +#define _16XMONTSQLO 80 + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + +#define _16XMONTSQHI 96 + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + +#define _16XMASK 112 + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + +#define _REVIDXB 128 + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, + +#define _REVIDXD 144 + 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0, + +#define _ZETAS_EXP 160 + 31498, 31498, 31498, 31498, -758, -758, -758, -758, + 5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + -359, -359, -359, -359, -359, -359, -359, -359, + -359, -359, -359, -359, -359, -359, -359, -359, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + -12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + -20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758, + -3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690, + -171, -171, -171, -171, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + -5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057, + 5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242, + 573, 573, -1325, -1325, 264, 264, 383, 383, + -829, -829, 1458, 1458, -1602, -1602, -130, -130, + -5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080, + -12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, + 1223, 652, -552, 1015, -1293, 1491, -282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -335, -11477, -32227, 20494, -27738, 945, -14883, 6182, + 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276, + -1103, 555, -1251, 1550, 422, 177, -291, 1574, + -246, 1159, -777, -602, -1590, -872, 418, -156, + 
11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, + -32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619, + 430, 843, 871, 105, 587, -235, -460, 1653, + 778, -147, 1483, 1119, 644, 349, 329, -75, + 787, 787, 787, 787, 787, 787, 787, 787, + 787, 787, 787, 787, 787, 787, 787, 787, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, + -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, + 287, 287, 287, 287, 287, 287, 287, 287, + 202, 202, 202, 202, 202, 202, 202, 202, + 10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358, + -11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164, + 962, 962, 962, 962, -1202, -1202, -1202, -1202, + -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, + -28073, -28073, 24313, 24313, -10532, -10532, 8800, 8800, + 18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163, + -681, -681, 1017, 1017, 732, 732, 608, 608, + -1542, -1542, 411, 411, -205, -205, -1571, -1571, + 19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249, + 13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989, + 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, + 817, 603, 1322, -1465, -1215, 1218, -874, -1187, + -1185, -1278, -1510, -870, -108, 996, 958, 1522, + 20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469, + -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, + 1097, 610, -1285, 384, -136, -1335, 220, -1659, + -1530, 794, -854, 478, -308, 991, -1460, 1628, + +#define _16XSHIFT 624 + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT +}}; +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.h new file mode 100644 index 0000000000..1983ba44d6 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.h @@ -0,0 +1,43 @@ +#pragma once + +#include "kyber512r3_params.h" + +#define _16XQ 0 +#define _16XQINV 16 +#define _16XV 32 +#define _16XFLO 48 +#define _16XFHI 64 +#define _16XMONTSQLO 80 +#define _16XMONTSQHI 96 +#define _16XMASK 112 +#define _REVIDXB 128 +#define _REVIDXD 144 +#define _ZETAS_EXP 160 +#define _16XSHIFT 624 + +/* The C ABI on MacOS exports all symbols with a leading + * underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found. 
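/* Illustrative sketch (not from the patched sources): verify the relations
 * stated in the comments of kyber512r3_consts_avx2.c above for q = 3329:
 * MONT = 2^16 mod q (signed 16-bit representative), QINV = q^-1 mod 2^16,
 * and V = floor(2^26/q + 0.5). */
#include <assert.h>
#include <stdint.h>

int main(void) {
    const int32_t q = 3329;
    const int32_t mont = -1044, v = 20159;
    const int16_t qinv = -3327;

    assert(mont + q == (1 << 16) % q);               /* both sides are 2285       */
    assert((uint16_t)((int32_t)qinv * q) == 1u);     /* q * q^-1 == 1 (mod 2^16)  */
    assert(v == ((1 << 26) + q / 2) / q);            /* rounded quotient 2^26 / q */
    return 0;
}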
+ * + * This define helps us get around this + */ +#ifdef __ASSEMBLER__ +#if defined(__WIN32__) || defined(__APPLE__) +#define decorate(s) _##s +#define cdecl2(s) decorate(s) +#define cdecl(s) cdecl2(S2N_KYBER_512_R3_NAMESPACE(##s)) +#else +#define cdecl(s) S2N_KYBER_512_R3_NAMESPACE(##s) +#endif +#endif + + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#ifndef __ASSEMBLER__ +#include "kyber512r3_align_avx2.h" +typedef ALIGNED_INT16(640) qdata_t; +#define qdata S2N_KYBER_512_R3_NAMESPACE(qdata) +extern const qdata_t qdata; +#endif +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.c index 8289a526b3..c5ce0c91f2 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.c +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.c @@ -7,7 +7,9 @@ #include <stddef.h> #include <stdint.h> -#include "fips202.h" + +#include "kyber512r3_params.h" +#include "kyber512r3_fips202.h" #define NROUNDS 24 #define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset)))) @@ -24,7 +26,7 @@ static uint64_t load64(const uint8_t *x) { uint64_t r = 0; for (size_t i = 0; i < 8; ++i) { - r |= (uint64_t) x[i] << 8 * i; + r |= (uint64_t)x[i] << 8 * i; } return r; @@ -46,18 +48,19 @@ static void store64(uint8_t *x, uint64_t u) { /* Keccak round constants */ static const uint64_t KeccakF_RoundConstants[NROUNDS] = { - 0x0000000000000001ULL, 0x0000000000008082ULL, - 0x800000000000808aULL, 0x8000000080008000ULL, - 0x000000000000808bULL, 0x0000000080000001ULL, - 0x8000000080008081ULL, 0x8000000000008009ULL, - 0x000000000000008aULL, 0x0000000000000088ULL, - 0x0000000080008009ULL, 0x000000008000000aULL, - 0x000000008000808bULL, 0x800000000000008bULL, - 0x8000000000008089ULL, 0x8000000000008003ULL, - 0x8000000000008002ULL, 0x8000000000000080ULL, - 0x000000000000800aULL, 0x800000008000000aULL, - 0x8000000080008081ULL, 0x8000000000008080ULL, - 0x0000000080000001ULL, 0x8000000080008008ULL}; + 0x0000000000000001ULL, 0x0000000000008082ULL, + 0x800000000000808aULL, 0x8000000080008000ULL, + 0x000000000000808bULL, 0x0000000080000001ULL, + 0x8000000080008081ULL, 0x8000000000008009ULL, + 0x000000000000008aULL, 0x0000000000000088ULL, + 0x0000000080008009ULL, 0x000000008000000aULL, + 0x000000008000808bULL, 0x800000000000008bULL, + 0x8000000000008089ULL, 0x8000000000008003ULL, + 0x8000000000008002ULL, 0x8000000000000080ULL, + 0x000000000000800aULL, 0x800000008000000aULL, + 0x8000000080008081ULL, 0x8000000000008080ULL, + 0x0000000080000001ULL, 0x8000000080008008ULL, +}; /************************************************* * Name: KeccakF1600_StatePermute @@ -74,9 +77,8 @@ static void KeccakF1600_StatePermute(uint64_t *state) { uint64_t Aka, Ake, Aki, Ako, Aku; uint64_t Ama, Ame, Ami, Amo, Amu; uint64_t Asa, Ase, Asi, Aso, Asu; - uint64_t BCa, BCe, BCi, BCo, BCu; - // copyFromState(A, state) + /* copyFromState(A, state) */ Aba = state[0]; Abe = state[1]; Abi = state[2]; @@ -104,6 +106,7 @@ static void KeccakF1600_StatePermute(uint64_t *state) { Asu = state[24]; for (round = 0; round < NROUNDS; round += 2) { + uint64_t BCa, BCe, BCi, BCo, BCu; uint64_t Da, De, Di, Do, Du; uint64_t Eba, Ebe, Ebi, Ebo, Ebu; uint64_t Ega, Ege, Egi, Ego, Egu; @@ -111,14 +114,14 @@ static void KeccakF1600_StatePermute(uint64_t *state) { uint64_t Ema, Eme, Emi, Emo, Emu; uint64_t Esa, Ese, Esi, Eso, Esu; - // prepareTheta + /* prepareTheta */ BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa; BCe = Abe ^ Age ^ Ake ^ Ame ^ Ase; BCi = Abi ^ Agi ^ Aki ^ Ami ^ Asi; BCo = Abo 
^ Ago ^ Ako ^ Amo ^ Aso; BCu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - // thetaRhoPiChiIotaPrepareTheta(round , A, E) + /* thetaRhoPiChiIotaPrepareTheta(round , A, E) */ Da = BCu ^ ROL(BCe, 1); De = BCa ^ ROL(BCi, 1); Di = BCe ^ ROL(BCo, 1); @@ -206,14 +209,14 @@ static void KeccakF1600_StatePermute(uint64_t *state) { Eso = BCo ^ ((~BCu) & BCa); Esu = BCu ^ ((~BCa) & BCe); - // prepareTheta + /* prepareTheta */ BCa = Eba ^ Ega ^ Eka ^ Ema ^ Esa; BCe = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; BCi = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; BCo = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; BCu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - // thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + /* thetaRhoPiChiIotaPrepareTheta(round+1, E, A) */ Da = BCu ^ ROL(BCe, 1); De = BCa ^ ROL(BCi, 1); Di = BCe ^ ROL(BCo, 1); @@ -302,7 +305,7 @@ static void KeccakF1600_StatePermute(uint64_t *state) { Asu = BCu ^ ((~BCa) & BCe); } - // copyToState(state, A) + /* copyToState(state, A) */ state[0] = Aba; state[1] = Abe; state[2] = Abi; @@ -400,6 +403,37 @@ static void keccak_squeezeblocks(uint8_t *h, size_t nblocks, uint64_t *s, uint32 } /************************************************* + * Name: shake128_absorb + * + * Description: Absorb step of the SHAKE128 XOF. + * non-incremental, starts by zeroeing the state. + * + * Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state + * - const uint8_t *input: pointer to input to be absorbed + * into s + * - size_t inlen: length of input in bytes + **************************************************/ +void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen) { + keccak_absorb(state->ctx, S2N_KYBER_512_R3_SHAKE128_RATE, input, inlen, 0x1F); +} + +/************************************************* + * Name: shake128_squeezeblocks + * + * Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of + * SHAKE128_RATE bytes each. Modifies the state. Can be called + * multiple times to keep squeezing, i.e., is incremental. + * + * Arguments: - uint8_t *output: pointer to output blocks + * - size_t nblocks: number of blocks to be squeezed + * (written to output) + * - shake128ctx *state: pointer to input/output Keccak state + **************************************************/ +void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state) { + keccak_squeezeblocks(output, nblocks, state->ctx, S2N_KYBER_512_R3_SHAKE128_RATE); +} + +/************************************************* * Name: shake256_absorb * * Description: Absorb step of the SHAKE256 XOF. 
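/* Illustrative sketch (not from the patched sources): typical use of the
 * shake128_absorb / shake128_squeezeblocks pair exported above - absorb the
 * input once, then squeeze any number of full 168-byte blocks. This is the
 * pattern gen_matrix follows later in this change. */
#include <stdint.h>
#include "kyber512r3_fips202.h"

void squeeze_two_blocks(uint8_t out[2 * S2N_KYBER_512_R3_SHAKE128_RATE],
                        const uint8_t seed[32]) {
    shake128ctx st;
    shake128_absorb(&st, seed, 32);         /* non-incremental: starts from a zeroed state */
    shake128_squeezeblocks(out, 2, &st);    /* 2 * 168 = 336 output bytes */
}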
@@ -410,8 +444,8 @@ static void keccak_squeezeblocks(uint8_t *h, size_t nblocks, uint64_t *s, uint32 * into s * - size_t inlen: length of input in bytes **************************************************/ -static void shake256_absorb(shake256_ctx *state, const uint8_t *input, size_t inlen) { - keccak_absorb(state->ctx, SHAKE256_RATE, input, inlen, 0x1F); +void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen) { + keccak_absorb(state->ctx, S2N_KYBER_512_R3_SHAKE256_RATE, input, inlen, 0x1F); } /************************************************* @@ -426,8 +460,8 @@ static void shake256_absorb(shake256_ctx *state, const uint8_t *input, size_t in * (written to output) * - shake256ctx *state: pointer to input/output Keccak state **************************************************/ -static void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256_ctx *state) { - keccak_squeezeblocks(output, nblocks, state->ctx, SHAKE256_RATE); +void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state) { + keccak_squeezeblocks(output, nblocks, state->ctx, S2N_KYBER_512_R3_SHAKE256_RATE); } /************************************************* @@ -441,15 +475,15 @@ static void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256_ctx * - size_t inlen: length of input in bytes **************************************************/ void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen) { - size_t nblocks = outlen / SHAKE256_RATE; - uint8_t t[SHAKE256_RATE]; - shake256_ctx s; + size_t nblocks = outlen / S2N_KYBER_512_R3_SHAKE256_RATE; + uint8_t t[S2N_KYBER_512_R3_SHAKE256_RATE]; + shake256ctx s; shake256_absorb(&s, input, inlen); shake256_squeezeblocks(output, nblocks, &s); - output += nblocks * SHAKE256_RATE; - outlen -= nblocks * SHAKE256_RATE; + output += nblocks * S2N_KYBER_512_R3_SHAKE256_RATE; + outlen -= nblocks * S2N_KYBER_512_R3_SHAKE256_RATE; if (outlen) { shake256_squeezeblocks(t, 1, &s); @@ -459,3 +493,50 @@ void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen } } +/************************************************* + * Name: sha3_256 + * + * Description: SHA3-256 with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - const uint8_t *input: pointer to input + * - size_t inlen: length of input in bytes + **************************************************/ +void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen) { + uint64_t s[25]; + uint8_t t[S2N_KYBER_512_R3_SHA3_256_RATE]; + + /* Absorb input */ + keccak_absorb(s, S2N_KYBER_512_R3_SHA3_256_RATE, input, inlen, 0x06); + + /* Squeeze output */ + keccak_squeezeblocks(t, 1, s, S2N_KYBER_512_R3_SHA3_256_RATE); + + for (size_t i = 0; i < 32; i++) { + output[i] = t[i]; + } +} + +/************************************************* + * Name: sha3_512 + * + * Description: SHA3-512 with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - const uint8_t *input: pointer to input + * - size_t inlen: length of input in bytes + **************************************************/ +void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen) { + uint64_t s[25]; + uint8_t t[S2N_KYBER_512_R3_SHA3_512_RATE]; + + /* Absorb input */ + keccak_absorb(s, S2N_KYBER_512_R3_SHA3_512_RATE, input, inlen, 0x06); + + /* Squeeze output */ + keccak_squeezeblocks(t, 1, s, S2N_KYBER_512_R3_SHA3_512_RATE); + + for (size_t i = 0; i < 64; i++) { + output[i] = t[i]; + } +} diff --git 
a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.h new file mode 100644 index 0000000000..1f4f395f72 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.h @@ -0,0 +1,68 @@ +#pragma once + +#include <stddef.h> +#include <stdint.h> +#include "kyber512r3_params.h" + +#define S2N_KYBER_512_R3_SHAKE128_RATE 168 +#define S2N_KYBER_512_R3_SHAKE256_RATE 136 +#define S2N_KYBER_512_R3_SHA3_256_RATE 136 +#define S2N_KYBER_512_R3_SHA3_384_RATE 104 +#define S2N_KYBER_512_R3_SHA3_512_RATE 72 + +#define S2N_KYBER_512_R3_PQC_SHAKECTX_SIZE 25 + +/* Context for non-incremental API */ +#define shake128ctx S2N_KYBER_512_R3_NAMESPACE(shake128ctx) +typedef struct { + uint64_t ctx[S2N_KYBER_512_R3_PQC_SHAKECTX_SIZE]; +} shake128ctx; + +/* Context for non-incremental API */ +#define shake256ctx S2N_KYBER_512_R3_NAMESPACE(shake256ctx) +typedef struct { + uint64_t ctx[S2N_KYBER_512_R3_PQC_SHAKECTX_SIZE]; +} shake256ctx; + +/* Initialize the state and absorb the provided input. + * + * This function does not support being called multiple times + * with the same state. + */ +#define shake128_absorb S2N_KYBER_512_R3_NAMESPACE(shake128_absorb) +void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen); +/* Squeeze output out of the sponge. + * + * Supports being called multiple times + */ +#define shake128_squeezeblocks S2N_KYBER_512_R3_NAMESPACE(shake128_squeezeblocks) +void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state); + +/* Copy the state. */ +#define shake128_ctx_clone S2N_KYBER_512_R3_NAMESPACE(shake128_ctx_clone) +void shake128_ctx_clone(shake128ctx *dest, const shake128ctx *src); + +/* Initialize the state and absorb the provided input. + * + * This function does not support being called multiple times + * with the same state. + */ +#define shake256_absorb S2N_KYBER_512_R3_NAMESPACE(shake256_absorb) +void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen); +/* Squeeze output out of the sponge. 
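/* Illustrative sketch (not from the patched sources): the rate constants
 * defined above all follow the Keccak rule rate = 200 - 2 * (security
 * strength in bytes), i.e. the 200-byte state minus the capacity. */
#include <assert.h>
#include "kyber512r3_fips202.h"

int main(void) {
    assert(S2N_KYBER_512_R3_SHAKE128_RATE == 200 - 2 * 16);   /* 168 */
    assert(S2N_KYBER_512_R3_SHAKE256_RATE == 200 - 2 * 32);   /* 136 */
    assert(S2N_KYBER_512_R3_SHA3_256_RATE == 200 - 2 * 32);   /* 136 */
    assert(S2N_KYBER_512_R3_SHA3_384_RATE == 200 - 2 * 48);   /* 104 */
    assert(S2N_KYBER_512_R3_SHA3_512_RATE == 200 - 2 * 64);   /*  72 */
    return 0;
}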
+ * + * Supports being called multiple times + */ +#define shake256_squeezeblocks S2N_KYBER_512_R3_NAMESPACE(shake256_squeezeblocks) +void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state); + +/* One-stop SHAKE256 call */ +#define shake256 S2N_KYBER_512_R3_NAMESPACE(shake256) +void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen); + +#define sha3_256 S2N_KYBER_512_R3_NAMESPACE(sha3_256) +void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen); + +/* One-stop SHA3-512 shop */ +#define sha3_512 S2N_KYBER_512_R3_NAMESPACE(sha3_512) +void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.c new file mode 100644 index 0000000000..5f07fb44a3 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.c @@ -0,0 +1,210 @@ +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include "kyber512r3_fips202.h" +#include "kyber512r3_fips202x4_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +#define KeccakF1600_StatePermute4x S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds) +extern void KeccakF1600_StatePermute4x(__m256i *s); + +/* Implementation is used from Crystal Kyber Repository + * See for more details: https://github.com/XKCP/XKCP */ + +static void keccakx4_absorb_once(__m256i s[25], + unsigned int r, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen, + uint8_t p) +{ + size_t i; + uint64_t pos = 0; + __m256i t, idx; + + for(i = 0; i < 25; ++i) + s[i] = _mm256_setzero_si256(); + + idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); + while(inlen >= r) { + for(i = 0; i < r/8; ++i) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + pos += 8; + } + inlen -= r; + + KeccakF1600_StatePermute4x(s); + } + + for(i = 0; i < inlen/8; ++i) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + pos += 8; + } + inlen -= 8*i; + + if(inlen) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + idx = _mm256_set1_epi64x((1ULL << (8*inlen)) - 1); + t = _mm256_and_si256(t, idx); + s[i] = _mm256_xor_si256(s[i], t); + } + + t = _mm256_set1_epi64x((uint64_t)p << 8*inlen); + s[i] = _mm256_xor_si256(s[i], t); + t = _mm256_set1_epi64x(1ULL << 63); + s[r/8 - 1] = _mm256_xor_si256(s[r/8 - 1], t); +} + +static void keccakx4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + unsigned int r, + __m256i s[25]) +{ + unsigned int i; + __m128d t; + + while(nblocks > 0) { + KeccakF1600_StatePermute4x(s); + for(i=0; i < r/8; ++i) { + t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); + // correcting cast-align errors + // old version: _mm_storel_pd((__attribute__((__may_alias__)) double *)&out0[8*i], t); + _mm_storel_pd((__attribute__((__may_alias__)) void *)&out0[8*i], t); + // old version: _mm_storeh_pd((__attribute__((__may_alias__)) double *)&out1[8*i], t); + _mm_storeh_pd((__attribute__((__may_alias__)) void *)&out1[8*i], t); + t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i],1)); + // old version: _mm_storel_pd((__attribute__((__may_alias__)) double *)&out2[8*i], t); + _mm_storel_pd((__attribute__((__may_alias__)) void *)&out2[8*i], t); + // old version: 
_mm_storeh_pd((__attribute__((__may_alias__)) double *)&out3[8*i], t); + _mm_storeh_pd((__attribute__((__may_alias__)) void *)&out3[8*i], t); + } + + out0 += r; + out1 += r; + out2 += r; + out3 += r; + --nblocks; + } +} + +void shake128x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) +{ + keccakx4_absorb_once(state->s, S2N_KYBER_512_R3_SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); +} + +void shake128x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state) +{ + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, S2N_KYBER_512_R3_SHAKE128_RATE, state->s); +} + +void shake256x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) +{ + keccakx4_absorb_once(state->s, S2N_KYBER_512_R3_SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); +} + +void shake256x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state) +{ + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, S2N_KYBER_512_R3_SHAKE256_RATE, state->s); +} + +void shake128x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) +{ + unsigned int i; + size_t nblocks = outlen/S2N_KYBER_512_R3_SHAKE128_RATE; + uint8_t t[4][S2N_KYBER_512_R3_SHAKE128_RATE]; + keccakx4_state state; + + shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); + shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); + + out0 += nblocks*S2N_KYBER_512_R3_SHAKE128_RATE; + out1 += nblocks*S2N_KYBER_512_R3_SHAKE128_RATE; + out2 += nblocks*S2N_KYBER_512_R3_SHAKE128_RATE; + out3 += nblocks*S2N_KYBER_512_R3_SHAKE128_RATE; + outlen -= nblocks*S2N_KYBER_512_R3_SHAKE128_RATE; + + if(outlen) { + shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for(i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; + } + } +} + +void shake256x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) +{ + unsigned int i; + size_t nblocks = outlen/S2N_KYBER_512_R3_SHAKE256_RATE; + uint8_t t[4][S2N_KYBER_512_R3_SHAKE256_RATE]; + keccakx4_state state; + + shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); + shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); + + out0 += nblocks*S2N_KYBER_512_R3_SHAKE256_RATE; + out1 += nblocks*S2N_KYBER_512_R3_SHAKE256_RATE; + out2 += nblocks*S2N_KYBER_512_R3_SHAKE256_RATE; + out3 += nblocks*S2N_KYBER_512_R3_SHAKE256_RATE; + outlen -= nblocks*S2N_KYBER_512_R3_SHAKE256_RATE; + + if(outlen) { + shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for(i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; + } + } +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.h new file mode 100644 index 0000000000..8c4896724c --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.h @@ -0,0 +1,70 @@ +#pragma once + +#include <stddef.h> +#include <stdint.h> +#include 
"kyber512r3_params.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +#define keccakx4_state S2N_KYBER_512_R3_NAMESPACE(keccakx4_state) +typedef struct { + __m256i s[25]; +} keccakx4_state; + +#define shake128x4_absorb_once S2N_KYBER_512_R3_NAMESPACE(shake128x4_absorb_once) +void shake128x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); + +#define shake128x4_squeezeblocks S2N_KYBER_512_R3_NAMESPACE(shake128x4_squeezeblocks) +void shake128x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state); + +#define shake256x4_absorb_once S2N_KYBER_512_R3_NAMESPACE(shake256x4_absorb_once) +void shake256x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); + +#define shake256x4_squeezeblocks S2N_KYBER_512_R3_NAMESPACE(shake256x4_squeezeblocks) +void shake256x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state); + +#define shake128x4 S2N_KYBER_512_R3_NAMESPACE(shake128x4) +void shake128x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); + +#define shake256x4 S2N_KYBER_512_R3_NAMESPACE(shake256x4) +void shake256x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fq_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fq_avx2.S new file mode 100644 index 0000000000..3492489a67 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fq_avx2.S @@ -0,0 +1,122 @@ +#include "kyber512r3_consts_avx2.h" + +// The small macros (.inc files) are combined with .S files directly +/*****.include "fq.inc"*****/ +/***************************/ +.macro red16 r,rs=0,x=12 +vpmulhw %ymm1,%ymm\r,%ymm\x +.if \rs +vpmulhrsw %ymm\rs,%ymm\x,%ymm\x +.else +vpsraw $10,%ymm\x,%ymm\x +.endif +vpmullw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro csubq r,x=12 +vpsubw %ymm0,%ymm\r,%ymm\r +vpsraw $15,%ymm\r,%ymm\x +vpand %ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro caddq r,x=12 +vpsraw $15,%ymm\r,%ymm\x +vpand %ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro fqmulprecomp al,ah,b,x=12 +vpmullw %ymm\al,%ymm\b,%ymm\x +vpmulhw %ymm\ah,%ymm\b,%ymm\b +vpmulhw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\b,%ymm\b +.endm +/***************************/ + +.text +reduce128_avx: +#load +vmovdqa (%rdi),%ymm2 +vmovdqa 32(%rdi),%ymm3 +vmovdqa 64(%rdi),%ymm4 +vmovdqa 96(%rdi),%ymm5 +vmovdqa 128(%rdi),%ymm6 +vmovdqa 160(%rdi),%ymm7 +vmovdqa 192(%rdi),%ymm8 +vmovdqa 224(%rdi),%ymm9 + +red16 2 +red16 3 +red16 4 +red16 5 +red16 6 +red16 7 +red16 8 +red16 9 + +#store +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm3,32(%rdi) +vmovdqa %ymm4,64(%rdi) +vmovdqa %ymm5,96(%rdi) +vmovdqa %ymm6,128(%rdi) +vmovdqa %ymm7,160(%rdi) +vmovdqa %ymm8,192(%rdi) +vmovdqa %ymm9,224(%rdi) + +ret + +.global cdecl(reduce_avx2_asm) +cdecl(reduce_avx2_asm): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XV*2(%rsi),%ymm1 +call reduce128_avx +add $256,%rdi +call reduce128_avx +ret + +tomont128_avx: +#load +vmovdqa 
(%rdi),%ymm3 +vmovdqa 32(%rdi),%ymm4 +vmovdqa 64(%rdi),%ymm5 +vmovdqa 96(%rdi),%ymm6 +vmovdqa 128(%rdi),%ymm7 +vmovdqa 160(%rdi),%ymm8 +vmovdqa 192(%rdi),%ymm9 +vmovdqa 224(%rdi),%ymm10 + +fqmulprecomp 1,2,3,11 +fqmulprecomp 1,2,4,12 +fqmulprecomp 1,2,5,13 +fqmulprecomp 1,2,6,14 +fqmulprecomp 1,2,7,15 +fqmulprecomp 1,2,8,11 +fqmulprecomp 1,2,9,12 +fqmulprecomp 1,2,10,13 + +#store +vmovdqa %ymm3,(%rdi) +vmovdqa %ymm4,32(%rdi) +vmovdqa %ymm5,64(%rdi) +vmovdqa %ymm6,96(%rdi) +vmovdqa %ymm7,128(%rdi) +vmovdqa %ymm8,160(%rdi) +vmovdqa %ymm9,192(%rdi) +vmovdqa %ymm10,224(%rdi) + +ret + +.global cdecl(tomont_avx2_asm) +cdecl(tomont_avx2_asm): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 +vmovdqa _16XMONTSQHI*2(%rsi),%ymm2 +call tomont128_avx +add $256,%rdi +call tomont128_avx +ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.c new file mode 100644 index 0000000000..ace1783448 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.c @@ -0,0 +1,323 @@ +#include <stddef.h> +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_indcpa.h" +#include "kyber512r3_poly.h" +#include "kyber512r3_polyvec.h" +#include "kyber512r3_fips202.h" +#include "kyber512r3_symmetric.h" +#include "pq-crypto/s2n_pq_random.h" +#include "utils/s2n_safety.h" + +/************************************************* +* Name: pack_pk +* +* Description: Serialize the public key as concatenation of the +* serialized vector of polynomials pk +* and the public seed used to generate the matrix A. +* +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec +* const uint8_t *seed: pointer to the input public seed +**************************************************/ +static void pack_pk(uint8_t r[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], polyvec *pk, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES]) { + polyvec_tobytes(r, pk); + for (size_t i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) { + r[i + S2N_KYBER_512_R3_POLYVECBYTES] = seed[i]; + } +} + +/************************************************* +* Name: unpack_pk +* +* Description: De-serialize public key from a byte array; +* approximate inverse of pack_pk +* +* Arguments: - polyvec *pk: pointer to output public-key +* polynomial vector +* - uint8_t *seed: pointer to output seed to generate +* matrix A +* - const uint8_t *packedpk: pointer to input serialized public key +**************************************************/ +static void unpack_pk(polyvec *pk, uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], const uint8_t packedpk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES]) { + polyvec_frombytes(pk, packedpk); + for (size_t i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) { + seed[i] = packedpk[i + S2N_KYBER_512_R3_POLYVECBYTES]; + } +} + +/************************************************* +* Name: pack_sk +* +* Description: Serialize the secret key +* +* Arguments: - uint8_t *r: pointer to output serialized secret key +* - polyvec *sk: pointer to input vector of polynomials (secret key) +**************************************************/ +static void pack_sk(uint8_t r[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES], polyvec *sk) { + polyvec_tobytes(r, sk); +} + +/************************************************* +* Name: unpack_sk +* +* Description: De-serialize the secret key; +* inverse of pack_sk +* +* Arguments: - polyvec *sk: pointer to output vector of +* polynomials 
(secret key) +* - const uint8_t *packedsk: pointer to input serialized secret key +**************************************************/ +static void unpack_sk(polyvec *sk, const uint8_t packedsk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) { + polyvec_frombytes(sk, packedsk); +} + +/************************************************* +* Name: pack_ciphertext +* +* Description: Serialize the ciphertext as concatenation of the +* compressed and serialized vector of polynomials b +* and the compressed and serialized polynomial v +* +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v +**************************************************/ +static void pack_ciphertext(uint8_t r[S2N_KYBER_512_R3_INDCPA_BYTES], polyvec *b, poly *v) { + polyvec_compress(r, b); + poly_compress(r + S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES, v); +} + +/************************************************* +* Name: unpack_ciphertext +* +* Description: De-serialize and decompress ciphertext from a byte array; +* approximate inverse of pack_ciphertext +* +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v +* - const uint8_t *c: pointer to the input serialized ciphertext +**************************************************/ +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES]) { + polyvec_decompress(b, c); + poly_decompress(v, c + S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES); +} + +/************************************************* +* Name: rej_uniform +* +* Description: Run rejection sampling on uniform random bytes to generate +* uniform random integers mod q +* +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes +* +* Returns number of sampled 16-bit integers (at most len) +**************************************************/ +static unsigned int rej_uniform(int16_t *r, unsigned int len, const uint8_t *buf, unsigned int buflen) { + unsigned int ctr, pos; + + ctr = pos = 0; + while (ctr < len && pos + 3 <= buflen) { + uint16_t val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + uint16_t val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; + + if (val0 < S2N_KYBER_512_R3_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < S2N_KYBER_512_R3_Q) { + r[ctr++] = val1; + } + } + + return ctr; +} + +/************************************************* +* Name: gen_matrix +* +* Description: Deterministically generate matrix A (or the transpose of A) +* from a seed. Entries of the matrix are polynomials that look +* uniformly random. 
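/* Illustrative sketch (not from the patched sources): rej_uniform above
 * unpacks every 3 input bytes into two 12-bit candidates and keeps only
 * those below q = 3329, so each candidate survives with probability
 * 3329/4096, roughly 0.813. Worked example for one 3-byte group: */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint16_t q = 3329;
    const uint8_t buf[3] = { 0x12, 0xF4, 0xD6 };

    uint16_t val0 = (uint16_t)(((buf[0] >> 0) | ((uint16_t)buf[1] << 8)) & 0xFFF);
    uint16_t val1 = (uint16_t)(((buf[1] >> 4) | ((uint16_t)buf[2] << 4)) & 0xFFF);

    printf("val0 = %u -> %s\n", val0, val0 < q ? "accepted" : "rejected");  /* 1042, accepted */
    printf("val1 = %u -> %s\n", val1, val1 < q ? "accepted" : "rejected");  /* 3439, rejected */
    return 0;
}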
Performs rejection sampling on output of +* a XOF +* +* Arguments: - polyvec *a: pointer to ouptput matrix A +* - const uint8_t *seed: pointer to input seed +* - int transposed: boolean deciding whether A or A^T +* is generated +**************************************************/ +#define XOF_BLOCKBYTES 168 +#define GEN_MATRIX_NBLOCKS ((12*S2N_KYBER_512_R3_N/8*(1 << 12)/S2N_KYBER_512_R3_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +static void gen_matrix(polyvec *a, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], int transposed) { + unsigned int ctr, buflen, off; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; + xof_state state; + + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + for (unsigned int j = 0; j < S2N_KYBER_512_R3_K; j++) { + if (transposed) { + kyber_shake128_absorb(&state, seed, i, j); + } else { + kyber_shake128_absorb(&state, seed, j, i); + } + + shake128_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + ctr = rej_uniform(a[i].vec[j].coeffs, S2N_KYBER_512_R3_N, buf, buflen); + + while (ctr < S2N_KYBER_512_R3_N) { + off = buflen % 3; + for (unsigned int k = 0; k < off; k++) { + buf[k] = buf[buflen - off + k]; + } + shake128_squeezeblocks(buf + off, 1, &state); + buflen = off + XOF_BLOCKBYTES; + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, S2N_KYBER_512_R3_N - ctr, buf, buflen); + } + } + } +} + +/************************************************* +* Name: indcpa_keypair +* +* Description: Generates public and private key for the CPA-secure +* public-key encryption scheme underlying Kyber +* +* Arguments: - uint8_t *pk: pointer to output public key +* (of length S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key +* (of length S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES bytes) +* +* Returns: 0 on success +* !0 on failure +**************************************************/ +int indcpa_keypair(uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) { + uint8_t buf[2 * S2N_KYBER_512_R3_SYMBYTES]; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + S2N_KYBER_512_R3_SYMBYTES; + uint8_t nonce = 0; + polyvec a[S2N_KYBER_512_R3_K], e, pkpv, skpv; + + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, S2N_KYBER_512_R3_SYMBYTES)); + sha3_512(buf, buf, S2N_KYBER_512_R3_SYMBYTES); + + gen_matrix(a, publicseed, 0); + + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++); + } + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++); + } + + polyvec_ntt(&skpv); + polyvec_ntt(&e); + + //* matrix-vector multiplication */ + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + poly_tomont(&pkpv.vec[i]); + } + + polyvec_add(&pkpv, &pkpv, &e); + polyvec_reduce(&pkpv); + + pack_sk(sk, &skpv); + pack_pk(pk, &pkpv, publicseed); + + return 0; +} + +/************************************************* +* Name: indcpa_enc +* +* Description: Encryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. 
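/* Illustrative sketch (not from the patched sources): spell out the
 * GEN_MATRIX_NBLOCKS arithmetic used by gen_matrix above for Kyber-512
 * (n = 256, q = 3329, XOF block = 168 bytes). The initial squeeze is
 * 3 blocks = 504 bytes = 336 twelve-bit candidates, of which roughly 81%
 * pass rejection (about 273), so one pass usually yields all 256 coefficients
 * and the retry loop rarely runs. */
#include <assert.h>

int main(void) {
    const int n = 256, q = 3329, xof_blockbytes = 168;
    int nblocks = (12 * n / 8 * (1 << 12) / q + xof_blockbytes) / xof_blockbytes;

    assert(nblocks == 3);
    assert(nblocks * xof_blockbytes == 504);    /* bytes squeezed before the retry loop */
    assert(504 / 3 * 2 == 336);                 /* 12-bit candidates per initial squeeze */
    return 0;
}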
+* +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length S2N_KYBER_512_R3_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length S2N_KYBER_512_R3_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length S2N_KYBER_512_R3_SYMBYTES) +* to deterministically generate all +* randomness +**************************************************/ +void indcpa_enc(uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], const uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], + const uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], const uint8_t coins[S2N_KYBER_512_R3_SYMBYTES]) { + uint8_t seed[S2N_KYBER_512_R3_SYMBYTES]; + uint8_t nonce = 0; + polyvec sp, pkpv, ep, at[S2N_KYBER_512_R3_K], bp; + poly v, k, epp; + + unpack_pk(&pkpv, seed, pk); + poly_frommsg(&k, m); + gen_matrix(at, seed, 1); + + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_getnoise_eta1(sp.vec + i, coins, nonce++); + } + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_getnoise_eta2(ep.vec + i, coins, nonce++); + } + poly_getnoise_eta2(&epp, coins, nonce++); + + polyvec_ntt(&sp); + + /* matrix-vector multiplication */ + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + } + + polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); + + polyvec_invntt_tomont(&bp); + poly_invntt_tomont(&v); + + polyvec_add(&bp, &bp, &ep); + poly_add(&v, &v, &epp); + poly_add(&v, &v, &k); + polyvec_reduce(&bp); + poly_reduce(&v); + + pack_ciphertext(c, &bp, &v); +} + +/************************************************* +* Name: indcpa_dec +* +* Description: Decryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. 
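/* Illustrative sketch (not from the patched sources): indcpa_enc above feeds
 * the same coins to every noise polynomial and only bumps a one-byte nonce.
 * The PRF this relies on is assumed here to be SHAKE-256 over (coins || nonce),
 * as in the Kyber reference design; the real definition lives in
 * kyber512r3_symmetric.h, which is not shown in this hunk. */
#include <stdint.h>
#include <string.h>
#include "kyber512r3_fips202.h"

#define SYMBYTES 32                    /* assumed value of S2N_KYBER_512_R3_SYMBYTES */

static void prf_sketch(uint8_t *out, size_t outlen,
                       const uint8_t coins[SYMBYTES], uint8_t nonce) {
    uint8_t extseed[SYMBYTES + 1];
    memcpy(extseed, coins, SYMBYTES);
    extseed[SYMBYTES] = nonce;                      /* domain separation per nonce++ */
    shake256(out, outlen, extseed, sizeof extseed);
}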
+* +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length S2N_KYBER_512_R3_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length S2N_KYBER_512_R3_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES) +**************************************************/ +void indcpa_dec(uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], + const uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) { + polyvec bp, skpv; + poly v, mp; + + unpack_ciphertext(&bp, &v, c); + unpack_sk(&skpv, sk); + + polyvec_ntt(&bp); + polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + poly_invntt_tomont(&mp); + + poly_sub(&mp, &v, &mp); + poly_reduce(&mp); + + poly_tomsg(m, &mp); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.h new file mode 100644 index 0000000000..f8b9e401a0 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.h @@ -0,0 +1,15 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" + +#define indcpa_keypair S2N_KYBER_512_R3_NAMESPACE(indcpa_keypair) +int indcpa_keypair(uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]); + +#define indcpa_enc S2N_KYBER_512_R3_NAMESPACE(indcpa_enc) +void indcpa_enc(uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], const uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], + const uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], const uint8_t coins[S2N_KYBER_512_R3_SYMBYTES]); + +#define indcpa_dec S2N_KYBER_512_R3_NAMESPACE(indcpa_dec) +void indcpa_dec(uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], + const uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.c new file mode 100644 index 0000000000..91e7513881 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.c @@ -0,0 +1,363 @@ +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include "kyber512r3_align_avx2.h" +#include "kyber512r3_params.h" +#include "kyber512r3_indcpa_avx2.h" +#include "kyber512r3_polyvec_avx2.h" +#include "kyber512r3_poly_avx2.h" +#include "kyber512r3_rejsample_avx2.h" +#include "kyber512r3_fips202.h" +#include "kyber512r3_fips202x4_avx2.h" +#include "pq-crypto/s2n_pq_random.h" +#include "utils/s2n_safety.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +/************************************************* +* Name: pack_pk +* +* Description: Serialize the public key as concatenation of the +* serialized vector of polynomials pk and the +* public seed used to generate the matrix A. +* The polynomial coefficients in pk are assumed to +* lie in the invertal [0,q], i.e. pk must be reduced +* by polyvec_reduce_avx2(). 
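The declarations in kyber512r3_indcpa.h above are only ever driven by the KEM layer elsewhere in this patch, but a self-contained round trip makes the contract easier to see. The sketch below is hypothetical (the harness name, the fixed message and the all-zero coins are illustrative; real callers must supply fresh random coins) and is not code from this commit:

    #include <stdint.h>
    #include <string.h>
    #include "kyber512r3_indcpa.h"
    #include "kyber512r3_params.h"

    /* Hypothetical round trip over the IND-CPA API declared above; returns 0 if
     * decryption recovers the original 32-byte message. */
    static int indcpa_roundtrip_example(void)
    {
        uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES];
        uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES];
        uint8_t ct[S2N_KYBER_512_R3_INDCPA_BYTES];
        uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES] = { 1 };  /* message to protect */
        uint8_t out[S2N_KYBER_512_R3_INDCPA_MSGBYTES];
        uint8_t coins[S2N_KYBER_512_R3_SYMBYTES] = { 0 };       /* must be fresh randomness in practice */

        if (indcpa_keypair(pk, sk) != 0) {
            return -1;                      /* key generation can fail if the RNG fails */
        }
        indcpa_enc(ct, msg, pk, coins);     /* deterministic given the coins */
        indcpa_dec(out, ct, sk);
        return memcmp(msg, out, sizeof(msg)) == 0 ? 0 : -1;
    }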
+* +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec +* const uint8_t *seed: pointer to the input public seed +**************************************************/ +static void pack_pk(uint8_t r[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES]) +{ + polyvec_tobytes_avx2(r, pk); + memcpy(r+S2N_KYBER_512_R3_POLYVECBYTES, seed, S2N_KYBER_512_R3_SYMBYTES); +} + +/************************************************* +* Name: unpack_pk +* +* Description: De-serialize public key from a byte array; +* approximate inverse of pack_pk +* +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A +* - const uint8_t *packedpk: pointer to input serialized public key +**************************************************/ +static void unpack_pk(polyvec *pk, + uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], + const uint8_t packedpk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES]) +{ + polyvec_frombytes_avx2(pk, packedpk); + memcpy(seed, packedpk+S2N_KYBER_512_R3_POLYVECBYTES, S2N_KYBER_512_R3_SYMBYTES); +} + +/************************************************* +* Name: pack_sk +* +* Description: Serialize the secret key. +* The polynomial coefficients in sk are assumed to +* lie in the invertal [0,q], i.e. sk must be reduced +* by polyvec_reduce_avx2(). +* +* Arguments: - uint8_t *r: pointer to output serialized secret key +* - polyvec *sk: pointer to input vector of polynomials (secret key) +**************************************************/ +static void pack_sk(uint8_t r[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES], polyvec *sk) +{ + polyvec_tobytes_avx2(r, sk); +} + +/************************************************* +* Name: unpack_sk +* +* Description: De-serialize the secret key; inverse of pack_sk +* +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* - const uint8_t *packedsk: pointer to input serialized secret key +**************************************************/ +static void unpack_sk(polyvec *sk, const uint8_t packedsk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) +{ + polyvec_frombytes_avx2(sk, packedsk); +} + +/************************************************* +* Name: pack_ciphertext +* +* Description: Serialize the ciphertext as concatenation of the +* compressed and serialized vector of polynomials b +* and the compressed and serialized polynomial v. +* The polynomial coefficients in b and v are assumed to +* lie in the invertal [0,q], i.e. b and v must be reduced +* by polyvec_reduce_avx2() and poly_reduce_avx2(), respectively. 
+* +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v +**************************************************/ +static void pack_ciphertext(uint8_t r[S2N_KYBER_512_R3_INDCPA_BYTES], polyvec *b, poly *v) +{ + polyvec_compress_avx2(r, b); + poly_compress_avx2(r+S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES, v); +} + +/************************************************* +* Name: unpack_ciphertext +* +* Description: De-serialize and decompress ciphertext from a byte array; +* approximate inverse of pack_ciphertext +* +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v +* - const uint8_t *c: pointer to the input serialized ciphertext +**************************************************/ +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES]) +{ + polyvec_decompress_avx2(b, c); + poly_decompress_avx2(v, c+S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES); +} + +/************************************************* +* Name: rej_uniform +* +* Description: Run rejection sampling on uniform random bytes to generate +* uniform random integers mod q +* +* Arguments: - int16_t *r: pointer to output array +* - unsigned int len: requested number of 16-bit integers (uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) +* - unsigned int buflen: length of input buffer in bytes +* +* Returns number of sampled 16-bit integers (at most len) +**************************************************/ +static unsigned int rej_uniform(int16_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) +{ + unsigned int ctr, pos; + uint16_t val0, val1; + + ctr = pos = 0; + while(ctr < len && pos <= buflen - 3) { // buflen is always at least 3 + val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF; + val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4)) & 0xFFF; + pos += 3; + + if(val0 < S2N_KYBER_512_R3_Q) + r[ctr++] = val0; + if(ctr < len && val1 < S2N_KYBER_512_R3_Q) + r[ctr++] = val1; + } + + return ctr; +} + +#define gen_a(A,B) gen_matrix_avx2(A,B,0) +#define gen_at(A,B) gen_matrix_avx2(A,B,1) + +/************************************************* +* Name: gen_matrix_avx2 +* +* Description: Deterministically generate matrix A (or the transpose of A) +* from a seed. Entries of the matrix are polynomials that look +* uniformly random. 
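To make the 3-bytes-to-two-candidates unpacking in rej_uniform above concrete, one worked input group (purely illustrative):

    buf = { 0x12, 0x34, 0x56, ... }
    val0 = (0x12 | 0x34 << 8) & 0xFFF       = 0x412 = 1042   -> 1042 < 3329, accepted
    val1 = (0x34 >> 4 | 0x56 << 4) & 0xFFF  = 0x563 = 1379   -> 1379 < 3329, accepted

Candidates of 3329 or above are skipped without consuming output slots, which is why gen_matrix_avx2 below keeps squeezing further SHAKE-128 blocks until every counter reaches S2N_KYBER_512_R3_N.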
Performs rejection sampling on output of +* an XOF +* +* Arguments: - polyvec *a: pointer to output matrix A +* - const uint8_t *seed: pointer to input seed +* - int transposed: boolean deciding whether A or A^T is generated +**************************************************/ +void gen_matrix_avx2(polyvec *a, const uint8_t seed[32], int transposed) +{ + unsigned int ctr0, ctr1, ctr2, ctr3; + ALIGNED_UINT8(S2N_KYBER_512_R3_REJ_UNIFORM_AVX_NBLOCKS*S2N_KYBER_512_R3_SHAKE128_RATE) buf[4]; + __m256i f; + keccakx4_state state; + + // correcting cast-align and cast-qual errors + // old version: f = _mm256_loadu_si256((__m256i *)seed); + f = _mm256_loadu_si256((const void *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); + + if(transposed) { + buf[0].coeffs[32] = 0; + buf[0].coeffs[33] = 0; + buf[1].coeffs[32] = 0; + buf[1].coeffs[33] = 1; + buf[2].coeffs[32] = 1; + buf[2].coeffs[33] = 0; + buf[3].coeffs[32] = 1; + buf[3].coeffs[33] = 1; + } + else { + buf[0].coeffs[32] = 0; + buf[0].coeffs[33] = 0; + buf[1].coeffs[32] = 1; + buf[1].coeffs[33] = 0; + buf[2].coeffs[32] = 0; + buf[2].coeffs[33] = 1; + buf[3].coeffs[32] = 1; + buf[3].coeffs[33] = 1; + } + + shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34); + shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, S2N_KYBER_512_R3_REJ_UNIFORM_AVX_NBLOCKS, &state); + + ctr0 = rej_uniform_avx2(a[0].vec[0].coeffs, buf[0].coeffs); + ctr1 = rej_uniform_avx2(a[0].vec[1].coeffs, buf[1].coeffs); + ctr2 = rej_uniform_avx2(a[1].vec[0].coeffs, buf[2].coeffs); + ctr3 = rej_uniform_avx2(a[1].vec[1].coeffs, buf[3].coeffs); + + while(ctr0 < S2N_KYBER_512_R3_N || ctr1 < S2N_KYBER_512_R3_N || ctr2 < S2N_KYBER_512_R3_N || ctr3 < S2N_KYBER_512_R3_N) { + shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state); + + ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, S2N_KYBER_512_R3_N - ctr0, buf[0].coeffs, S2N_KYBER_512_R3_SHAKE128_RATE); + ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, S2N_KYBER_512_R3_N - ctr1, buf[1].coeffs, S2N_KYBER_512_R3_SHAKE128_RATE); + ctr2 += rej_uniform(a[1].vec[0].coeffs + ctr2, S2N_KYBER_512_R3_N - ctr2, buf[2].coeffs, S2N_KYBER_512_R3_SHAKE128_RATE); + ctr3 += rej_uniform(a[1].vec[1].coeffs + ctr3, S2N_KYBER_512_R3_N - ctr3, buf[3].coeffs, S2N_KYBER_512_R3_SHAKE128_RATE); + } + + poly_nttunpack_avx2(&a[0].vec[0]); + poly_nttunpack_avx2(&a[0].vec[1]); + poly_nttunpack_avx2(&a[1].vec[0]); + poly_nttunpack_avx2(&a[1].vec[1]); +} + +/************************************************** +* Name: indcpa_keypair_avx2 +* +* Description: Generates public and private key for the CPA-secure +* public-key encryption scheme underlying Kyber +* +* Arguments: - uint8_t *pk: pointer to output public key +* (of length S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key +* (of length S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES bytes) +**************************************************/ +int indcpa_keypair_avx2(uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], + uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) +{ + unsigned int i; + uint8_t buf[2*S2N_KYBER_512_R3_SYMBYTES]; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + S2N_KYBER_512_R3_SYMBYTES; + polyvec a[S2N_KYBER_512_R3_K], e, pkpv, skpv; + + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, S2N_KYBER_512_R3_SYMBYTES)); +
sha3_512(buf, buf, S2N_KYBER_512_R3_SYMBYTES); + + gen_a(a, publicseed); + + poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, e.vec+0, e.vec+1, noiseseed, 0, 1, 2, 3); + + polyvec_ntt_avx2(&skpv); + polyvec_reduce_avx2(&skpv); + polyvec_ntt_avx2(&e); + + // matrix-vector multiplication + for(i=0;i<S2N_KYBER_512_R3_K;i++) { + polyvec_basemul_acc_montgomery_avx2(&pkpv.vec[i], &a[i], &skpv); + poly_tomont_avx2(&pkpv.vec[i]); + } + + polyvec_add_avx2(&pkpv, &pkpv, &e); + polyvec_reduce_avx2(&pkpv); + + pack_sk(sk, &skpv); + pack_pk(pk, &pkpv, publicseed); + + return 0; +} + +/************************************************* +* Name: indcpa_enc_avx2 +* +* Description: Encryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. +* +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length S2N_KYBER_512_R3_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length S2N_KYBER_512_R3_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length S2N_KYBER_512_R3_SYMBYTES) to deterministically +* generate all randomness +**************************************************/ +void indcpa_enc_avx2(uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], + const uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], + const uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[S2N_KYBER_512_R3_SYMBYTES]) +{ + unsigned int i; + uint8_t seed[S2N_KYBER_512_R3_SYMBYTES]; + polyvec sp, pkpv, ep, at[S2N_KYBER_512_R3_K], b; + poly v, k, epp; + + unpack_pk(&pkpv, seed, pk); + poly_frommsg_avx2(&k, m); + gen_at(at, seed); + + poly_getnoise_eta1122_4x(sp.vec+0, sp.vec+1, ep.vec+0, ep.vec+1, coins, 0, 1, 2, 3); + poly_getnoise_eta2_avx2(&epp, coins, 4); + + polyvec_ntt_avx2(&sp); + + // matrix-vector multiplication + for(i=0;i<S2N_KYBER_512_R3_K;i++) + polyvec_basemul_acc_montgomery_avx2(&b.vec[i], &at[i], &sp); + polyvec_basemul_acc_montgomery_avx2(&v, &pkpv, &sp); + + polyvec_invntt_tomont_avx2(&b); + poly_invntt_tomont_avx2(&v); + + polyvec_add_avx2(&b, &b, &ep); + poly_add_avx2(&v, &v, &epp); + poly_add_avx2(&v, &v, &k); + polyvec_reduce_avx2(&b); + poly_reduce_avx2(&v); + + pack_ciphertext(c, &b, &v); +} + +/************************************************* +* Name: indcpa_dec_avx2 +* +* Description: Decryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. 
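A reading note on the batched noise sampling used by indcpa_keypair_avx2 and indcpa_enc_avx2 above (the _4x helpers are defined elsewhere in this patch, so the mapping below is an inference from the call sites, not a quotation): for the AVX2 and reference paths to remain interchangeable at the KEM layer, the explicit nonce arguments must reproduce the reference nonce++ sequence, i.e.

    poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, e.vec+0, e.vec+1, noiseseed, 0, 1, 2, 3)
        plays the role of the reference loops that sample skpv with nonces 0..1 and e with nonces 2..3;
    poly_getnoise_eta1122_4x(sp.vec+0, sp.vec+1, ep.vec+0, ep.vec+1, coins, 0, 1, 2, 3)
        plays the role of eta1 sampling of sp with nonces 0..1 and eta2 sampling of ep with nonces 2..3,
        with epp sampled separately under nonce 4 in both paths.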
+* +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length S2N_KYBER_512_R3_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length S2N_KYBER_512_R3_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES) +**************************************************/ +void indcpa_dec_avx2(uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], + const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], + const uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) +{ + polyvec b, skpv; + poly v, mp; + + unpack_ciphertext(&b, &v, c); + unpack_sk(&skpv, sk); + + polyvec_ntt_avx2(&b); + polyvec_basemul_acc_montgomery_avx2(&mp, &skpv, &b); + poly_invntt_tomont_avx2(&mp); + + poly_sub_avx2(&mp, &v, &mp); + poly_reduce_avx2(&mp); + + poly_tomsg_avx2(m, &mp); +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.h new file mode 100644 index 0000000000..127e5bc4f6 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.h @@ -0,0 +1,25 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_polyvec_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#define gen_matrix_avx2 S2N_KYBER_512_R3_NAMESPACE(gen_matrix_avx2) +void gen_matrix_avx2(polyvec *a, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], int transposed); + +#define indcpa_keypair_avx2 S2N_KYBER_512_R3_NAMESPACE(indcpa_keypair_avx2) +int indcpa_keypair_avx2(uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], + uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]); + +#define indcpa_enc_avx2 S2N_KYBER_512_R3_NAMESPACE(indcpa_enc_avx2) +void indcpa_enc_avx2(uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], + const uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], + const uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[S2N_KYBER_512_R3_SYMBYTES]); + +#define indcpa_dec_avx2 S2N_KYBER_512_R3_NAMESPACE(indcpa_dec_avx2) +void indcpa_dec_avx2(uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], + const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], + const uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_invntt_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_invntt_avx2.S new file mode 100644 index 0000000000..8f131668ff --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_invntt_avx2.S @@ -0,0 +1,255 @@ +#include "kyber512r3_consts_avx2.h" + +// The small macros (.inc files) are combined with .S files directly +/*****.include "shuffle.inc"*****/ +/********************************/ +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle1 r0,r1,r2,r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm +/********************************/ + +/*****.include "fq.inc"*****/ +/***************************/ +.macro red16 r,rs=0,x=12 +vpmulhw %ymm1,%ymm\r,%ymm\x +.if \rs 
+vpmulhrsw %ymm\rs,%ymm\x,%ymm\x +.else +vpsraw $10,%ymm\x,%ymm\x +.endif +vpmullw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro csubq r,x=12 +vpsubw %ymm0,%ymm\r,%ymm\r +vpsraw $15,%ymm\r,%ymm\x +vpand %ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro caddq r,x=12 +vpsraw $15,%ymm\r,%ymm\x +vpand %ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro fqmulprecomp al,ah,b,x=12 +vpmullw %ymm\al,%ymm\b,%ymm\x +vpmulhw %ymm\ah,%ymm\b,%ymm\b +vpmulhw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\b,%ymm\b +.endm +/***************************/ + +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3 +vpsubw %ymm\rl0,%ymm\rh0,%ymm12 +vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 +vpsubw %ymm\rl1,%ymm\rh1,%ymm13 + +vpmullw %ymm\zl0,%ymm12,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl2,%ymm\rh2,%ymm14 + +vpmullw %ymm\zl0,%ymm13,%ymm\rh1 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 +vpsubw %ymm\rl3,%ymm\rh3,%ymm15 + +vpmullw %ymm\zl1,%ymm14,%ymm\rh2 +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 +vpmullw %ymm\zl1,%ymm15,%ymm\rh3 + +vpmulhw %ymm\zh0,%ymm12,%ymm12 +vpmulhw %ymm\zh0,%ymm13,%ymm13 + +vpmulhw %ymm\zh1,%ymm14,%ymm14 +vpmulhw %ymm\zh1,%ymm15,%ymm15 + +vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 + +vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 + +vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 +vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 + +# + +# + +vpsubw %ymm\rh0,%ymm12,%ymm\rh0 + +vpsubw %ymm\rh1,%ymm13,%ymm\rh1 + +vpsubw %ymm\rh2,%ymm14,%ymm\rh2 +vpsubw %ymm\rh3,%ymm15,%ymm\rh3 +.endm + +.macro intt_levels0t5 off +/* level 0 */ +vmovdqa _16XFLO*2(%rsi),%ymm2 +vmovdqa _16XFHI*2(%rsi),%ymm3 + +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 + +fqmulprecomp 2,3,4 +fqmulprecomp 2,3,6 +fqmulprecomp 2,3,5 +fqmulprecomp 2,3,7 + +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 + +fqmulprecomp 2,3,8 +fqmulprecomp 2,3,10 +fqmulprecomp 2,3,9 +fqmulprecomp 2,3,11 + +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm12 +vpshufb %ymm12,%ymm15,%ymm15 +vpshufb %ymm12,%ymm1,%ymm1 +vpshufb %ymm12,%ymm2,%ymm2 +vpshufb %ymm12,%ymm3,%ymm3 + +butterfly 4,5,8,9,6,7,10,11,15,1,2,3 + +/* level 1 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm1 +vpshufb %ymm1,%ymm2,%ymm2 +vpshufb %ymm1,%ymm3,%ymm3 + +butterfly 4,5,6,7,8,9,10,11,2,2,3,3 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 +shuffle1 10,11,8,11 + +/* level 2 */ +vmovdqa _REVIDXD*2(%rsi),%ymm12 +vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2 +vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10 + +butterfly 3,4,6,8,5,7,9,11,2,2,10,10 + +vmovdqa _16XV*2(%rsi),%ymm1 +red16 3 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +/* level 3 */ +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2 +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9 + +butterfly 10,3,6,5,4,8,7,11,2,2,9,9 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 +shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +/* level 4 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7 + 
+butterfly 9,10,6,4,3,5,8,11,2,2,7,7 + +red16 9 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +/* level 5 */ +vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2 +vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8 + +butterfly 7,9,6,3,10,4,5,11,2,2,8,8 + +vmovdqa %ymm7,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) +.endm + +.macro intt_level6 off +/* level 6 */ +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2 + +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3 + +butterfly 4,5,6,7,8,9,10,11 + +.if \off == 0 +red16 4 +.endif + +vmovdqa %ymm4,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm7,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm + +.text +.global cdecl(invntt_avx2_asm) +cdecl(invntt_avx2_asm): +vmovdqa _16XQ*2(%rsi),%ymm0 + +intt_levels0t5 0 +intt_levels0t5 1 + +intt_level6 0 +intt_level6 1 +ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_kem.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_kem.c new file mode 100644 index 0000000000..9d6c49b9c4 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_kem.c @@ -0,0 +1,158 @@ +#include <stddef.h> +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_symmetric.h" +#include "kyber512r3_indcpa.h" +#include "kyber512r3_indcpa_avx2.h" +#include "tls/s2n_kem.h" +#include "utils/s2n_safety.h" +#include "pq-crypto/s2n_pq_random.h" +#include "pq-crypto/s2n_pq.h" + +/************************************************* +* Name: crypto_kem_keypair +* +* Description: Generates public and private key +* for CCA-secure Kyber key encapsulation mechanism +* +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of S2N_KYBER_512_R3_PUBLIC_KEY_BYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of S2N_KYBER_512_R3_SECRET_KEY_BYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int s2n_kyber_512_r3_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); +#if defined(S2N_KYBER512R3_AVX2_BMI2) + if (s2n_kyber512r3_is_avx2_bmi2_enabled()) { + POSIX_GUARD(indcpa_keypair_avx2(pk, sk)); + }else +#endif + { + POSIX_GUARD(indcpa_keypair(pk, sk)); + } + + for(size_t i = 0; i < S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES; i++) { + sk[i + S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES] = pk[i]; + } + sha3_256(sk+S2N_KYBER_512_R3_SECRET_KEY_BYTES-2*S2N_KYBER_512_R3_SYMBYTES, pk, S2N_KYBER_512_R3_PUBLIC_KEY_BYTES); + /* Value z for pseudo-random output on reject */ + POSIX_GUARD_RESULT(s2n_get_random_bytes(sk+S2N_KYBER_512_R3_SECRET_KEY_BYTES-S2N_KYBER_512_R3_SYMBYTES, S2N_KYBER_512_R3_SYMBYTES)); + return S2N_SUCCESS; +} + +/************************************************* +* 
Name: crypto_kem_enc +* +* Description: Generates cipher text and shared +* secret for given public key +* +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of S2N_KYBER_512_R3_CIPHERTEXT_BYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of S2N_KYBER_512_R3_SHARED_SECRET_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of S2N_KYBER_512_R3_PUBLIC_KEY_BYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int s2n_kyber_512_r3_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + uint8_t buf[2*S2N_KYBER_512_R3_SYMBYTES]; + /* Will contain key, coins */ + uint8_t kr[2*S2N_KYBER_512_R3_SYMBYTES]; + + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, S2N_KYBER_512_R3_SYMBYTES)); + /* Don't release system RNG output */ + sha3_256(buf, buf, S2N_KYBER_512_R3_SYMBYTES); + + /* Multitarget countermeasure for coins + contributory KEM */ + sha3_256(buf+S2N_KYBER_512_R3_SYMBYTES, pk, S2N_KYBER_512_R3_PUBLIC_KEY_BYTES); + sha3_512(kr, buf, 2*S2N_KYBER_512_R3_SYMBYTES); + + /* coins are in kr+S2N_KYBER_512_R3_SYMBYTES */ +#if defined(S2N_KYBER512R3_AVX2_BMI2) + if (s2n_kyber512r3_is_avx2_bmi2_enabled()) { + indcpa_enc_avx2(ct, buf, pk, kr+S2N_KYBER_512_R3_SYMBYTES); + }else +#endif + { + indcpa_enc(ct, buf, pk, kr+S2N_KYBER_512_R3_SYMBYTES); + } + + /* overwrite coins in kr with H(c) */ + sha3_256(kr+S2N_KYBER_512_R3_SYMBYTES, ct, S2N_KYBER_512_R3_CIPHERTEXT_BYTES); + /* hash concatenation of pre-k and H(c) to k */ + shake256(ss, S2N_KYBER_512_R3_SSBYTES, kr, 2*S2N_KYBER_512_R3_SYMBYTES); + return S2N_SUCCESS; +} + +/************************************************* +* Name: crypto_kem_dec +* +* Description: Generates shared secret for given +* cipher text and private key +* +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of S2N_KYBER_512_R3_SHARED_SECRET_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of S2N_KYBER_512_R3_CIPHERTEXT_BYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of S2N_KYBER_512_R3_SECRET_KEY_BYTES bytes) +* +* Returns 0. +* +* On failure, ss will contain a pseudo-random value. 
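Before the decapsulation body that follows, a minimal sketch of how the three s2n_kyber_512_r3_crypto_kem_* entry points fit together; the harness name is hypothetical, and the size macros are assumed to be visible through the same headers that kyber512r3_kem.c itself includes:

    #include <string.h>
    #include "kyber512r3_params.h"
    #include "tls/s2n_kem.h"

    /* Hypothetical encapsulate/decapsulate round trip: one side keeps sk and
     * publishes pk; the other encapsulates against pk and returns ct. */
    static int kem_roundtrip_example(void)
    {
        unsigned char pk[S2N_KYBER_512_R3_PUBLIC_KEY_BYTES];
        unsigned char sk[S2N_KYBER_512_R3_SECRET_KEY_BYTES];
        unsigned char ct[S2N_KYBER_512_R3_CIPHERTEXT_BYTES];
        unsigned char ss_enc[S2N_KYBER_512_R3_SHARED_SECRET_BYTES];
        unsigned char ss_dec[S2N_KYBER_512_R3_SHARED_SECRET_BYTES];

        if (s2n_kyber_512_r3_crypto_kem_keypair(pk, sk) != 0) { return -1; }
        if (s2n_kyber_512_r3_crypto_kem_enc(ct, ss_enc, pk) != 0) { return -1; }
        if (s2n_kyber_512_r3_crypto_kem_dec(ss_dec, ct, sk) != 0) { return -1; }

        /* Both sides now hold the same 32-byte shared secret. */
        return memcmp(ss_enc, ss_dec, sizeof(ss_enc)) == 0 ? 0 : -1;
    }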
+**************************************************/ +int s2n_kyber_512_r3_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + uint8_t buf[2*S2N_KYBER_512_R3_SYMBYTES]; + /* Will contain key, coins */ + uint8_t kr[2*S2N_KYBER_512_R3_SYMBYTES]; + uint8_t cmp[S2N_KYBER_512_R3_CIPHERTEXT_BYTES]; + const uint8_t *pk = sk+S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES; + +#if defined(S2N_KYBER512R3_AVX2_BMI2) + if (s2n_kyber512r3_is_avx2_bmi2_enabled()) { + indcpa_dec_avx2(buf, ct, sk); + }else +#endif + { + indcpa_dec(buf, ct, sk); + } + + /* Multitarget countermeasure for coins + contributory KEM */ + for(size_t i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) { + buf[S2N_KYBER_512_R3_SYMBYTES + i] = sk[S2N_KYBER_512_R3_SECRET_KEY_BYTES - 2 * S2N_KYBER_512_R3_SYMBYTES + i]; + } + sha3_512(kr, buf, 2*S2N_KYBER_512_R3_SYMBYTES); + + /* coins are in kr+S2N_KYBER_512_R3_SYMBYTES */ +#if defined(S2N_KYBER512R3_AVX2_BMI2) + if (s2n_kyber512r3_is_avx2_bmi2_enabled()) { + indcpa_enc_avx2(cmp, buf, pk, kr+S2N_KYBER_512_R3_SYMBYTES); + }else +#endif + { + indcpa_enc(cmp, buf, pk, kr+S2N_KYBER_512_R3_SYMBYTES); + } + + /* If ct and cmp are equal (dont_copy = 1), decryption has succeeded and we do NOT overwrite pre-k below. + * If ct and cmp are not equal (dont_copy = 0), decryption fails and we do overwrite pre-k. */ + int dont_copy = s2n_constant_time_equals(ct, cmp, S2N_KYBER_512_R3_CIPHERTEXT_BYTES); + + /* overwrite coins in kr with H(c) */ + sha3_256(kr+S2N_KYBER_512_R3_SYMBYTES, ct, S2N_KYBER_512_R3_CIPHERTEXT_BYTES); + + /* Overwrite pre-k with z on re-encryption failure */ + POSIX_GUARD(s2n_constant_time_copy_or_dont(kr, sk+S2N_KYBER_512_R3_SECRET_KEY_BYTES-S2N_KYBER_512_R3_SYMBYTES, + S2N_KYBER_512_R3_SYMBYTES, dont_copy)); + + /* hash concatenation of pre-k and H(c) to k */ + shake256(ss, S2N_KYBER_512_R3_SSBYTES, kr, 2*S2N_KYBER_512_R3_SYMBYTES); + return S2N_SUCCESS; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.c new file mode 100644 index 0000000000..6c82105c19 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.c @@ -0,0 +1,122 @@ +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_ntt.h" +#include "kyber512r3_reduce.h" + +const int16_t zetas[128] = { + 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, + 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, + 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, + 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, + 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, + 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, + 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, + 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, + 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 +}; + +const int16_t zetas_inv[128] = { + 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, + 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, + 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, + 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, + 3152, 2742, 2907, 3224, 1779, 2458, 
1251, 2486, 2774, 2899, 1103, 1275, 2652, + 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, + 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, + 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, + 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, + 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 +}; + +/************************************************* +* Name: fqmul +* +* Description: Multiplication followed by Montgomery reduction +* +* Arguments: - int16_t a: first factor +* - int16_t b: second factor +* +* Returns 16-bit integer congruent to a*b*R^{-1} mod q +**************************************************/ +static int16_t fqmul(int16_t a, int16_t b) { + return montgomery_reduce((int32_t)a * b); +} + +/************************************************* +* Name: ntt +* +* Description: Inplace number-theoretic transform (NTT) in Rq +* input is in standard order, output is in bitreversed order +* +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq +**************************************************/ +void ntt(int16_t r[256]) { + unsigned int len, start, j, k; + int16_t t, zeta; + + k = 1; + for (len = 128; len >= 2; len >>= 1) { + for (start = 0; start < 256; start = j + len) { + zeta = zetas[k++]; + for (j = start; j < start + len; ++j) { + t = fqmul(zeta, r[j + len]); + r[j + len] = r[j] - t; + r[j] = r[j] + t; + } + } + } +} + +/************************************************* +* Name: invntt_tomont +* +* Description: Inplace inverse number-theoretic transform in Rq and +* multiplication by Montgomery factor 2^16. +* Input is in bitreversed order, output is in standard order +* +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq +**************************************************/ +void invntt(int16_t r[256]) { + unsigned int start, len, j, k; + int16_t t, zeta; + + k = 0; + for (len = 2; len <= 128; len <<= 1) { + for (start = 0; start < 256; start = j + len) { + zeta = zetas_inv[k++]; + for (j = start; j < start + len; ++j) { + t = r[j]; + r[j] = barrett_reduce(t + r[j + len]); + r[j + len] = t - r[j + len]; + r[j + len] = fqmul(zeta, r[j + len]); + } + } + } + + for (j = 0; j < 256; ++j) { + r[j] = fqmul(r[j], zetas_inv[127]); + } +} + +/************************************************* +* Name: basemul +* +* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) +* used for multiplication of elements in Rq in NTT domain +* +* Arguments: - int16_t r[2]: pointer to the output polynomial +* - const int16_t a[2]: pointer to the first factor +* - const int16_t b[2]: pointer to the second factor +* - int16_t zeta: integer defining the reduction polynomial +**************************************************/ +void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { + r[0] = fqmul(a[1], b[1]); + r[0] = fqmul(r[0], zeta); + r[0] += fqmul(a[0], b[0]); + + r[1] = fqmul(a[0], b[1]); + r[1] += fqmul(a[1], b[0]); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.h new file mode 100644 index 0000000000..98d6235764 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.h @@ -0,0 +1,19 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" + +#define zetas S2N_KYBER_512_R3_NAMESPACE(zetas) +extern const int16_t zetas[128]; + +#define zetas_inv 
S2N_KYBER_512_R3_NAMESPACE(zetas_inv) +extern const int16_t zetas_inv[128]; + +#define ntt S2N_KYBER_512_R3_NAMESPACE(ntt) +void ntt(int16_t poly[256]); + +#define invntt S2N_KYBER_512_R3_NAMESPACE(invntt) +void invntt(int16_t poly[256]); + +#define basemul S2N_KYBER_512_R3_NAMESPACE(basemul) +void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.S new file mode 100644 index 0000000000..dc80086cb1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.S @@ -0,0 +1,218 @@ +#include "kyber512r3_consts_avx2.h" + +// The small macros (.inc files) are combined with .S files directly +/*****.include "shuffle.inc"*****/ +/********************************/ +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle1 r0,r1,r2,r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm +/********************************/ + +.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2 +vpmullw %ymm\zl0,%ymm\rh0,%ymm12 +vpmullw %ymm\zl0,%ymm\rh1,%ymm13 + +vpmullw %ymm\zl1,%ymm\rh2,%ymm14 +vpmullw %ymm\zl1,%ymm\rh3,%ymm15 + +vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 +vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 + +vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 +vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 +.endm + +.macro reduce +vpmulhw %ymm0,%ymm12,%ymm12 +vpmulhw %ymm0,%ymm13,%ymm13 + +vpmulhw %ymm0,%ymm14,%ymm14 +vpmulhw %ymm0,%ymm15,%ymm15 +.endm + +.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 +vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln +vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0 + +vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1 +vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2 + +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2 +vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3 + +vpsubw %ymm12,%ymm\rln,%ymm\rln +vpaddw %ymm12,%ymm\rh0,%ymm\rh0 +vpsubw %ymm13,%ymm\rl0,%ymm\rl0 + +vpaddw %ymm13,%ymm\rh1,%ymm\rh1 +vpsubw %ymm14,%ymm\rl1,%ymm\rl1 +vpaddw %ymm14,%ymm\rh2,%ymm\rh2 + +vpsubw %ymm15,%ymm\rl2,%ymm\rl2 +vpaddw %ymm15,%ymm\rh3,%ymm\rh3 +.endm + +.macro level0 off +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +vmovdqa %ymm3,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm + +.macro levels1t6 off +/* level 1 */ +vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa 
(128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 +vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +/* level 2 */ +shuffle8 5,10,7,10 +shuffle8 6,11,5,11 + +vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2 + +mul 7,10,5,11 + +shuffle8 3,8,6,8 +shuffle8 4,9,3,9 + +reduce +update 4,6,8,3,9,7,10,5,11 + +/* level 3 */ +shuffle4 8,5,9,5 +shuffle4 3,11,8,11 + +vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2 + +mul 9,5,8,11 + +shuffle4 4,7,3,7 +shuffle4 6,10,4,10 + +reduce +update 6,3,7,4,10,9,5,8,11 + +/* level 4 */ +shuffle2 7,8,10,8 +shuffle2 4,11,7,11 + +vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2 + +mul 10,8,7,11 + +shuffle2 6,9,4,9 +shuffle2 3,5,6,5 + +reduce +update 3,4,9,6,5,10,8,7,11 + +/* level 5 */ +shuffle1 9,7,5,7 +shuffle1 6,11,9,11 + +vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2 + +mul 5,7,9,11 + +shuffle1 3,10,6,10 +shuffle1 4,8,3,8 + +reduce +update 4,6,10,3,8,5,7,9,11 + +/* level 6 */ +vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14 +vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8 +vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2 + +mul 10,3,9,11,14,15,8,2 + +reduce +update 8,4,6,5,7,10,3,9,11 + +vmovdqa %ymm8,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) +.endm + +.text +.global cdecl(ntt_avx2_asm) +cdecl(ntt_avx2_asm): +vmovdqa _16XQ*2(%rsi),%ymm0 + +level0 0 +level0 1 + +levels1t6 0 +levels1t6 1 + +ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.h new file mode 100644 index 0000000000..3616132358 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.h @@ -0,0 +1,28 @@ +#pragma once + +#include <stdint.h> + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +#define ntt_avx2_asm S2N_KYBER_512_R3_NAMESPACE(ntt_avx2_asm) +void ntt_avx2_asm(__m256i *r, const __m256i *qdata); + +#define invntt_avx2_asm S2N_KYBER_512_R3_NAMESPACE(invntt_avx2_asm) +void invntt_avx2_asm(__m256i *r, const __m256i *qdata); + +#define nttunpack_avx2_asm S2N_KYBER_512_R3_NAMESPACE(nttunpack_avx2_asm) +void nttunpack_avx2_asm(__m256i *r, const __m256i *qdata); + +#define basemul_avx2_asm S2N_KYBER_512_R3_NAMESPACE(basemul_avx2_asm) +void basemul_avx2_asm(__m256i *r, + const __m256i *a, + const __m256i *b, + const __m256i *qdata); + +#define ntttobytes_avx2_asm S2N_KYBER_512_R3_NAMESPACE(ntttobytes_avx2_asm) +void ntttobytes_avx2_asm(uint8_t *r, const __m256i *a, const __m256i *qdata); + +#define nttfrombytes_avx2_asm S2N_KYBER_512_R3_NAMESPACE(nttfrombytes_avx2_asm) +void nttfrombytes_avx2_asm(__m256i *r, const uint8_t *a, const __m256i *qdata); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_params.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_params.h new file 
mode 100644 index 0000000000..d2d32d08f1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_params.h @@ -0,0 +1,31 @@ +#pragma once + +/* All kyber512r3 functions and global variables in the pq-crypto/kyber_r3 directory + * should be defined using the namespace macro to avoid symbol collisions. For example, + * in foo.h, declare a function as follows: + * + * #define foo_function S2N_KYBER_512_R3_NAMESPACE(foo_function) + * int foo_function(int foo_argument); */ +#define S2N_KYBER_512_R3_NAMESPACE(s) s2n_kyber_512_r3_##s + +#define S2N_KYBER_512_R3_K 2 + +#define S2N_KYBER_512_R3_N 256 +#define S2N_KYBER_512_R3_Q 3329 + +#define S2N_KYBER_512_R3_SYMBYTES 32 /* size in bytes of hashes, and seeds */ +#define S2N_KYBER_512_R3_SSBYTES 32 /* size in bytes of shared key */ + +#define S2N_KYBER_512_R3_POLYBYTES 384 +#define S2N_KYBER_512_R3_POLYVECBYTES (S2N_KYBER_512_R3_K * S2N_KYBER_512_R3_POLYBYTES) + +#define S2N_KYBER_512_R3_ETA1 3 +#define S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES 128 +#define S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES 640 + +#define S2N_KYBER_512_R3_ETA2 2 + +#define S2N_KYBER_512_R3_INDCPA_MSGBYTES S2N_KYBER_512_R3_SYMBYTES +#define S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES (S2N_KYBER_512_R3_POLYVECBYTES + S2N_KYBER_512_R3_SYMBYTES) +#define S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES (S2N_KYBER_512_R3_POLYVECBYTES) +#define S2N_KYBER_512_R3_INDCPA_BYTES (S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES + S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES) diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.c new file mode 100644 index 0000000000..76ae60a583 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.c @@ -0,0 +1,300 @@ +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_poly.h" +#include "kyber512r3_ntt.h" +#include "kyber512r3_reduce.h" +#include "kyber512r3_cbd.h" +#include "kyber512r3_symmetric.h" + +/************************************************* +* Name: poly_compress +* +* Description: Compression and subsequent serialization of a polynomial +* +* Arguments: - uint8_t *r: pointer to output byte array +* (of length S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial +**************************************************/ +void poly_compress(uint8_t r[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES], poly *a) { + unsigned int i, j; + uint8_t t[8]; + + poly_csubq(a); + + for (i = 0; i < S2N_KYBER_512_R3_N / 8; i++) { + for (j = 0; j < 8; j++) { + t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 4) + S2N_KYBER_512_R3_Q / 2) / S2N_KYBER_512_R3_Q) & 15; + } + + r[0] = t[0] | (t[1] << 4); + r[1] = t[2] | (t[3] << 4); + r[2] = t[4] | (t[5] << 4); + r[3] = t[6] | (t[7] << 4); + r += 4; + } +} + +/************************************************* +* Name: poly_decompress +* +* Description: De-serialization and subsequent decompression of a polynomial; +* approximate inverse of poly_compress +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of length S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES bytes) +**************************************************/ +void poly_decompress(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES]) { + unsigned int i; + + for (i = 0; i < S2N_KYBER_512_R3_N / 2; i++) { + r->coeffs[2 * i + 0] = (((uint16_t)(a[0] & 15) * S2N_KYBER_512_R3_Q) + 8) >> 4; + r->coeffs[2 * i + 1] = (((uint16_t)(a[0] >> 4) * S2N_KYBER_512_R3_Q) + 8) >> 4; + a += 
1; + } +} + +/************************************************* +* Name: poly_tobytes +* +* Description: Serialization of a polynomial +* +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for S2N_KYBER_512_R3_POLYBYTES bytes) +* - poly *a: pointer to input polynomial +**************************************************/ +void poly_tobytes(uint8_t r[S2N_KYBER_512_R3_POLYBYTES], poly *a) { + unsigned int i; + + poly_csubq(a); + + for (i = 0; i < S2N_KYBER_512_R3_N / 2; i++) { + uint16_t t0 = a->coeffs[2 * i]; + uint16_t t1 = a->coeffs[2 * i + 1]; + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); + } +} + +/************************************************* +* Name: poly_frombytes +* +* Description: De-serialization of a polynomial; +* inverse of poly_tobytes +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of S2N_KYBER_512_R3_POLYBYTES bytes) +**************************************************/ +void poly_frombytes(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYBYTES]) { + unsigned int i; + for (i = 0; i < S2N_KYBER_512_R3_N / 2; i++) { + r->coeffs[2 * i] = ((a[3 * i + 0] >> 0) | ((uint16_t)a[3 * i + 1] << 8)) & 0xFFF; + r->coeffs[2 * i + 1] = ((a[3 * i + 1] >> 4) | ((uint16_t)a[3 * i + 2] << 4)) & 0xFFF; + } +} + +/************************************************* +* Name: poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message +**************************************************/ +void poly_frommsg(poly *r, const uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES]) { + unsigned int i, j; + int16_t mask; + + for (i = 0; i < S2N_KYBER_512_R3_N / 8; i++) { + for (j = 0; j < 8; j++) { + mask = -(int16_t)((msg[i] >> j) & 1); + r->coeffs[8 * i + j] = mask & ((S2N_KYBER_512_R3_Q + 1) / 2); + } + } +} + +/************************************************* +* Name: poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial +**************************************************/ +void poly_tomsg(uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES], poly *a) { + unsigned int i, j; + uint16_t t; + + poly_csubq(a); + + for (i = 0; i < S2N_KYBER_512_R3_N / 8; i++) { + msg[i] = 0; + for (j = 0; j < 8; j++) { + t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + S2N_KYBER_512_R3_Q / 2) / S2N_KYBER_512_R3_Q) & 1; + msg[i] |= t << j; + } + } +} + +/************************************************* +* Name: poly_getnoise_eta1 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter S2N_KYBER_512_R3_ETA1 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length S2N_KYBER_512_R3_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void poly_getnoise_eta1(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce) { + uint8_t buf[S2N_KYBER_512_R3_ETA1 * S2N_KYBER_512_R3_N / 4]; + shake256_prf(buf, sizeof(buf), seed, nonce); + cbd_eta1(r, buf); +} + +/************************************************* +* Name: poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to 
centered binomial distribution +* with parameter S2N_KYBER_512_R3_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length S2N_KYBER_512_R3_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void poly_getnoise_eta2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce) { + uint8_t buf[S2N_KYBER_512_R3_ETA2 * S2N_KYBER_512_R3_N / 4]; + shake256_prf(buf, sizeof(buf), seed, nonce); + cbd_eta2(r, buf); +} + + +/************************************************* +* Name: poly_ntt +* +* Description: Computes negacyclic number-theoretic transform (NTT) of +* a polynomial in place; +* inputs assumed to be in normal order, output in bitreversed order +* +* Arguments: - uint16_t *r: pointer to in/output polynomial +**************************************************/ +void poly_ntt(poly *r) { + ntt(r->coeffs); + poly_reduce(r); +} + +/************************************************* +* Name: poly_invntt_tomont +* +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; +* inputs assumed to be in bitreversed order, output in normal order +* +* Arguments: - uint16_t *a: pointer to in/output polynomial +**************************************************/ +void poly_invntt_tomont(poly *r) { + invntt(r->coeffs); +} + +/************************************************* +* Name: poly_basemul_montgomery +* +* Description: Multiplication of two polynomials in NTT domain +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + unsigned int i; + for (i = 0; i < S2N_KYBER_512_R3_N / 4; i++) { + basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], zetas[64 + i]); + basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], + -zetas[64 + i]); + } +} + +/************************************************* +* Name: poly_tomont +* +* Description: Inplace conversion of all coefficients of a polynomial +* from normal domain to Montgomery domain +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_tomont(poly *r) { + unsigned int i; + const int16_t f = (1ULL << 32) % S2N_KYBER_512_R3_Q; + for (i = 0; i < S2N_KYBER_512_R3_N; i++) { + r->coeffs[i] = montgomery_reduce((int32_t)r->coeffs[i] * f); + } +} + +/************************************************* +* Name: poly_reduce +* +* Description: Applies Barrett reduction to all coefficients of a polynomial +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_reduce(poly *r) { + unsigned int i; + for (i = 0; i < S2N_KYBER_512_R3_N; i++) { + r->coeffs[i] = barrett_reduce(r->coeffs[i]); + } +} + +/************************************************* +* Name: poly_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. 
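The constant f in poly_tomont above is worth a quick sanity check; the congruences below are illustrative reasoning only, using the R = 2^16 Montgomery convention implied by the fqmul and invntt_tomont comments earlier in this patch:

    f = (1ULL << 32) % q = 2^32 mod 3329 = 1353
    montgomery_reduce(c * f) ≡ c * f * 2^-16 ≡ c * 2^32 * 2^-16 ≡ c * 2^16   (mod q)

so every coefficient leaves poly_tomont carrying the Montgomery factor R = 2^16, which is the 'Montgomery domain' that the function comment refers to.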
For details of conditional subtraction +* of q see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_csubq(poly *r) { + unsigned int i; + for (i = 0; i < S2N_KYBER_512_R3_N; i++) { + r->coeffs[i] = csubq(r->coeffs[i]); + } +} + +/************************************************* +* Name: poly_add +* +* Description: Add two polynomials +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_add(poly *r, const poly *a, const poly *b) { + unsigned int i; + for (i = 0; i < S2N_KYBER_512_R3_N; i++) { + r->coeffs[i] = a->coeffs[i] + b->coeffs[i]; + } +} + +/************************************************* +* Name: poly_sub +* +* Description: Subtract two polynomials +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_sub(poly *r, const poly *a, const poly *b) { + unsigned int i; + for (i = 0; i < S2N_KYBER_512_R3_N; i++) { + r->coeffs[i] = a->coeffs[i] - b->coeffs[i]; + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.h new file mode 100644 index 0000000000..da43766e51 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.h @@ -0,0 +1,61 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" + +/* + * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial + * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] + */ +#define poly S2N_KYBER_512_R3_NAMESPACE(poly) +typedef struct { + int16_t coeffs[S2N_KYBER_512_R3_N]; +} poly; + +#define poly_compress S2N_KYBER_512_R3_NAMESPACE(poly_compress) +void poly_compress(uint8_t r[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES], poly *a); + +#define poly_decompress S2N_KYBER_512_R3_NAMESPACE(poly_decompress) +void poly_decompress(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES]); + +#define poly_tobytes S2N_KYBER_512_R3_NAMESPACE(poly_tobytes) +void poly_tobytes(uint8_t r[S2N_KYBER_512_R3_POLYBYTES], poly *a); + +#define poly_frombytes S2N_KYBER_512_R3_NAMESPACE(poly_frombytes) +void poly_frombytes(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYBYTES]); + +#define poly_frommsg S2N_KYBER_512_R3_NAMESPACE(poly_frommsg) +void poly_frommsg(poly *r, const uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES]); + +#define poly_tomsg S2N_KYBER_512_R3_NAMESPACE(poly_tomsg) +void poly_tomsg(uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES], poly *r); + +#define poly_getnoise_eta1 S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta1) +void poly_getnoise_eta1(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce); + +#define poly_getnoise_eta2 S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta2) +void poly_getnoise_eta2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce); + +#define poly_ntt S2N_KYBER_512_R3_NAMESPACE(poly_ntt) +void poly_ntt(poly *r); + +#define poly_invntt_tomont S2N_KYBER_512_R3_NAMESPACE(poly_invntt_tomont) +void poly_invntt_tomont(poly *r); + +#define poly_basemul_montgomery S2N_KYBER_512_R3_NAMESPACE(poly_basemul_montgomery) +void poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +#define poly_tomont S2N_KYBER_512_R3_NAMESPACE(poly_tomont) +void poly_tomont(poly *r); + +#define poly_reduce S2N_KYBER_512_R3_NAMESPACE(poly_reduce) +void poly_reduce(poly *r); + +#define poly_csubq S2N_KYBER_512_R3_NAMESPACE(poly_csubq) +void poly_csubq(poly *r); + +#define poly_add S2N_KYBER_512_R3_NAMESPACE(poly_add) +void poly_add(poly *r, const poly *a, const poly *b); + +#define poly_sub S2N_KYBER_512_R3_NAMESPACE(poly_sub) +void poly_sub(poly *r, const poly *a, const poly *b); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.c new file mode 100644 index 0000000000..aa961ff403 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.c @@ -0,0 +1,453 @@ +#include <stdint.h> +#include <string.h> +#include "kyber512r3_align_avx2.h" +#include "kyber512r3_consts_avx2.h" +#include "kyber512r3_poly_avx2.h" +#include "kyber512r3_ntt_avx2.h" +#include "kyber512r3_reduce_avx2.h" +#include "kyber512r3_cbd_avx2.h" +#include "kyber512r3_fips202.h" +#include "kyber512r3_fips202x4_avx2.h" +#include "kyber512r3_symmetric.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +/************************************************* +* Name: poly_compress_avx2 +* +* Description: Compression and subsequent serialization of a polynomial. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by poly_reduce_avx2(). 
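Both the reference poly_compress earlier in this patch and this AVX2 version implement the same 4-bit coefficient compression, x -> round(16*x/q) mod 16, inverted by poly_decompress as y -> round(q*y/16). One worked coefficient, shown only to illustrate the intentional lossiness:

    compress(1000)  = ((1000 << 4) + 3329/2) / 3329 & 15 = (16000 + 1664) / 3329 & 15 = 5
    decompress(5)   = (5 * 3329 + 8) >> 4 = 16653 >> 4 = 1040

so a coefficient of 1000 comes back as 1040; the error of 40 stays within the roughly q/32 ≈ 104 rounding error that this compression can introduce and that the decryption noise budget is sized to absorb.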
+* +* Arguments: - uint8_t *r: pointer to output byte array +* (of length S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES) +* - const poly *a: pointer to input polynomial +**************************************************/ +void poly_compress_avx2(uint8_t r[128], const poly * restrict a) +{ + unsigned int i; + __m256i f0, f1, f2, f3; + const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]); + const __m256i shift1 = _mm256_set1_epi16(1 << 9); + const __m256i mask = _mm256_set1_epi16(15); + const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1); + const __m256i permdidx = _mm256_set_epi32(7,3,6,2,5,1,4,0); + + for(i=0;i<S2N_KYBER_512_R3_N/64;i++) { + f0 = _mm256_load_si256(&a->vec[4*i+0]); + f1 = _mm256_load_si256(&a->vec[4*i+1]); + f2 = _mm256_load_si256(&a->vec[4*i+2]); + f3 = _mm256_load_si256(&a->vec[4*i+3]); + f0 = _mm256_mulhi_epi16(f0,v); + f1 = _mm256_mulhi_epi16(f1,v); + f2 = _mm256_mulhi_epi16(f2,v); + f3 = _mm256_mulhi_epi16(f3,v); + f0 = _mm256_mulhrs_epi16(f0,shift1); + f1 = _mm256_mulhrs_epi16(f1,shift1); + f2 = _mm256_mulhrs_epi16(f2,shift1); + f3 = _mm256_mulhrs_epi16(f3,shift1); + f0 = _mm256_and_si256(f0,mask); + f1 = _mm256_and_si256(f1,mask); + f2 = _mm256_and_si256(f2,mask); + f3 = _mm256_and_si256(f3,mask); + f0 = _mm256_packus_epi16(f0,f1); + f2 = _mm256_packus_epi16(f2,f3); + f0 = _mm256_maddubs_epi16(f0,shift2); + f2 = _mm256_maddubs_epi16(f2,shift2); + f0 = _mm256_packus_epi16(f0,f2); + f0 = _mm256_permutevar8x32_epi32(f0,permdidx); + // correcting cast-align error + // old version: _mm256_storeu_si256((__m256i *)&r[32*i],f0); + _mm256_storeu_si256((void *)&r[32*i],f0); + } +} + +void poly_decompress_avx2(poly * restrict r, const uint8_t a[128]) +{ + unsigned int i; + __m128i t; + __m256i f; + const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]); + const __m256i shufbidx = _mm256_set_epi8(7,7,7,7,6,6,6,6,5,5,5,5,4,4,4,4, + 3,3,3,3,2,2,2,2,1,1,1,1,0,0,0,0); + const __m256i mask = _mm256_set1_epi32(0x00F0000F); + const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048); + + for(i=0;i<S2N_KYBER_512_R3_N/16;i++) { + // correcting cast-align and cast-qual errors + // old version: t = _mm_loadl_epi64((__m128i *)&a[8*i]); + t = _mm_loadl_epi64((const void *)&a[8*i]); + f = _mm256_broadcastsi128_si256(t); + f = _mm256_shuffle_epi8(f,shufbidx); + f = _mm256_and_si256(f,mask); + f = _mm256_mullo_epi16(f,shift); + f = _mm256_mulhrs_epi16(f,q); + _mm256_store_si256(&r->vec[i],f); + } +} + +/************************************************* +* Name: poly_tobytes_avx2 +* +* Description: Serialization of a polynomial in NTT representation. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by poly_reduce_avx2(). The coefficients are orderd as output by +* poly_ntt_avx2(); the serialized output coefficients are in bitreversed +* order. 
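* Each coefficient occupies 12 bits in the serialized form, so the whole
* polynomial takes 256*12/8 = 384 bytes, which is the value of
* S2N_KYBER_512_R3_POLYBYTES.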
+* +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for S2N_KYBER_512_R3_POLYBYTES bytes) +* - poly *a: pointer to input polynomial +**************************************************/ +void poly_tobytes_avx2(uint8_t r[S2N_KYBER_512_R3_POLYBYTES], const poly *a) +{ + ntttobytes_avx2_asm(r, a->vec, qdata.vec); +} + +/************************************************* +* Name: poly_frombytes_avx2 +* +* Description: De-serialization of a polynomial; +* inverse of poly_tobytes_avx2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of S2N_KYBER_512_R3_POLYBYTES bytes) +**************************************************/ +void poly_frombytes_avx2(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYBYTES]) +{ + nttfrombytes_avx2_asm(r->vec, a, qdata.vec); +} + +/************************************************* +* Name: poly_frommsg_avx2 +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message +**************************************************/ +void poly_frommsg_avx2(poly * restrict r, const uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES]) +{ + __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; + const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0,1,2,3)); + const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0)); + const __m256i hqs = _mm256_set1_epi16((S2N_KYBER_512_R3_Q+1)/2); + +#define FROMMSG64(i) \ + g3 = _mm256_shuffle_epi32(f,0x55*i); \ + g3 = _mm256_sllv_epi32(g3,shift); \ + g3 = _mm256_shuffle_epi8(g3,idx); \ + g0 = _mm256_slli_epi16(g3,12); \ + g1 = _mm256_slli_epi16(g3,8); \ + g2 = _mm256_slli_epi16(g3,4); \ + g0 = _mm256_srai_epi16(g0,15); \ + g1 = _mm256_srai_epi16(g1,15); \ + g2 = _mm256_srai_epi16(g2,15); \ + g3 = _mm256_srai_epi16(g3,15); \ + g0 = _mm256_and_si256(g0,hqs); /* 19 18 17 16 3 2 1 0 */ \ + g1 = _mm256_and_si256(g1,hqs); /* 23 22 21 20 7 6 5 4 */ \ + g2 = _mm256_and_si256(g2,hqs); /* 27 26 25 24 11 10 9 8 */ \ + g3 = _mm256_and_si256(g3,hqs); /* 31 30 29 28 15 14 13 12 */ \ + h0 = _mm256_unpacklo_epi64(g0,g1); \ + h2 = _mm256_unpackhi_epi64(g0,g1); \ + h1 = _mm256_unpacklo_epi64(g2,g3); \ + h3 = _mm256_unpackhi_epi64(g2,g3); \ + g0 = _mm256_permute2x128_si256(h0,h1,0x20); \ + g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ + g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ + g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ + _mm256_store_si256(&r->vec[0+2*i+0],g0); \ + _mm256_store_si256(&r->vec[0+2*i+1],g1); \ + _mm256_store_si256(&r->vec[8+2*i+0],g2); \ + _mm256_store_si256(&r->vec[8+2*i+1],g3) + + // correcting cast-align and cast-qual errors + // old version: f = _mm256_loadu_si256((__m256i *)msg); + f = _mm256_loadu_si256((const void *)msg); + FROMMSG64(0); + FROMMSG64(1); + FROMMSG64(2); + FROMMSG64(3); +} + +/************************************************* +* Name: poly_tomsg_avx2 +* +* Description: Convert polynomial to 32-byte message. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by poly_reduce_avx2(). 
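* Each coefficient contributes one bit of the 32-byte message: the bit is 1
* when the coefficient is closer to q/2 than to 0 or q, and 0 otherwise.
* For example, with q = 3329 a coefficient of 900 yields bit 1 and a
* coefficient of 300 yields bit 0.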
+* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial +**************************************************/ +void poly_tomsg_avx2(uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES], const poly * restrict a) +{ + unsigned int i; + uint32_t small; + __m256i f0, f1, g0, g1; + const __m256i hq = _mm256_set1_epi16((S2N_KYBER_512_R3_Q - 1)/2); + const __m256i hhq = _mm256_set1_epi16((S2N_KYBER_512_R3_Q - 1)/4); + + for(i=0;i<S2N_KYBER_512_R3_N/32;i++) { + f0 = _mm256_load_si256(&a->vec[2*i+0]); + f1 = _mm256_load_si256(&a->vec[2*i+1]); + f0 = _mm256_sub_epi16(hq, f0); + f1 = _mm256_sub_epi16(hq, f1); + g0 = _mm256_srai_epi16(f0, 15); + g1 = _mm256_srai_epi16(f1, 15); + f0 = _mm256_xor_si256(f0, g0); + f1 = _mm256_xor_si256(f1, g1); + f0 = _mm256_sub_epi16(f0, hhq); + f1 = _mm256_sub_epi16(f1, hhq); + f0 = _mm256_packs_epi16(f0, f1); + f0 = _mm256_permute4x64_epi64(f0, 0xD8); + small = _mm256_movemask_epi8(f0); + memcpy(&msg[4*i], &small, 4); + } +} + +/************************************************* +* Name: poly_getnoise_eta1_avx2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter S2N_KYBER_512_R3_ETA1 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length S2N_KYBER_512_R3_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void poly_getnoise_eta1_avx2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce) +{ + ALIGNED_UINT8(S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/4+32) buf; // +32 bytes as required by poly_cbd_eta1_avx2 + shake256_prf(buf.coeffs, S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/4, seed, nonce); + poly_cbd_eta1_avx2(r, buf.vec); +} + +/************************************************* +* Name: poly_getnoise_eta2_avx2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter S2N_KYBER_512_R3_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length S2N_KYBER_512_R3_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void poly_getnoise_eta2_avx2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce) +{ + ALIGNED_UINT8(S2N_KYBER_512_R3_ETA2*S2N_KYBER_512_R3_N/4) buf; + shake256_prf(buf.coeffs, S2N_KYBER_512_R3_ETA2*S2N_KYBER_512_R3_N/4, seed, nonce); + poly_cbd_eta2_avx2(r, buf.vec); +} + +#define NOISE_NBLOCKS ((S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/4+S2N_KYBER_512_R3_SHAKE256_RATE-1)/S2N_KYBER_512_R3_SHAKE256_RATE) +void poly_getnoise_eta1_4x(poly *r0, + poly *r1, + poly *r2, + poly *r3, + const uint8_t seed[32], + uint8_t nonce0, + uint8_t nonce1, + uint8_t nonce2, + uint8_t nonce3) +{ + ALIGNED_UINT8(NOISE_NBLOCKS*S2N_KYBER_512_R3_SHAKE256_RATE) buf[4]; + __m256i f; + keccakx4_state state; + + // correcting cast-align and cast-qual errors + // old version: f = _mm256_loadu_si256((__m256i *)seed); + f = _mm256_loadu_si256((const void *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); + + buf[0].coeffs[32] = nonce0; + buf[1].coeffs[32] = nonce1; + buf[2].coeffs[32] = nonce2; + buf[3].coeffs[32] = nonce3; + + shake256x4_absorb_once(&state, 
buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33); + shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state); + + poly_cbd_eta1_avx2(r0, buf[0].vec); + poly_cbd_eta1_avx2(r1, buf[1].vec); + poly_cbd_eta1_avx2(r2, buf[2].vec); + poly_cbd_eta1_avx2(r3, buf[3].vec); +} + +void poly_getnoise_eta1122_4x(poly *r0, + poly *r1, + poly *r2, + poly *r3, + const uint8_t seed[32], + uint8_t nonce0, + uint8_t nonce1, + uint8_t nonce2, + uint8_t nonce3) +{ + ALIGNED_UINT8(NOISE_NBLOCKS*S2N_KYBER_512_R3_SHAKE256_RATE) buf[4]; + __m256i f; + keccakx4_state state; + + // correcting cast-align and cast-qual errors + // old version: f = _mm256_loadu_si256((__m256i *)seed); + f = _mm256_loadu_si256((const void *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); + + buf[0].coeffs[32] = nonce0; + buf[1].coeffs[32] = nonce1; + buf[2].coeffs[32] = nonce2; + buf[3].coeffs[32] = nonce3; + + shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33); + shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state); + + poly_cbd_eta1_avx2(r0, buf[0].vec); + poly_cbd_eta1_avx2(r1, buf[1].vec); + poly_cbd_eta2_avx2(r2, buf[2].vec); + poly_cbd_eta2_avx2(r3, buf[3].vec); +} + +/************************************************* +* Name: poly_ntt_avx2 +* +* Description: Computes negacyclic number-theoretic transform (NTT) of +* a polynomial in place. +* Input coefficients assumed to be in normal order, +* output coefficients are in special order that is natural +* for the vectorization. Input coefficients are assumed to be +* bounded by q in absolute value, output coefficients are bounded +* by 16118 in absolute value. +* +* Arguments: - poly *r: pointer to in/output polynomial +**************************************************/ +void poly_ntt_avx2(poly *r) +{ + ntt_avx2_asm(r->vec, qdata.vec); +} + +/************************************************* +* Name: poly_invntt_tomont_avx2 +* +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; +* Input coefficients assumed to be in special order from vectorized +* forward ntt, output in normal order. Input coefficients can be +* arbitrary 16-bit integers, output coefficients are bounded by 14870 +* in absolute value. +* +* Arguments: - poly *a: pointer to in/output polynomial +**************************************************/ +void poly_invntt_tomont_avx2(poly *r) +{ + invntt_avx2_asm(r->vec, qdata.vec); +} + +void poly_nttunpack_avx2(poly *r) +{ + nttunpack_avx2_asm(r->vec, qdata.vec); +} + +/************************************************* +* Name: poly_basemul_montgomery_avx2 +* +* Description: Multiplication of two polynomials in NTT domain. +* One of the input polynomials needs to have coefficients +* bounded by q, the other polynomial can have arbitrary +* coefficients. Output coefficients are bounded by 6656. 
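* In Kyber's incomplete NTT the multiplication is done on pairs of
* coefficients: (a[2i], a[2i+1]) and (b[2i], b[2i+1]) are multiplied as
* degree-one polynomials modulo X^2 - zeta_i, and each 16x16-bit product is
* Montgomery-reduced, which is where the implicit factor 2^-16 comes from.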
+* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_basemul_montgomery_avx2(poly *r, const poly *a, const poly *b) +{ + basemul_avx2_asm(r->vec, a->vec, b->vec, qdata.vec); +} + +/************************************************* +* Name: poly_tomont_avx2 +* +* Description: Inplace conversion of all coefficients of a polynomial +* from normal domain to Montgomery domain +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_tomont_avx2(poly *r) +{ + tomont_avx2_asm(r->vec, qdata.vec); +} + +/************************************************* +* Name: poly_reduce_avx2 +* +* Description: Applies Barrett reduction to all coefficients of a polynomial +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_reduce_avx2(poly *r) +{ + reduce_avx2_asm(r->vec, qdata.vec); +} + +/************************************************* +* Name: poly_add_avx2 +* +* Description: Add two polynomials. No modular reduction +* is performed. +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_add_avx2(poly *r, const poly *a, const poly *b) +{ + unsigned int i; + __m256i f0, f1; + + for(i=0;i<S2N_KYBER_512_R3_N/16;i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); + f0 = _mm256_add_epi16(f0, f1); + _mm256_store_si256(&r->vec[i], f0); + } +} + +/************************************************* +* Name: poly_sub_avx2 +* +* Description: Subtract two polynomials. No modular reduction +* is performed. 
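* Because no reduction is performed here (as in poly_add_avx2), repeated
* additions/subtractions can overflow int16_t unless the inputs are kept
* small; callers typically follow such sequences with poly_reduce_avx2.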
+* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_sub_avx2(poly *r, const poly *a, const poly *b) +{ + unsigned int i; + __m256i f0, f1; + + for(i=0;i<S2N_KYBER_512_R3_N/16;i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); + f0 = _mm256_sub_epi16(f0, f1); + _mm256_store_si256(&r->vec[i], f0); + } +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.h new file mode 100644 index 0000000000..bd6e857f79 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.h @@ -0,0 +1,80 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_align_avx2.h" +#include "kyber512r3_params.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#define poly S2N_KYBER_512_R3_NAMESPACE(poly) +typedef ALIGNED_INT16(S2N_KYBER_512_R3_N) poly; + +#define poly_compress_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_compress_avx2) +void poly_compress_avx2(uint8_t r[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES], const poly *a); + +#define poly_decompress_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_decompress_avx2) +void poly_decompress_avx2(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES]); + +#define poly_tobytes_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_tobytes_avx2) +void poly_tobytes_avx2(uint8_t r[S2N_KYBER_512_R3_POLYBYTES], const poly *a); + +#define poly_frombytes_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_frombytes_avx2) +void poly_frombytes_avx2(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYBYTES]); + +#define poly_frommsg_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_frommsg_avx2) +void poly_frommsg_avx2(poly *r, const uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES]); + +#define poly_tomsg_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_tomsg_avx2) +void poly_tomsg_avx2(uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES], const poly *r); + +#define poly_getnoise_eta1_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta1_avx2) +void poly_getnoise_eta1_avx2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce); + +#define poly_getnoise_eta2_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta2_avx2) +void poly_getnoise_eta2_avx2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce); + +#define poly_getnoise_eta1_4x S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta2_4x) +void poly_getnoise_eta1_4x(poly *r0, + poly *r1, + poly *r2, + poly *r3, + const uint8_t seed[32], + uint8_t nonce0, + uint8_t nonce1, + uint8_t nonce2, + uint8_t nonce3); + +#define poly_getnoise_eta1122_4x S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta1122_4x) +void poly_getnoise_eta1122_4x(poly *r0, + poly *r1, + poly *r2, + poly *r3, + const uint8_t seed[32], + uint8_t nonce0, + uint8_t nonce1, + uint8_t nonce2, + uint8_t nonce3); + +#define poly_ntt_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_ntt_avx2) +void poly_ntt_avx2(poly *r); + +#define poly_invntt_tomont_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_invntt_tomont_avx2) +void poly_invntt_tomont_avx2(poly *r); + +#define poly_nttunpack_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_nttunpack_avx2) +void poly_nttunpack_avx2(poly *r); + +#define poly_basemul_montgomery_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_basemul_montgomery_avx2) +void poly_basemul_montgomery_avx2(poly *r, const poly *a, const poly *b); + +#define poly_tomont_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_tomont_avx2) +void 
poly_tomont_avx2(poly *r); + +#define poly_reduce_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_reduce_avx2) +void poly_reduce_avx2(poly *r); + +#define poly_add_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_add_avx2) +void poly_add_avx2(poly *r, const poly *a, const poly *b); + +#define poly_sub_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_sub_avx2) +void poly_sub_avx2(poly *r, const poly *a, const poly *b); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.c new file mode 100644 index 0000000000..0a84cd092a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.c @@ -0,0 +1,186 @@ +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_poly.h" +#include "kyber512r3_polyvec.h" + +/************************************************* +* Name: polyvec_compress +* +* Description: Compress and serialize vector of polynomials +* +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials +**************************************************/ +void polyvec_compress(uint8_t r[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES], polyvec *a) { + polyvec_csubq(a); + + uint16_t t[4]; + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + for (unsigned int j = 0; j < S2N_KYBER_512_R3_N / 4; j++) { + for (unsigned int k = 0; k < 4; k++) + t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + S2N_KYBER_512_R3_Q / 2) + / S2N_KYBER_512_R3_Q) & 0x3ff; + + r[0] = (t[0] >> 0); + r[1] = (t[0] >> 8) | (t[1] << 2); + r[2] = (t[1] >> 6) | (t[2] << 4); + r[3] = (t[2] >> 4) | (t[3] << 6); + r[4] = (t[3] >> 2); + r += 5; + } + } +} + +/************************************************* +* Name: polyvec_decompress +* +* Description: De-serialize and decompress vector of polynomials; +* approximate inverse of polyvec_compress +* +* Arguments: - polyvec *r: pointer to output vector of polynomials +* - const uint8_t *a: pointer to input byte array +* (of length S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES) +**************************************************/ +void polyvec_decompress(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES]) { + uint16_t t[4]; + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + for (unsigned int j = 0; j < S2N_KYBER_512_R3_N / 4; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); + t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); + t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); + t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); + a += 5; + + for (unsigned int k = 0; k < 4; k++) { + r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * S2N_KYBER_512_R3_Q + 512) >> 10; + } + } + } +} + +/************************************************* +* Name: polyvec_tobytes +* +* Description: Serialize vector of polynomials +* +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for S2N_KYBER_512_R3_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials +**************************************************/ +void polyvec_tobytes(uint8_t r[S2N_KYBER_512_R3_POLYVECBYTES], polyvec *a) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_tobytes(r + i * S2N_KYBER_512_R3_POLYBYTES, &a->vec[i]); + } +} + +/************************************************* +* Name: polyvec_frombytes +* +* Description: De-serialize vector of polynomials; +* inverse of polyvec_tobytes +* +* Arguments: - uint8_t *r: pointer to output byte array 
+* - const polyvec *a: pointer to input vector of polynomials +* (of length S2N_KYBER_512_R3_POLYVECBYTES) +**************************************************/ +void polyvec_frombytes(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECBYTES]) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_frombytes(&r->vec[i], a + i * S2N_KYBER_512_R3_POLYBYTES); + } +} + +/************************************************* +* Name: polyvec_ntt +* +* Description: Apply forward NTT to all elements of a vector of polynomials +* +* Arguments: - polyvec *r: pointer to in/output vector of polynomials +**************************************************/ +void polyvec_ntt(polyvec *r) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_ntt(&r->vec[i]); + } +} + +/************************************************* +* Name: polyvec_invntt_tomont +* +* Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 +* +* Arguments: - polyvec *r: pointer to in/output vector of polynomials +**************************************************/ +void polyvec_invntt_tomont(polyvec *r) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_invntt_tomont(&r->vec[i]); + } +} + +/************************************************* +* Name: polyvec_pointwise_acc_montgomery +* +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. +* +* Arguments: - poly *r: pointer to output polynomial +* - const polyvec *a: pointer to first input vector of polynomials +* - const polyvec *b: pointer to second input vector of polynomials +**************************************************/ +void polyvec_pointwise_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { + poly t; + + poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (unsigned int i = 1; i < S2N_KYBER_512_R3_K; i++) { + poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]); + poly_add(r, r, &t); + } + + poly_reduce(r); +} + +/************************************************* +* Name: polyvec_reduce +* +* Description: Applies Barrett reduction to each coefficient +* of each element of a vector of polynomials +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void polyvec_reduce(polyvec *r) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_reduce(&r->vec[i]); + } +} + +/************************************************* +* Name: polyvec_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of each element of a vector of polynomials +* for details of conditional subtraction of q see comments in +* reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void polyvec_csubq(polyvec *r) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_csubq(&r->vec[i]); + } +} + +/************************************************* +* Name: polyvec_add +* +* Description: Add vectors of polynomials +* +* Arguments: - polyvec *r: pointer to output vector of polynomials +* - const polyvec *a: pointer to first input vector of polynomials +* - const polyvec *b: pointer to second input vector of polynomials +**************************************************/ +void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + 
poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.h new file mode 100644 index 0000000000..797f3c0d31 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.h @@ -0,0 +1,40 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_poly.h" + +#define polyvec S2N_KYBER_512_R3_NAMESPACE(polyvec) +typedef struct { + poly vec[S2N_KYBER_512_R3_K]; +} polyvec; + +#define polyvec_compress S2N_KYBER_512_R3_NAMESPACE(polyvec_compress) +void polyvec_compress(uint8_t r[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES], polyvec *a); + +#define polyvec_decompress S2N_KYBER_512_R3_NAMESPACE(polyvec_decompress) +void polyvec_decompress(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES]); + +#define polyvec_tobytes S2N_KYBER_512_R3_NAMESPACE(polyvec_tobytes) +void polyvec_tobytes(uint8_t r[S2N_KYBER_512_R3_POLYVECBYTES], polyvec *a); + +#define polyvec_frombytes S2N_KYBER_512_R3_NAMESPACE(polyvec_frombytes) +void polyvec_frombytes(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECBYTES]); + +#define polyvec_ntt S2N_KYBER_512_R3_NAMESPACE(polyvec_ntt) +void polyvec_ntt(polyvec *r); + +#define polyvec_invntt_tomont S2N_KYBER_512_R3_NAMESPACE(polyvec_invntt_tomont) +void polyvec_invntt_tomont(polyvec *r); + +#define polyvec_pointwise_acc_montgomery S2N_KYBER_512_R3_NAMESPACE(polyvec_pointwise_acc_montgomery) +void polyvec_pointwise_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); + +#define polyvec_reduce S2N_KYBER_512_R3_NAMESPACE(polyvec_reduce) +void polyvec_reduce(polyvec *r); + +#define polyvec_csubq S2N_KYBER_512_R3_NAMESPACE(polyvec_csubq) +void polyvec_csubq(polyvec *r); + +#define polyvec_add S2N_KYBER_512_R3_NAMESPACE(polyvec_add) +void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.c new file mode 100644 index 0000000000..8434b96d76 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.c @@ -0,0 +1,227 @@ +#include <stdint.h> +#include <string.h> +#include "kyber512r3_polyvec_avx2.h" +#include "kyber512r3_poly_avx2.h" +#include "kyber512r3_consts_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +static void poly_compress10(uint8_t r[320], const poly * restrict a) +{ + unsigned int i; + __m256i f0, f1, f2; + __m128i t0, t1; + const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]); + const __m256i v8 = _mm256_slli_epi16(v,3); + const __m256i off = _mm256_set1_epi16(15); + const __m256i shift1 = _mm256_set1_epi16(1 << 12); + const __m256i mask = _mm256_set1_epi16(1023); + const __m256i shift2 = _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1); + const __m256i sllvdidx = _mm256_set1_epi64x(12); + const __m256i shufbidx = _mm256_set_epi8( 8, 4, 3, 2, 1, 0,-1,-1,-1,-1,-1,-1,12,11,10, 9, + -1,-1,-1,-1,-1,-1,12,11,10, 9, 8, 4, 3, 2, 1, 0); + + for(i=0;i<S2N_KYBER_512_R3_N/16;i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_mullo_epi16(f0,v8); + f2 = _mm256_add_epi16(f0,off); + f0 = _mm256_slli_epi16(f0,3); + f0 = _mm256_mulhi_epi16(f0,v); + f2 = _mm256_sub_epi16(f1,f2); + f1 = _mm256_andnot_si256(f1,f2); + f1 = _mm256_srli_epi16(f1,15); + f0 = _mm256_sub_epi16(f0,f1); + f0 = _mm256_mulhrs_epi16(f0,shift1); + f0 
= _mm256_and_si256(f0,mask); + f0 = _mm256_madd_epi16(f0,shift2); + f0 = _mm256_sllv_epi32(f0,sllvdidx); + f0 = _mm256_srli_epi64(f0,12); + f0 = _mm256_shuffle_epi8(f0,shufbidx); + t0 = _mm256_castsi256_si128(f0); + t1 = _mm256_extracti128_si256(f0,1); + t0 = _mm_blend_epi16(t0,t1,0xE0); + // correcting cast-align error + // old version: _mm_storeu_si128((__m128i *)&r[20*i+ 0],t0); + _mm_storeu_si128((void *)&r[20*i+ 0],t0); + memcpy(&r[20*i+16],&t1,4); + } +} + +static void poly_decompress10(poly * restrict r, const uint8_t a[320+12]) +{ + unsigned int i; + __m256i f; + const __m256i q = _mm256_set1_epi32((S2N_KYBER_512_R3_Q << 16) + 4*S2N_KYBER_512_R3_Q); + const __m256i shufbidx = _mm256_set_epi8(11,10,10, 9, 9, 8, 8, 7, + 6, 5, 5, 4, 4, 3, 3, 2, + 9, 8, 8, 7, 7, 6, 6, 5, + 4, 3, 3, 2, 2, 1, 1, 0); + const __m256i sllvdidx = _mm256_set1_epi64x(4); + const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184); + + for(i=0;i<S2N_KYBER_512_R3_N/16;i++) { + // correcting cast-align and cast-qual errors + // old version: f = _mm256_loadu_si256((__m256i *)&a[20*i]); + f = _mm256_loadu_si256((const void *)&a[20*i]); + f = _mm256_permute4x64_epi64(f,0x94); + f = _mm256_shuffle_epi8(f,shufbidx); + f = _mm256_sllv_epi32(f,sllvdidx); + f = _mm256_srli_epi16(f,1); + f = _mm256_and_si256(f,mask); + f = _mm256_mulhrs_epi16(f,q); + _mm256_store_si256(&r->vec[i],f); + } +} + +/************************************************* +* Name: polyvec_compress_avx2 +* +* Description: Compress and serialize vector of polynomials +* +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials +**************************************************/ +void polyvec_compress_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+2], const polyvec *a) +{ + unsigned int i; + + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_compress10(&r[320*i],&a->vec[i]); +} + +/************************************************* +* Name: polyvec_decompress_avx2 +* +* Description: De-serialize and decompress vector of polynomials; +* approximate inverse of polyvec_compress_avx2 +* +* Arguments: - polyvec *r: pointer to output vector of polynomials +* - const uint8_t *a: pointer to input byte array +* (of length S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES) +**************************************************/ +void polyvec_decompress_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+12]) +{ + unsigned int i; + + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_decompress10(&r->vec[i],&a[320*i]); +} + +/************************************************* +* Name: polyvec_tobytes_avx2 +* +* Description: Serialize vector of polynomials +* +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for S2N_KYBER_512_R3_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials +**************************************************/ +void polyvec_tobytes_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECBYTES], const polyvec *a) +{ + unsigned int i; + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_tobytes_avx2(r+i*S2N_KYBER_512_R3_POLYBYTES, &a->vec[i]); +} + +/************************************************* +* Name: polyvec_frombytes_avx2 +* +* Description: De-serialize vector of polynomials; +* inverse of polyvec_tobytes_avx2 +* +* Arguments: - uint8_t *r: pointer to output byte array +* - const polyvec *a: pointer to input vector of polynomials +* (of length S2N_KYBER_512_R3_POLYVECBYTES) 
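* For Kyber512 the vector consists of S2N_KYBER_512_R3_K = 2 polynomials, so
* the input is 2 * 384 = 768 bytes; each polynomial is deserialized in turn
* with poly_frombytes_avx2.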
+**************************************************/ +void polyvec_frombytes_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECBYTES]) +{ + unsigned int i; + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_frombytes_avx2(&r->vec[i], a+i*S2N_KYBER_512_R3_POLYBYTES); +} + +/************************************************* +* Name: polyvec_ntt_avx2 +* +* Description: Apply forward NTT to all elements of a vector of polynomials +* +* Arguments: - polyvec *r: pointer to in/output vector of polynomials +**************************************************/ +void polyvec_ntt_avx2(polyvec *r) +{ + unsigned int i; + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_ntt_avx2(&r->vec[i]); +} + +/************************************************* +* Name: polyvec_invntt_tomont_avx2 +* +* Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 +* +* Arguments: - polyvec *r: pointer to in/output vector of polynomials +**************************************************/ +void polyvec_invntt_tomont_avx2(polyvec *r) +{ + unsigned int i; + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_invntt_tomont_avx2(&r->vec[i]); +} + +/************************************************* +* Name: polyvec_basemul_acc_montgomery_avx2 +* +* Description: Multiply elements in a and b in NTT domain, accumulate into r, +* and multiply by 2^-16. +* +* Arguments: - poly *r: pointer to output polynomial +* - const polyvec *a: pointer to first input vector of polynomials +* - const polyvec *b: pointer to second input vector of polynomials +**************************************************/ +void polyvec_basemul_acc_montgomery_avx2(poly *r, const polyvec *a, const polyvec *b) +{ + unsigned int i; + poly tmp; + + poly_basemul_montgomery_avx2(r,&a->vec[0],&b->vec[0]); + for(i=1;i<S2N_KYBER_512_R3_K;i++) { + poly_basemul_montgomery_avx2(&tmp,&a->vec[i],&b->vec[i]); + poly_add_avx2(r,r,&tmp); + } +} + +/************************************************* +* Name: polyvec_reduce_avx2 +* +* Description: Applies Barrett reduction to each coefficient +* of each element of a vector of polynomials; +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - polyvec *r: pointer to input/output polynomial +**************************************************/ +void polyvec_reduce_avx2(polyvec *r) +{ + unsigned int i; + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_reduce_avx2(&r->vec[i]); +} + +/************************************************* +* Name: polyvec_add_avx2 +* +* Description: Add vectors of polynomials +* +* Arguments: - polyvec *r: pointer to output vector of polynomials +* - const polyvec *a: pointer to first input vector of polynomials +* - const polyvec *b: pointer to second input vector of polynomials +**************************************************/ +void polyvec_add_avx2(polyvec *r, const polyvec *a, const polyvec *b) +{ + unsigned int i; + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_add_avx2(&r->vec[i], &a->vec[i], &b->vec[i]); +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.h new file mode 100644 index 0000000000..536e1b23d0 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.h @@ -0,0 +1,39 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_poly_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#define polyvec S2N_KYBER_512_R3_NAMESPACE(polyvec) +typedef struct{ + 
poly vec[S2N_KYBER_512_R3_K]; +} polyvec; + +#define polyvec_compress_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_compress_avx2) +void polyvec_compress_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+2], const polyvec *a); + +#define polyvec_decompress_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_decompress_avx2) +void polyvec_decompress_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+12]); + +#define polyvec_tobytes_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_tobytes_avx2) +void polyvec_tobytes_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECBYTES], const polyvec *a); + +#define polyvec_frombytes_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_frombytes_avx2) +void polyvec_frombytes_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECBYTES]); + +#define polyvec_ntt_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_ntt_avx2) +void polyvec_ntt_avx2(polyvec *r); + +#define polyvec_invntt_tomont_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_invntt_tomont_avx2) +void polyvec_invntt_tomont_avx2(polyvec *r); + +#define polyvec_basemul_acc_montgomery_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_basemul_acc_montgomery_avx2) +void polyvec_basemul_acc_montgomery_avx2(poly *r, const polyvec *a, const polyvec *b); + +#define polyvec_reduce_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_reduce_avx2) +void polyvec_reduce_avx2(polyvec *r); + +#define polyvec_add_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_add_avx2) +void polyvec_add_avx2(polyvec *r, const polyvec *a, const polyvec *b); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.c new file mode 100644 index 0000000000..6219ad7e88 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.c @@ -0,0 +1,60 @@ +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_reduce.h" + +/************************************************* +* Name: montgomery_reduce +* +* Description: Montgomery reduction; given a 32-bit integer a, computes +* 16-bit integer congruent to a * R^-1 mod q, +* where R=2^16 +* +* Arguments: - int32_t a: input integer to be reduced; +* has to be in {-q2^15,...,q2^15-1} +* +* Returns: integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q. +**************************************************/ +int16_t montgomery_reduce(int32_t a) { + int32_t t; + int16_t u; + + u = a * S2N_KYBER_512_R3_QINV; + t = (int32_t)u * S2N_KYBER_512_R3_Q; + t = a - t; + t >>= 16; + return t; +} + +/************************************************* +* Name: barrett_reduce +* +* Description: Barrett reduction; given a 16-bit integer a, computes +* 16-bit integer congruent to a mod q in {0,...,q} +* +* Arguments: - int16_t a: input integer to be reduced +* +* Returns: integer in {0,...,q} congruent to a modulo q. 
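* For example, with q = 3329 the constant v computed below is
* ((1 << 26) + 1664) / 3329 = 20159; for a = 5000 the quotient estimate is
* (20159 * 5000) >> 26 = 1, so the function returns 5000 - 1*3329 = 1671.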
+**************************************************/ +int16_t barrett_reduce(int16_t a) { + int16_t t; + const int16_t v = ((1U << 26) + S2N_KYBER_512_R3_Q / 2) / S2N_KYBER_512_R3_Q; + + t = (int32_t)v * a >> 26; + t *= S2N_KYBER_512_R3_Q; + return a - t; +} + +/************************************************* +* Name: csubq +* +* Description: Conditionallly subtract q +* +* Arguments: - int16_t x: input integer +* +* Returns: a - q if a >= q, else a +**************************************************/ +int16_t csubq(int16_t a) { + a -= S2N_KYBER_512_R3_Q; + a += (a >> 15) & S2N_KYBER_512_R3_Q; + return a; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.h new file mode 100644 index 0000000000..bab9fa54f9 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.h @@ -0,0 +1,15 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" + +#define S2N_KYBER_512_R3_QINV 62209 /* q^-1 mod 2^16 */ + +#define montgomery_reduce S2N_KYBER_512_R3_NAMESPACE(montgomery_reduce) +int16_t montgomery_reduce(int32_t a); + +#define barrett_reduce S2N_KYBER_512_R3_NAMESPACE(barrett_reduce) +int16_t barrett_reduce(int16_t a); + +#define csubq S2N_KYBER_512_R3_NAMESPACE(csubq) +int16_t csubq(int16_t x); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce_avx2.h new file mode 100644 index 0000000000..24f0ede4e0 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce_avx2.h @@ -0,0 +1,13 @@ +#pragma once + +#include "kyber512r3_params.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +#define reduce_avx2_asm S2N_KYBER_512_R3_NAMESPACE(reduce_avx2_asm) +void reduce_avx2_asm(__m256i *r, const __m256i *qdata); + +#define tomont_avx2_asm S2N_KYBER_512_R3_NAMESPACE(tomont_avx2_asm) +void tomont_avx2_asm(__m256i *r, const __m256i *qdata); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.c new file mode 100644 index 0000000000..1461e0b9b1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.c @@ -0,0 +1,420 @@ +#include <stdint.h> +#include <string.h> +#include "kyber512r3_params.h" +#include "kyber512r3_consts_avx2.h" +#include "kyber512r3_rejsample_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +//#define BMI + +#ifndef BMI +static const uint8_t idx[256][8] = { + {-1, -1, -1, -1, -1, -1, -1, -1}, + { 0, -1, -1, -1, -1, -1, -1, -1}, + { 2, -1, -1, -1, -1, -1, -1, -1}, + { 0, 2, -1, -1, -1, -1, -1, -1}, + { 4, -1, -1, -1, -1, -1, -1, -1}, + { 0, 4, -1, -1, -1, -1, -1, -1}, + { 2, 4, -1, -1, -1, -1, -1, -1}, + { 0, 2, 4, -1, -1, -1, -1, -1}, + { 6, -1, -1, -1, -1, -1, -1, -1}, + { 0, 6, -1, -1, -1, -1, -1, -1}, + { 2, 6, -1, -1, -1, -1, -1, -1}, + { 0, 2, 6, -1, -1, -1, -1, -1}, + { 4, 6, -1, -1, -1, -1, -1, -1}, + { 0, 4, 6, -1, -1, -1, -1, -1}, + { 2, 4, 6, -1, -1, -1, -1, -1}, + { 0, 2, 4, 6, -1, -1, -1, -1}, + { 8, -1, -1, -1, -1, -1, -1, -1}, + { 0, 8, -1, -1, -1, -1, -1, -1}, + { 2, 8, -1, -1, -1, -1, -1, -1}, + { 0, 2, 8, -1, -1, -1, -1, -1}, + { 4, 8, -1, -1, -1, -1, -1, -1}, + { 0, 4, 8, -1, -1, -1, -1, -1}, + { 2, 4, 8, -1, -1, -1, -1, -1}, + { 0, 2, 4, 8, -1, -1, -1, -1}, + { 6, 8, -1, -1, -1, -1, -1, -1}, + { 0, 6, 8, -1, -1, -1, -1, -1}, + { 
2, 6, 8, -1, -1, -1, -1, -1}, + { 0, 2, 6, 8, -1, -1, -1, -1}, + { 4, 6, 8, -1, -1, -1, -1, -1}, + { 0, 4, 6, 8, -1, -1, -1, -1}, + { 2, 4, 6, 8, -1, -1, -1, -1}, + { 0, 2, 4, 6, 8, -1, -1, -1}, + {10, -1, -1, -1, -1, -1, -1, -1}, + { 0, 10, -1, -1, -1, -1, -1, -1}, + { 2, 10, -1, -1, -1, -1, -1, -1}, + { 0, 2, 10, -1, -1, -1, -1, -1}, + { 4, 10, -1, -1, -1, -1, -1, -1}, + { 0, 4, 10, -1, -1, -1, -1, -1}, + { 2, 4, 10, -1, -1, -1, -1, -1}, + { 0, 2, 4, 10, -1, -1, -1, -1}, + { 6, 10, -1, -1, -1, -1, -1, -1}, + { 0, 6, 10, -1, -1, -1, -1, -1}, + { 2, 6, 10, -1, -1, -1, -1, -1}, + { 0, 2, 6, 10, -1, -1, -1, -1}, + { 4, 6, 10, -1, -1, -1, -1, -1}, + { 0, 4, 6, 10, -1, -1, -1, -1}, + { 2, 4, 6, 10, -1, -1, -1, -1}, + { 0, 2, 4, 6, 10, -1, -1, -1}, + { 8, 10, -1, -1, -1, -1, -1, -1}, + { 0, 8, 10, -1, -1, -1, -1, -1}, + { 2, 8, 10, -1, -1, -1, -1, -1}, + { 0, 2, 8, 10, -1, -1, -1, -1}, + { 4, 8, 10, -1, -1, -1, -1, -1}, + { 0, 4, 8, 10, -1, -1, -1, -1}, + { 2, 4, 8, 10, -1, -1, -1, -1}, + { 0, 2, 4, 8, 10, -1, -1, -1}, + { 6, 8, 10, -1, -1, -1, -1, -1}, + { 0, 6, 8, 10, -1, -1, -1, -1}, + { 2, 6, 8, 10, -1, -1, -1, -1}, + { 0, 2, 6, 8, 10, -1, -1, -1}, + { 4, 6, 8, 10, -1, -1, -1, -1}, + { 0, 4, 6, 8, 10, -1, -1, -1}, + { 2, 4, 6, 8, 10, -1, -1, -1}, + { 0, 2, 4, 6, 8, 10, -1, -1}, + {12, -1, -1, -1, -1, -1, -1, -1}, + { 0, 12, -1, -1, -1, -1, -1, -1}, + { 2, 12, -1, -1, -1, -1, -1, -1}, + { 0, 2, 12, -1, -1, -1, -1, -1}, + { 4, 12, -1, -1, -1, -1, -1, -1}, + { 0, 4, 12, -1, -1, -1, -1, -1}, + { 2, 4, 12, -1, -1, -1, -1, -1}, + { 0, 2, 4, 12, -1, -1, -1, -1}, + { 6, 12, -1, -1, -1, -1, -1, -1}, + { 0, 6, 12, -1, -1, -1, -1, -1}, + { 2, 6, 12, -1, -1, -1, -1, -1}, + { 0, 2, 6, 12, -1, -1, -1, -1}, + { 4, 6, 12, -1, -1, -1, -1, -1}, + { 0, 4, 6, 12, -1, -1, -1, -1}, + { 2, 4, 6, 12, -1, -1, -1, -1}, + { 0, 2, 4, 6, 12, -1, -1, -1}, + { 8, 12, -1, -1, -1, -1, -1, -1}, + { 0, 8, 12, -1, -1, -1, -1, -1}, + { 2, 8, 12, -1, -1, -1, -1, -1}, + { 0, 2, 8, 12, -1, -1, -1, -1}, + { 4, 8, 12, -1, -1, -1, -1, -1}, + { 0, 4, 8, 12, -1, -1, -1, -1}, + { 2, 4, 8, 12, -1, -1, -1, -1}, + { 0, 2, 4, 8, 12, -1, -1, -1}, + { 6, 8, 12, -1, -1, -1, -1, -1}, + { 0, 6, 8, 12, -1, -1, -1, -1}, + { 2, 6, 8, 12, -1, -1, -1, -1}, + { 0, 2, 6, 8, 12, -1, -1, -1}, + { 4, 6, 8, 12, -1, -1, -1, -1}, + { 0, 4, 6, 8, 12, -1, -1, -1}, + { 2, 4, 6, 8, 12, -1, -1, -1}, + { 0, 2, 4, 6, 8, 12, -1, -1}, + {10, 12, -1, -1, -1, -1, -1, -1}, + { 0, 10, 12, -1, -1, -1, -1, -1}, + { 2, 10, 12, -1, -1, -1, -1, -1}, + { 0, 2, 10, 12, -1, -1, -1, -1}, + { 4, 10, 12, -1, -1, -1, -1, -1}, + { 0, 4, 10, 12, -1, -1, -1, -1}, + { 2, 4, 10, 12, -1, -1, -1, -1}, + { 0, 2, 4, 10, 12, -1, -1, -1}, + { 6, 10, 12, -1, -1, -1, -1, -1}, + { 0, 6, 10, 12, -1, -1, -1, -1}, + { 2, 6, 10, 12, -1, -1, -1, -1}, + { 0, 2, 6, 10, 12, -1, -1, -1}, + { 4, 6, 10, 12, -1, -1, -1, -1}, + { 0, 4, 6, 10, 12, -1, -1, -1}, + { 2, 4, 6, 10, 12, -1, -1, -1}, + { 0, 2, 4, 6, 10, 12, -1, -1}, + { 8, 10, 12, -1, -1, -1, -1, -1}, + { 0, 8, 10, 12, -1, -1, -1, -1}, + { 2, 8, 10, 12, -1, -1, -1, -1}, + { 0, 2, 8, 10, 12, -1, -1, -1}, + { 4, 8, 10, 12, -1, -1, -1, -1}, + { 0, 4, 8, 10, 12, -1, -1, -1}, + { 2, 4, 8, 10, 12, -1, -1, -1}, + { 0, 2, 4, 8, 10, 12, -1, -1}, + { 6, 8, 10, 12, -1, -1, -1, -1}, + { 0, 6, 8, 10, 12, -1, -1, -1}, + { 2, 6, 8, 10, 12, -1, -1, -1}, + { 0, 2, 6, 8, 10, 12, -1, -1}, + { 4, 6, 8, 10, 12, -1, -1, -1}, + { 0, 4, 6, 8, 10, 12, -1, -1}, + { 2, 4, 6, 8, 10, 12, -1, -1}, + { 0, 2, 4, 6, 8, 10, 12, -1}, + {14, -1, -1, -1, -1, -1, -1, -1}, + { 0, 14, -1, 
-1, -1, -1, -1, -1}, + { 2, 14, -1, -1, -1, -1, -1, -1}, + { 0, 2, 14, -1, -1, -1, -1, -1}, + { 4, 14, -1, -1, -1, -1, -1, -1}, + { 0, 4, 14, -1, -1, -1, -1, -1}, + { 2, 4, 14, -1, -1, -1, -1, -1}, + { 0, 2, 4, 14, -1, -1, -1, -1}, + { 6, 14, -1, -1, -1, -1, -1, -1}, + { 0, 6, 14, -1, -1, -1, -1, -1}, + { 2, 6, 14, -1, -1, -1, -1, -1}, + { 0, 2, 6, 14, -1, -1, -1, -1}, + { 4, 6, 14, -1, -1, -1, -1, -1}, + { 0, 4, 6, 14, -1, -1, -1, -1}, + { 2, 4, 6, 14, -1, -1, -1, -1}, + { 0, 2, 4, 6, 14, -1, -1, -1}, + { 8, 14, -1, -1, -1, -1, -1, -1}, + { 0, 8, 14, -1, -1, -1, -1, -1}, + { 2, 8, 14, -1, -1, -1, -1, -1}, + { 0, 2, 8, 14, -1, -1, -1, -1}, + { 4, 8, 14, -1, -1, -1, -1, -1}, + { 0, 4, 8, 14, -1, -1, -1, -1}, + { 2, 4, 8, 14, -1, -1, -1, -1}, + { 0, 2, 4, 8, 14, -1, -1, -1}, + { 6, 8, 14, -1, -1, -1, -1, -1}, + { 0, 6, 8, 14, -1, -1, -1, -1}, + { 2, 6, 8, 14, -1, -1, -1, -1}, + { 0, 2, 6, 8, 14, -1, -1, -1}, + { 4, 6, 8, 14, -1, -1, -1, -1}, + { 0, 4, 6, 8, 14, -1, -1, -1}, + { 2, 4, 6, 8, 14, -1, -1, -1}, + { 0, 2, 4, 6, 8, 14, -1, -1}, + {10, 14, -1, -1, -1, -1, -1, -1}, + { 0, 10, 14, -1, -1, -1, -1, -1}, + { 2, 10, 14, -1, -1, -1, -1, -1}, + { 0, 2, 10, 14, -1, -1, -1, -1}, + { 4, 10, 14, -1, -1, -1, -1, -1}, + { 0, 4, 10, 14, -1, -1, -1, -1}, + { 2, 4, 10, 14, -1, -1, -1, -1}, + { 0, 2, 4, 10, 14, -1, -1, -1}, + { 6, 10, 14, -1, -1, -1, -1, -1}, + { 0, 6, 10, 14, -1, -1, -1, -1}, + { 2, 6, 10, 14, -1, -1, -1, -1}, + { 0, 2, 6, 10, 14, -1, -1, -1}, + { 4, 6, 10, 14, -1, -1, -1, -1}, + { 0, 4, 6, 10, 14, -1, -1, -1}, + { 2, 4, 6, 10, 14, -1, -1, -1}, + { 0, 2, 4, 6, 10, 14, -1, -1}, + { 8, 10, 14, -1, -1, -1, -1, -1}, + { 0, 8, 10, 14, -1, -1, -1, -1}, + { 2, 8, 10, 14, -1, -1, -1, -1}, + { 0, 2, 8, 10, 14, -1, -1, -1}, + { 4, 8, 10, 14, -1, -1, -1, -1}, + { 0, 4, 8, 10, 14, -1, -1, -1}, + { 2, 4, 8, 10, 14, -1, -1, -1}, + { 0, 2, 4, 8, 10, 14, -1, -1}, + { 6, 8, 10, 14, -1, -1, -1, -1}, + { 0, 6, 8, 10, 14, -1, -1, -1}, + { 2, 6, 8, 10, 14, -1, -1, -1}, + { 0, 2, 6, 8, 10, 14, -1, -1}, + { 4, 6, 8, 10, 14, -1, -1, -1}, + { 0, 4, 6, 8, 10, 14, -1, -1}, + { 2, 4, 6, 8, 10, 14, -1, -1}, + { 0, 2, 4, 6, 8, 10, 14, -1}, + {12, 14, -1, -1, -1, -1, -1, -1}, + { 0, 12, 14, -1, -1, -1, -1, -1}, + { 2, 12, 14, -1, -1, -1, -1, -1}, + { 0, 2, 12, 14, -1, -1, -1, -1}, + { 4, 12, 14, -1, -1, -1, -1, -1}, + { 0, 4, 12, 14, -1, -1, -1, -1}, + { 2, 4, 12, 14, -1, -1, -1, -1}, + { 0, 2, 4, 12, 14, -1, -1, -1}, + { 6, 12, 14, -1, -1, -1, -1, -1}, + { 0, 6, 12, 14, -1, -1, -1, -1}, + { 2, 6, 12, 14, -1, -1, -1, -1}, + { 0, 2, 6, 12, 14, -1, -1, -1}, + { 4, 6, 12, 14, -1, -1, -1, -1}, + { 0, 4, 6, 12, 14, -1, -1, -1}, + { 2, 4, 6, 12, 14, -1, -1, -1}, + { 0, 2, 4, 6, 12, 14, -1, -1}, + { 8, 12, 14, -1, -1, -1, -1, -1}, + { 0, 8, 12, 14, -1, -1, -1, -1}, + { 2, 8, 12, 14, -1, -1, -1, -1}, + { 0, 2, 8, 12, 14, -1, -1, -1}, + { 4, 8, 12, 14, -1, -1, -1, -1}, + { 0, 4, 8, 12, 14, -1, -1, -1}, + { 2, 4, 8, 12, 14, -1, -1, -1}, + { 0, 2, 4, 8, 12, 14, -1, -1}, + { 6, 8, 12, 14, -1, -1, -1, -1}, + { 0, 6, 8, 12, 14, -1, -1, -1}, + { 2, 6, 8, 12, 14, -1, -1, -1}, + { 0, 2, 6, 8, 12, 14, -1, -1}, + { 4, 6, 8, 12, 14, -1, -1, -1}, + { 0, 4, 6, 8, 12, 14, -1, -1}, + { 2, 4, 6, 8, 12, 14, -1, -1}, + { 0, 2, 4, 6, 8, 12, 14, -1}, + {10, 12, 14, -1, -1, -1, -1, -1}, + { 0, 10, 12, 14, -1, -1, -1, -1}, + { 2, 10, 12, 14, -1, -1, -1, -1}, + { 0, 2, 10, 12, 14, -1, -1, -1}, + { 4, 10, 12, 14, -1, -1, -1, -1}, + { 0, 4, 10, 12, 14, -1, -1, -1}, + { 2, 4, 10, 12, 14, -1, -1, -1}, + { 0, 2, 4, 10, 12, 14, -1, -1}, + { 6, 10, 
12, 14, -1, -1, -1, -1}, + { 0, 6, 10, 12, 14, -1, -1, -1}, + { 2, 6, 10, 12, 14, -1, -1, -1}, + { 0, 2, 6, 10, 12, 14, -1, -1}, + { 4, 6, 10, 12, 14, -1, -1, -1}, + { 0, 4, 6, 10, 12, 14, -1, -1}, + { 2, 4, 6, 10, 12, 14, -1, -1}, + { 0, 2, 4, 6, 10, 12, 14, -1}, + { 8, 10, 12, 14, -1, -1, -1, -1}, + { 0, 8, 10, 12, 14, -1, -1, -1}, + { 2, 8, 10, 12, 14, -1, -1, -1}, + { 0, 2, 8, 10, 12, 14, -1, -1}, + { 4, 8, 10, 12, 14, -1, -1, -1}, + { 0, 4, 8, 10, 12, 14, -1, -1}, + { 2, 4, 8, 10, 12, 14, -1, -1}, + { 0, 2, 4, 8, 10, 12, 14, -1}, + { 6, 8, 10, 12, 14, -1, -1, -1}, + { 0, 6, 8, 10, 12, 14, -1, -1}, + { 2, 6, 8, 10, 12, 14, -1, -1}, + { 0, 2, 6, 8, 10, 12, 14, -1}, + { 4, 6, 8, 10, 12, 14, -1, -1}, + { 0, 4, 6, 8, 10, 12, 14, -1}, + { 2, 4, 6, 8, 10, 12, 14, -1}, + { 0, 2, 4, 6, 8, 10, 12, 14} +}; +#endif + +#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) +#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) + +unsigned int rej_uniform_avx2(int16_t * restrict r, const uint8_t *buf) +{ + unsigned int ctr, pos; + uint16_t val0, val1; + uint32_t good; +#ifdef BMI + uint64_t idx0, idx1, idx2, idx3; +#endif + const __m256i bound = _mm256_load_si256(&qdata.vec[_16XQ/16]); + const __m256i ones = _mm256_set1_epi8(1); + const __m256i mask = _mm256_set1_epi16(0xFFF); + const __m256i idx8 = _mm256_set_epi8(15,14,14,13,12,11,11,10, + 9, 8, 8, 7, 6, 5, 5, 4, + 11,10,10, 9, 8, 7, 7, 6, + 5, 4, 4, 3, 2, 1, 1, 0); + __m256i f0, f1, g0, g1, g2, g3; + __m128i f, t, pilo, pihi; + + ctr = pos = 0; + while(ctr <= S2N_KYBER_512_R3_N - 32 && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 48) { + // correcting cast-align and cast-qual errors + // old version: f0 = _mm256_loadu_si256((__m256i *)&buf[pos]); + f0 = _mm256_loadu_si256((const void *)&buf[pos]); + // old version: f1 = _mm256_loadu_si256((__m256i *)&buf[pos+24]); + f1 = _mm256_loadu_si256((const void *)&buf[pos+24]); + f0 = _mm256_permute4x64_epi64(f0, 0x94); + f1 = _mm256_permute4x64_epi64(f1, 0x94); + f0 = _mm256_shuffle_epi8(f0, idx8); + f1 = _mm256_shuffle_epi8(f1, idx8); + g0 = _mm256_srli_epi16(f0, 4); + g1 = _mm256_srli_epi16(f1, 4); + f0 = _mm256_blend_epi16(f0, g0, 0xAA); + f1 = _mm256_blend_epi16(f1, g1, 0xAA); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + pos += 48; + + g0 = _mm256_cmpgt_epi16(bound, f0); + g1 = _mm256_cmpgt_epi16(bound, f1); + + g0 = _mm256_packs_epi16(g0, g1); + good = _mm256_movemask_epi8(g0); + +#ifdef BMI + idx0 = _pdep_u64(good >> 0, 0x0101010101010101); + idx1 = _pdep_u64(good >> 8, 0x0101010101010101); + idx2 = _pdep_u64(good >> 16, 0x0101010101010101); + idx3 = _pdep_u64(good >> 24, 0x0101010101010101); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + idx1 = (idx1 << 8) - idx1; + idx1 = _pext_u64(0x0E0C0A0806040200, idx1); + idx2 = (idx2 << 8) - idx2; + idx2 = _pext_u64(0x0E0C0A0806040200, idx2); + idx3 = (idx3 << 8) - idx3; + idx3 = _pext_u64(0x0E0C0A0806040200, idx3); + + g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0)); + g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1)); + g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1); + g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1); +#else + // correcting cast-align and cast-qual errors + // old version: g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >> 0) & 0xFF])); + g0 = _mm256_castsi128_si256(_mm_loadl_epi64((const void *)&idx[(good >> 0) & 0xFF])); + // old version: g1 = 
_mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >> 8) & 0xFF])); + g1 = _mm256_castsi128_si256(_mm_loadl_epi64((const void *)&idx[(good >> 8) & 0xFF])); + // old version: g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx[(good >> 16) & 0xFF]), 1); + g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((const void *)&idx[(good >> 16) & 0xFF]), 1); + // old version: g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx[(good >> 24) & 0xFF]), 1); + g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((const void *)&idx[(good >> 24) & 0xFF]), 1); +#endif + + g2 = _mm256_add_epi8(g0, ones); + g3 = _mm256_add_epi8(g1, ones); + g0 = _mm256_unpacklo_epi8(g0, g2); + g1 = _mm256_unpacklo_epi8(g1, g3); + + f0 = _mm256_shuffle_epi8(f0, g0); + f1 = _mm256_shuffle_epi8(f1, g1); + + // correcting cast-align errors + // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0)); + _mm_storeu_si128((void *)&r[ctr], _mm256_castsi256_si128(f0)); + ctr += _mm_popcnt_u32((good >> 0) & 0xFF); + // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1)); + _mm_storeu_si128((void *)&r[ctr], _mm256_extracti128_si256(f0, 1)); + ctr += _mm_popcnt_u32((good >> 16) & 0xFF); + // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1)); + _mm_storeu_si128((void *)&r[ctr], _mm256_castsi256_si128(f1)); + ctr += _mm_popcnt_u32((good >> 8) & 0xFF); + // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1)); + _mm_storeu_si128((void *)&r[ctr], _mm256_extracti128_si256(f1, 1)); + ctr += _mm_popcnt_u32((good >> 24) & 0xFF); + } + + while(ctr <= S2N_KYBER_512_R3_N - 8 && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 12) { + // correcting cast-align and cast-qual errors + // old version: f = _mm_loadu_si128((__m128i *)&buf[pos]); + f = _mm_loadu_si128((const void *)&buf[pos]); + f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8)); + t = _mm_srli_epi16(f, 4); + f = _mm_blend_epi16(f, t, 0xAA); + f = _mm_and_si128(f, _mm256_castsi256_si128(mask)); + pos += 12; + + t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f); + good = _mm_movemask_epi8(t); + +#ifdef BMI + good &= 0x5555; + idx0 = _pdep_u64(good, 0x1111111111111111); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + pilo = _mm_cvtsi64_si128(idx0); +#else + good = _pext_u32(good, 0x5555); + // correcting cast-align and cast-qual errors + // old version: pilo = _mm_loadl_epi64((__m128i *)&idx[good]); + pilo = _mm_loadl_epi64((const void *)&idx[good]); +#endif + + pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); + pilo = _mm_unpacklo_epi8(pilo, pihi); + f = _mm_shuffle_epi8(f, pilo); + // correcting cast-align error + // old version: _mm_storeu_si128((__m128i *)&r[ctr], f); + _mm_storeu_si128((void *)&r[ctr], f); + ctr += _mm_popcnt_u32(good); + } + + while(ctr < S2N_KYBER_512_R3_N && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 3) { + val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF; + val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4)); + pos += 3; + + if(val0 < S2N_KYBER_512_R3_Q) + r[ctr++] = val0; + if(val1 < S2N_KYBER_512_R3_Q && ctr < S2N_KYBER_512_R3_N) + r[ctr++] = val1; + } + + return ctr; +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.h new file mode 100644 index 0000000000..bd8a970464 --- /dev/null +++ 
b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.h @@ -0,0 +1,14 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_fips202.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#define S2N_KYBER_512_R3_XOF_BLOCKBYTES S2N_KYBER_512_R3_SHAKE128_RATE +#define S2N_KYBER_512_R3_REJ_UNIFORM_AVX_NBLOCKS ((12*S2N_KYBER_512_R3_N/8*(1 << 12)/S2N_KYBER_512_R3_Q + S2N_KYBER_512_R3_XOF_BLOCKBYTES)/S2N_KYBER_512_R3_XOF_BLOCKBYTES) +#define S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN (S2N_KYBER_512_R3_REJ_UNIFORM_AVX_NBLOCKS*S2N_KYBER_512_R3_XOF_BLOCKBYTES) + +#define rej_uniform_avx2 S2N_KYBER_512_R3_NAMESPACE(rej_uniform_avx2) +unsigned int rej_uniform_avx2(int16_t *r, const uint8_t *buf); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_shuffle_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_shuffle_avx2.S new file mode 100644 index 0000000000..ce7200e5ca --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_shuffle_avx2.S @@ -0,0 +1,272 @@ +#include "kyber512r3_consts_avx2.h" + +// The small macros (.inc files) are combined with .S files directly +/*****.include "fq.inc"*****/ +/***************************/ +.macro red16 r,rs=0,x=12 +vpmulhw %ymm1,%ymm\r,%ymm\x +.if \rs +vpmulhrsw %ymm\rs,%ymm\x,%ymm\x +.else +vpsraw $10,%ymm\x,%ymm\x +.endif +vpmullw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro csubq r,x=12 +vpsubw %ymm0,%ymm\r,%ymm\r +vpsraw $15,%ymm\r,%ymm\x +vpand %ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro caddq r,x=12 +vpsraw $15,%ymm\r,%ymm\x +vpand %ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro fqmulprecomp al,ah,b,x=12 +vpmullw %ymm\al,%ymm\b,%ymm\x +vpmulhw %ymm\ah,%ymm\b,%ymm\b +vpmulhw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\b,%ymm\b +.endm +/***************************/ + +/*****.include "shuffle.inc"*****/ +/********************************/ +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle1 r0,r1,r2,r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm +/********************************/ + +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +shuffle1 9,5,10,5 +shuffle1 8,4,9,4 +shuffle1 7,3,8,3 +shuffle1 6,11,7,11 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm9,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm3,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(nttunpack_avx2_asm) +cdecl(nttunpack_avx2_asm): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret + +ntttobytes128_avx: +#load 
+vmovdqa (%rsi),%ymm5 +vmovdqa 32(%rsi),%ymm6 +vmovdqa 64(%rsi),%ymm7 +vmovdqa 96(%rsi),%ymm8 +vmovdqa 128(%rsi),%ymm9 +vmovdqa 160(%rsi),%ymm10 +vmovdqa 192(%rsi),%ymm11 +vmovdqa 224(%rsi),%ymm12 + +#csubq +csubq 5,13 +csubq 6,13 +csubq 7,13 +csubq 8,13 +csubq 9,13 +csubq 10,13 +csubq 11,13 +csubq 12,13 + +#bitpack +vpsllw $12,%ymm6,%ymm4 +vpor %ymm4,%ymm5,%ymm4 + +vpsrlw $4,%ymm6,%ymm5 +vpsllw $8,%ymm7,%ymm6 +vpor %ymm5,%ymm6,%ymm5 + +vpsrlw $8,%ymm7,%ymm6 +vpsllw $4,%ymm8,%ymm7 +vpor %ymm6,%ymm7,%ymm6 + +vpsllw $12,%ymm10,%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +vpsrlw $4,%ymm10,%ymm8 +vpsllw $8,%ymm11,%ymm9 +vpor %ymm8,%ymm9,%ymm8 + +vpsrlw $8,%ymm11,%ymm9 +vpsllw $4,%ymm12,%ymm10 +vpor %ymm9,%ymm10,%ymm9 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 + +shuffle2 3,4,8,4 +shuffle2 6,5,3,5 +shuffle2 7,9,6,9 + +shuffle4 8,3,7,3 +shuffle4 6,4,8,4 +shuffle4 5,9,6,9 + +shuffle8 7,8,5,8 +shuffle8 6,3,7,3 +shuffle8 4,9,6,9 + +#store +vmovdqu %ymm5,(%rdi) +vmovdqu %ymm7,32(%rdi) +vmovdqu %ymm6,64(%rdi) +vmovdqu %ymm8,96(%rdi) +vmovdqu %ymm3,128(%rdi) +vmovdqu %ymm9,160(%rdi) + +ret + +.global cdecl(ntttobytes_avx2_asm) +cdecl(ntttobytes_avx2_asm): +#consts +vmovdqa _16XQ*2(%rdx),%ymm0 +call ntttobytes128_avx +add $256,%rsi +add $192,%rdi +call ntttobytes128_avx +ret + +nttfrombytes128_avx: +#load +vmovdqu (%rsi),%ymm4 +vmovdqu 32(%rsi),%ymm5 +vmovdqu 64(%rsi),%ymm6 +vmovdqu 96(%rsi),%ymm7 +vmovdqu 128(%rsi),%ymm8 +vmovdqu 160(%rsi),%ymm9 + +shuffle8 4,7,3,7 +shuffle8 5,8,4,8 +shuffle8 6,9,5,9 + +shuffle4 3,8,6,8 +shuffle4 7,5,3,5 +shuffle4 4,9,7,9 + +shuffle2 6,5,4,5 +shuffle2 8,7,6,7 +shuffle2 3,9,8,9 + +shuffle1 4,7,10,7 +shuffle1 5,8,4,8 +shuffle1 6,9,5,9 + +#bitunpack +vpsrlw $12,%ymm10,%ymm11 +vpsllw $4,%ymm7,%ymm12 +vpor %ymm11,%ymm12,%ymm11 +vpand %ymm0,%ymm10,%ymm10 +vpand %ymm0,%ymm11,%ymm11 + +vpsrlw $8,%ymm7,%ymm12 +vpsllw $8,%ymm4,%ymm13 +vpor %ymm12,%ymm13,%ymm12 +vpand %ymm0,%ymm12,%ymm12 + +vpsrlw $4,%ymm4,%ymm13 +vpand %ymm0,%ymm13,%ymm13 + +vpsrlw $12,%ymm8,%ymm14 +vpsllw $4,%ymm5,%ymm15 +vpor %ymm14,%ymm15,%ymm14 +vpand %ymm0,%ymm8,%ymm8 +vpand %ymm0,%ymm14,%ymm14 + +vpsrlw $8,%ymm5,%ymm15 +vpsllw $8,%ymm9,%ymm1 +vpor %ymm15,%ymm1,%ymm15 +vpand %ymm0,%ymm15,%ymm15 + +vpsrlw $4,%ymm9,%ymm1 +vpand %ymm0,%ymm1,%ymm1 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm11,32(%rdi) +vmovdqa %ymm12,64(%rdi) +vmovdqa %ymm13,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm14,160(%rdi) +vmovdqa %ymm15,192(%rdi) +vmovdqa %ymm1,224(%rdi) + +ret + +.global cdecl(nttfrombytes_avx2_asm) +cdecl(nttfrombytes_avx2_asm): +#consts +vmovdqa _16XMASK*2(%rdx),%ymm0 +call nttfrombytes128_avx +add $256,%rdi +add $192,%rsi +call nttfrombytes128_avx +ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric-shake.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric-shake.c new file mode 100644 index 0000000000..390a2a4e38 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric-shake.c @@ -0,0 +1,49 @@ +#include "kyber512r3_params.h" +#include "kyber512r3_fips202.h" +#include "kyber512r3_symmetric.h" +#include <stdlib.h> + +/************************************************* +* Name: kyber_shake128_absorb +* +* Description: Absorb step of the SHAKE128 specialized for the Kyber context. 
+ +* Arguments: - keccak_state *s: pointer to (uninitialized) output Keccak state +* - const uint8_t *input: pointer to S2N_KYBER_512_R3_SYMBYTES input to be absorbed into s +* - uint8_t i additional byte of input +* - uint8_t j additional byte of input +**************************************************/ +void kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y) { + size_t i; + uint8_t extseed[S2N_KYBER_512_R3_SYMBYTES + 2]; + + for (i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) { + extseed[i] = input[i]; + } + extseed[i++] = x; + extseed[i] = y; + shake128_absorb(s, extseed, S2N_KYBER_512_R3_SYMBYTES + 2); +} + +/************************************************* +* Name: shake256_prf +* +* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input +* and then generates outlen bytes of SHAKE256 output +* +* Arguments: - uint8_t *output: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t * key: pointer to the key (of length S2N_KYBER_512_R3_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) +**************************************************/ +void shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) { + uint8_t extkey[S2N_KYBER_512_R3_SYMBYTES + 1]; + size_t i; + + for (i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) { + extkey[i] = key[i]; + } + extkey[i] = nonce; + + shake256(output, outlen, extkey, S2N_KYBER_512_R3_SYMBYTES + 1); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric.h new file mode 100644 index 0000000000..e898a29450 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric.h @@ -0,0 +1,17 @@ +#pragma once + +#include "kyber512r3_params.h" +#include "kyber512r3_fips202.h" +#include <stdint.h> + +#define keccak_state S2N_KYBER_512_R3_NAMESPACE(keccak_state) +typedef shake128ctx keccak_state; + +#define xof_state S2N_KYBER_512_R3_NAMESPACE(xof_state) +typedef keccak_state xof_state; + +#define kyber_shake128_absorb S2N_KYBER_512_R3_NAMESPACE(kyber_shake128_absorb) +void kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y); + +#define shake256_prf S2N_KYBER_512_R3_NAMESPACE(shake256_prf) +void shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce); diff --git a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.c b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.c index 7381deed4e..8eda65be59 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.c +++ b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.c @@ -14,13 +14,23 @@ */ #include "s2n_pq.h" +#include "crypto/s2n_openssl.h" -static bool sikep434r2_asm_enabled = false; +static bool sikep434r3_asm_enabled = false; + +/* BIKE Round-3 code supports several levels of optimization */ +static bool bike_r3_avx2_enabled = false; +static bool bike_r3_avx512_enabled = false; +static bool bike_r3_pclmul_enabled = false; +static bool bike_r3_vpclmul_enabled = false; + +static bool kyber512r3_avx2_bmi2_enabled = false; #if defined(S2N_CPUID_AVAILABLE) /* https://en.wikipedia.org/wiki/CPUID */ #include <cpuid.h> +#define PROCESSOR_INFO_AND_FEATURES 1 #define EXTENDED_FEATURES_LEAF 7 #define EXTENDED_FEATURES_SUBLEAF_ZERO 0 @@ -35,6 +45,12 @@ static bool sikep434r2_asm_enabled = false; #define bit_BMI2 (1 << 8) #endif +/* BIKE related CPU features */ +#define EBX_BIT_AVX2 (1 << 5) +#define EBX_BIT_AVX512 (1 << 16) +#define ECX_BIT_VPCLMUL (1 << 
10) +#define ECX_BIT_PCLMUL (1 << 1) + bool s2n_get_cpuid_count(uint32_t leaf, uint32_t sub_leaf, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { /* 0x80000000 probes for extended cpuid info */ uint32_t max_level = __get_cpuid_max(leaf & 0x80000000, 0); @@ -67,56 +83,228 @@ bool s2n_cpu_supports_adx() { return (ebx & bit_ADX); } -bool s2n_cpu_supports_sikep434r2_asm() { -#if defined(S2N_SIKEP434R2_ASM) - /* The sikep434r2 assembly code always requires BMI2. If the assembly +bool s2n_cpu_supports_avx2() { + uint32_t eax, ebx, ecx, edx; + if (!s2n_get_cpuid_count(EXTENDED_FEATURES_LEAF, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) { + return false; + } + + return (ebx & EBX_BIT_AVX2); +} + +bool s2n_cpu_supports_sikep434r3_asm() { +#if defined(S2N_SIKE_P434_R3_ASM) + /* The sikep434r3 assembly code always requires BMI2. If the assembly * was compiled with support for ADX, we also require ADX at runtime. */ - #if defined(S2N_ADX) - return s2n_cpu_supports_bmi2() && s2n_cpu_supports_adx(); - #else - return s2n_cpu_supports_bmi2(); - #endif +#if defined(S2N_ADX) + return s2n_cpu_supports_bmi2() && s2n_cpu_supports_adx(); +#else + return s2n_cpu_supports_bmi2(); +#endif +#else + /* sikep434r3 assembly was not supported at compile time */ + return false; +#endif /* defined(S2N_SIKE_P434_R3_ASM) */ +} + +bool s2n_cpu_supports_bike_r3_avx2() { +#if defined(S2N_BIKE_R3_AVX2) + uint32_t eax, ebx, ecx, edx; + if (!s2n_get_cpuid_count(EXTENDED_FEATURES_LEAF, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) { + return false; + } + return ((ebx & EBX_BIT_AVX2) != 0); +#else + return false; +#endif +} + +bool s2n_cpu_supports_bike_r3_avx512() { +#if defined(S2N_BIKE_R3_AVX512) + uint32_t eax, ebx, ecx, edx; + if (!s2n_get_cpuid_count(EXTENDED_FEATURES_LEAF, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) { + return false; + } + return ((ebx & EBX_BIT_AVX512) != 0); +#else + return false; +#endif +} + +bool s2n_cpu_supports_bike_r3_pclmul() { +#if defined(S2N_BIKE_R3_PCLMUL) + uint32_t eax, ebx, ecx, edx; + if (!s2n_get_cpuid_count(PROCESSOR_INFO_AND_FEATURES, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) { + return false; + } + return ((ecx & ECX_BIT_PCLMUL) != 0); #else - /* sikep434r2 assembly was not supported at compile time */ return false; -#endif /* defined(S2N_SIKEP434R2_ASM) */ +#endif +} + +bool s2n_cpu_supports_bike_r3_vpclmul() { +#if defined(S2N_BIKE_R3_AVX512) + uint32_t eax, ebx, ecx, edx; + if (!s2n_get_cpuid_count(EXTENDED_FEATURES_LEAF, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) { + return false; + } + return ((ecx & ECX_BIT_VPCLMUL) != 0); +#else + return false; +#endif +} + +bool s2n_cpu_supports_kyber512r3_avx2_bmi2() { +#if defined(S2N_KYBER512R3_AVX2_BMI2) + return s2n_cpu_supports_bmi2() && s2n_cpu_supports_avx2(); +#else + return false; +#endif } #else /* defined(S2N_CPUID_AVAILABLE) */ /* If CPUID is not available, we cannot perform necessary run-time checks. 
*/ -bool s2n_cpu_supports_sikep434r2_asm() { +bool s2n_cpu_supports_sikep434r3_asm() { + return false; +} + +bool s2n_cpu_supports_bike_r3_avx2() { + return false; +} + +bool s2n_cpu_supports_bike_r3_avx512() { + return false; +} + +bool s2n_cpu_supports_bike_r3_pclmul() { + return false; +} + +bool s2n_cpu_supports_bike_r3_vpclmul() { + return false; +} + +bool s2n_cpu_supports_kyber512r3_avx2_bmi2() { return false; } #endif /* defined(S2N_CPUID_AVAILABLE) */ -bool s2n_sikep434r2_asm_is_enabled() { - return sikep434r2_asm_enabled; +bool s2n_sikep434r3_asm_is_enabled() { + return sikep434r3_asm_enabled; +} + +bool s2n_bike_r3_is_avx2_enabled() { + return bike_r3_avx2_enabled; +} + +bool s2n_bike_r3_is_avx512_enabled() { + return bike_r3_avx512_enabled; +} + +bool s2n_bike_r3_is_pclmul_enabled() { + return bike_r3_pclmul_enabled; +} + +bool s2n_bike_r3_is_vpclmul_enabled() { + return bike_r3_vpclmul_enabled; +} + +bool s2n_kyber512r3_is_avx2_bmi2_enabled() { + return kyber512r3_avx2_bmi2_enabled; } bool s2n_pq_is_enabled() { #if defined(S2N_NO_PQ) return false; #else - return !s2n_is_in_fips_mode(); + /* aws-lc is currently the only supported FIPS library known to support PQ. */ + return s2n_libcrypto_is_awslc() || (!s2n_is_in_fips_mode()); #endif } -S2N_RESULT s2n_disable_sikep434r2_asm() { - sikep434r2_asm_enabled = false; +S2N_RESULT s2n_disable_sikep434r3_asm() { + sikep434r3_asm_enabled = false; + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_disable_bike_r3_opt_all() { + bike_r3_avx2_enabled = false; + bike_r3_avx512_enabled = false; + bike_r3_pclmul_enabled = false; + bike_r3_vpclmul_enabled = false; return S2N_RESULT_OK; } -S2N_RESULT s2n_try_enable_sikep434r2_asm() { - if (s2n_pq_is_enabled() && s2n_cpu_supports_sikep434r2_asm()) { - sikep434r2_asm_enabled = true; +S2N_RESULT s2n_disable_kyber512r3_opt_avx2_bmi2() { + kyber512r3_avx2_bmi2_enabled = false; + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_try_enable_bike_r3_opt_pclmul() { + if (s2n_pq_is_enabled() && s2n_cpu_supports_bike_r3_pclmul()) { + bike_r3_pclmul_enabled = true; } return S2N_RESULT_OK; } -S2N_RESULT s2n_pq_init() { - ENSURE_OK(s2n_try_enable_sikep434r2_asm(), S2N_ERR_SAFETY); +S2N_RESULT s2n_try_enable_bike_r3_opt_avx2() { + /* When AVX2 is available, PCLMUL is too by default. */ + RESULT_ENSURE_OK(s2n_try_enable_bike_r3_opt_pclmul(), S2N_ERR_SAFETY); + if (s2n_pq_is_enabled() && s2n_cpu_supports_bike_r3_avx2()) { + bike_r3_avx2_enabled = true; + } + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_try_enable_bike_r3_opt_avx512() { + /* When AVX512 is available, AVX2 is too by default. */ + RESULT_ENSURE_OK(s2n_try_enable_bike_r3_opt_avx2(), S2N_ERR_SAFETY); + if (s2n_pq_is_enabled() && s2n_cpu_supports_bike_r3_avx512()) { + bike_r3_avx512_enabled = true; + } + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_try_enable_bike_r3_opt_vpclmul() { + RESULT_ENSURE_OK(s2n_try_enable_bike_r3_opt_avx512(), S2N_ERR_SAFETY); + /* Only Enable VPCLMUL if AVX512 is also supported. This is to because the BIKE R3 VPCLMUL requires 512-bit version + * of VPCLMUL, and not the 256-bit version that is available on AMD Zen 3 processors. 
*/ + if (s2n_pq_is_enabled() && s2n_cpu_supports_bike_r3_vpclmul() && s2n_bike_r3_is_avx512_enabled()) { + bike_r3_vpclmul_enabled = true; + } + return S2N_RESULT_OK; +} +S2N_RESULT s2n_try_enable_sikep434r3_asm() { + if (s2n_pq_is_enabled() && s2n_cpu_supports_sikep434r3_asm()) { + sikep434r3_asm_enabled = true; + } + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_try_enable_kyber512r3_opt_avx2_bmi2() { + if (s2n_pq_is_enabled() && s2n_cpu_supports_kyber512r3_avx2_bmi2()) { + kyber512r3_avx2_bmi2_enabled = true; + } + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_bike_r3_x86_64_opt_init() +{ + /* try_enable_vpclmul function recursively tries to enable + * all the optimizations (avx2, avx512, pclmul, vpclmul), + * so it's sufficient to call only this function. */ + RESULT_ENSURE_OK(s2n_try_enable_bike_r3_opt_vpclmul(), S2N_ERR_SAFETY); + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_pq_init() { + RESULT_ENSURE_OK(s2n_try_enable_sikep434r3_asm(), S2N_ERR_SAFETY); + RESULT_ENSURE_OK(s2n_bike_r3_x86_64_opt_init(), S2N_ERR_SAFETY); + RESULT_ENSURE_OK(s2n_try_enable_kyber512r3_opt_avx2_bmi2(), S2N_ERR_SAFETY); + return S2N_RESULT_OK; } diff --git a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.h b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.h index 7e5d93e991..2af5c4c940 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.h +++ b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.h @@ -20,8 +20,23 @@ #include "utils/s2n_safety.h" #include "crypto/s2n_fips.h" -bool s2n_sikep434r2_asm_is_enabled(void); +bool s2n_sikep434r3_asm_is_enabled(void); +S2N_RESULT s2n_disable_sikep434r3_asm(void); +S2N_RESULT s2n_try_enable_sikep434r3_asm(void); + +bool s2n_bike_r3_is_avx2_enabled(void); +bool s2n_bike_r3_is_avx512_enabled(void); +bool s2n_bike_r3_is_pclmul_enabled(void); +bool s2n_bike_r3_is_vpclmul_enabled(void); +S2N_RESULT s2n_disable_bike_r3_opt_all(void); +S2N_RESULT s2n_try_enable_bike_r3_opt_pclmul(void); +S2N_RESULT s2n_try_enable_bike_r3_opt_avx2(void); +S2N_RESULT s2n_try_enable_bike_r3_opt_avx512(void); +S2N_RESULT s2n_try_enable_bike_r3_opt_vpclmul(void); + +bool s2n_kyber512r3_is_avx2_bmi2_enabled(void); +S2N_RESULT s2n_try_enable_kyber512r3_opt_avx2_bmi2(void); +S2N_RESULT s2n_disable_kyber512r3_opt_avx2_bmi2(void); + bool s2n_pq_is_enabled(void); -S2N_RESULT s2n_disable_sikep434r2_asm(void); -S2N_RESULT s2n_try_enable_sikep434r2_asm(void); S2N_RESULT s2n_pq_init(void); diff --git a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq_random.c b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq_random.c index 845def4a31..275a3e132d 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq_random.c +++ b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq_random.c @@ -23,21 +23,21 @@ static S2N_RESULT s2n_get_random_bytes_default(uint8_t *buffer, uint32_t num_byt static s2n_get_random_bytes_callback s2n_get_random_bytes_cb = s2n_get_random_bytes_default; S2N_RESULT s2n_get_random_bytes(uint8_t *buffer, uint32_t num_bytes) { - ENSURE_REF(buffer); - GUARD_RESULT(s2n_get_random_bytes_cb(buffer, num_bytes)); + RESULT_ENSURE_REF(buffer); + RESULT_GUARD(s2n_get_random_bytes_cb(buffer, num_bytes)); return S2N_RESULT_OK; } static S2N_RESULT s2n_get_random_bytes_default(uint8_t *buffer, uint32_t num_bytes) { struct s2n_blob out = { .data = buffer, .size = num_bytes }; - GUARD_RESULT(s2n_get_private_random_data(&out)); + RESULT_GUARD(s2n_get_private_random_data(&out)); return S2N_RESULT_OK; } S2N_RESULT s2n_set_rand_bytes_callback_for_testing(s2n_get_random_bytes_callback rand_bytes_callback) { - ENSURE(s2n_in_unit_test(), 
S2N_ERR_NOT_IN_UNIT_TEST); + RESULT_ENSURE(s2n_in_unit_test(), S2N_ERR_NOT_IN_UNIT_TEST); s2n_get_random_bytes_cb = rand_bytes_callback; diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/P503_internal_r1.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/P503_internal_r1.h index f6674fa2bc..64465f19ed 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/P503_internal_r1.h +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/P503_internal_r1.h @@ -150,7 +150,7 @@ void fpdiv2_503(const digit_t* a, digit_t* c); void fpcorrection503(digit_t* a); // 503-bit Montgomery reduction, c = a mod p -void rdc_mont(const digit_t* a, digit_t* c); +void rdc_mont(const dfelm_t ma, felm_t mc); // Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^768 void fpmul503_mont(const felm_t a, const felm_t b, felm_t c); diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/fips202_r1.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/fips202_r1.h index 128a0127bf..983537c2ca 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/fips202_r1.h +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/fips202_r1.h @@ -7,7 +7,7 @@ #define SHAKE128_RATE 168 #define SHAKE256_RATE 136 -void cshake256_simple_absorb(uint64_t *s, uint16_t cstm, const unsigned char *in, unsigned long long inlen); +void cshake256_simple_absorb(uint64_t s[25], uint16_t cstm, const unsigned char *in, unsigned long long inlen); void cshake256_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen); #endif // FIPS202_R1_H diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sidh_r1.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sidh_r1.c index 7f3c63fd85..bdf2834121 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sidh_r1.c +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sidh_r1.c @@ -63,7 +63,7 @@ int random_mod_order_B(unsigned char* random_digits) unsigned long long nbytes = NBITS_TO_NBYTES(OBOB_BITS-1); clear_words((void*)random_digits, MAXWORDS_ORDER); - GUARD_AS_POSIX(s2n_get_random_bytes(random_digits, nbytes)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(random_digits, nbytes)); random_digits[nbytes-1] &= MASK_BOB; // Masking last byte return S2N_SUCCESS; diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sike_r1_kem.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sike_r1_kem.c index 3122eb6539..ee905ca74a 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sike_r1_kem.c +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sike_r1_kem.c @@ -16,13 +16,13 @@ int SIKE_P503_r1_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { // SIKE's key generation // Outputs: secret key sk (SIKE_P503_R1_SECRET_KEY_BYTES = MSG_BYTES + SECRETKEY_B_BYTES + SIKE_P503_R1_PUBLIC_KEY_BYTES bytes) // public key pk (SIKE_P503_R1_PUBLIC_KEY_BYTES bytes) - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); digit_t _sk[SECRETKEY_B_BYTES/sizeof(digit_t)]; // Generate lower portion of secret key sk <- s||SK - GUARD_AS_POSIX(s2n_get_random_bytes(sk, MSG_BYTES)); - GUARD(random_mod_order_B((unsigned char*)_sk)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(sk, MSG_BYTES)); + POSIX_GUARD(random_mod_order_B((unsigned char*)_sk)); // Generate public key pk EphemeralKeyGeneration_B(_sk, pk); @@ -40,7 +40,7 @@ int SIKE_P503_r1_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsi // Input: public key pk (SIKE_P503_R1_PUBLIC_KEY_BYTES bytes) // Outputs: shared secret ss 
(SIKE_P503_R1_SHARED_SECRET_BYTES bytes) // ciphertext message ct (SIKE_P503_R1_CIPHERTEXT_BYTES = SIKE_P503_R1_PUBLIC_KEY_BYTES + MSG_BYTES bytes) - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); const uint16_t G = 0; const uint16_t H = 1; @@ -55,7 +55,7 @@ int SIKE_P503_r1_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsi unsigned int i; // Generate ephemeralsk <- G(m||pk) mod oA - GUARD_AS_POSIX(s2n_get_random_bytes(temp, MSG_BYTES)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(temp, MSG_BYTES)); memcpy(&temp[MSG_BYTES], pk, SIKE_P503_R1_PUBLIC_KEY_BYTES); cshake256_simple(ephemeralsk.b, SECRETKEY_A_BYTES, G, temp, SIKE_P503_R1_PUBLIC_KEY_BYTES+MSG_BYTES); @@ -82,7 +82,7 @@ int SIKE_P503_r1_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, cons // Input: secret key sk (SIKE_P503_R1_SECRET_KEY_BYTES = MSG_BYTES + SECRETKEY_B_BYTES + SIKE_P503_R1_PUBLIC_KEY_BYTES bytes) // ciphertext message ct (SIKE_P503_R1_CIPHERTEXT_BYTES = SIKE_P503_R1_PUBLIC_KEY_BYTES + MSG_BYTES bytes) // Outputs: shared secret ss (SIKE_P503_R1_SHARED_SECRET_BYTES bytes) - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); const uint16_t G = 0; const uint16_t H = 1; @@ -117,9 +117,13 @@ int SIKE_P503_r1_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, cons // Generate shared secret ss <- H(m||ct) or output ss <- H(s||ct) EphemeralKeyGeneration_A(ephemeralsk_.d, c0_); - if (memcmp(c0_, ct, SIKE_P503_R1_PUBLIC_KEY_BYTES) != 0) { - memcpy(temp, sk, MSG_BYTES); - } + + // Note: This step deviates from the NIST supplied code by using constant time operations. + // We only want to copy the data if c0_ and ct are different + bool dont_copy = s2n_constant_time_equals(c0_, ct, SIKE_P503_R1_PUBLIC_KEY_BYTES); + // The last argument to s2n_constant_time_copy_or_dont is dont and thus prevents the copy when non-zero/true + s2n_constant_time_copy_or_dont(temp, sk, MSG_BYTES, dont_copy); + memcpy(&temp[MSG_BYTES], ct, SIKE_P503_R1_CIPHERTEXT_BYTES); cshake256_simple(ss, SIKE_P503_R1_SHARED_SECRET_BYTES, H, temp, SIKE_P503_R1_CIPHERTEXT_BYTES+MSG_BYTES); diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434.c deleted file mode 100644 index 4288a5d186..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434.c +++ /dev/null @@ -1,117 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: supersingular isogeny parameters and generation of functions for P434 -*********************************************************************************************/ - -#include "P434_api.h" -#include "P434_internal.h" -#include "pq-crypto/s2n_pq.h" - -// Encoding of field elements, elements over Z_order, elements over GF(p^2) and elliptic curve points: -// -------------------------------------------------------------------------------------------------- -// Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at the leftmost position (i.e., little endian format). -// Elements (a+b*i) over GF(p^2), where a and b are defined over GF(p), are encoded as {a, b}, with a in the least significant position. -// Elliptic curve points P = (x,y) are encoded as {x, y}, with x in the least significant position. 
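The constant-time change in the SIKE r1 decapsulation hunk above replaces a memcmp-guarded memcpy with s2n_constant_time_equals() and s2n_constant_time_copy_or_dont(), removing the secret-dependent branch. As a rough sketch only (not s2n's actual implementation; the helper names below are hypothetical), a mask-based equivalent of that pattern looks like this:

#include <stddef.h>
#include <stdint.h>

/* Sketch: returns 1 if a and b are equal, 0 otherwise, reading every byte
 * so the running time does not depend on where a mismatch occurs. */
static int ct_equals_sketch(const uint8_t *a, const uint8_t *b, size_t len)
{
    uint8_t diff = 0;
    for (size_t i = 0; i < len; i++) {
        diff |= (uint8_t)(a[i] ^ b[i]);           /* accumulate differences without branching */
    }
    return (int)(((uint32_t)diff - 1) >> 31);     /* 1 only when diff == 0 */
}

/* Sketch: copy src into dest only when dont == 0, using a byte mask instead of a branch. */
static void ct_copy_or_dont_sketch(uint8_t *dest, const uint8_t *src, size_t len, int dont)
{
    uint8_t mask = (uint8_t)((uint8_t)dont - 1u); /* dont=0 -> 0xFF (copy), dont=1 -> 0x00 (keep) */
    for (size_t i = 0; i < len; i++) {
        dest[i] = (uint8_t)((dest[i] & (uint8_t)~mask) | (src[i] & mask));
    }
}

In the diff itself, dont_copy carries the result of comparing the re-encrypted ciphertext against ct, and the guarded copy of sk into temp proceeds (or not) with the same memory access pattern either way.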
-// Internally, the number of digits used to represent all these elements is obtained by approximating the number of bits to the immediately greater multiple of 32. -// For example, a 434-bit field element is represented with Ceil(434 / 64) = 7 64-bit digits or Ceil(434 / 32) = 14 32-bit digits. - -// -// Curve isogeny system "SIDHp434". Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over GF(p434^2), where A=6, B=1, C=1 and p434 = 2^216*3^137-1 -// - - -// The constants p434, p434p1, and p434x2 have been duplicated in -// sikep434r2_fp_x64_asm.S. If, for any reason, the constants are changed in -// one file, they should be updated in the other file as well. -const uint64_t p434[NWORDS64_FIELD] = {0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFDC1767AE2FFFFFF, - 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x0002341F27177344}; -const uint64_t p434p1[NWORDS64_FIELD] = {0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFDC1767AE3000000, - 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x0002341F27177344}; -const uint64_t p434x2[NWORDS64_FIELD] = {0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFB82ECF5C5FFFFFF, - 0xF78CB8F062B15D47, 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688}; -// Order of Alice's subgroup -const uint64_t Alice_order[NWORDS64_ORDER] = {0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000001000000}; -// Order of Bob's subgroup -const uint64_t Bob_order[NWORDS64_ORDER] = {0x58AEA3FDC1767AE3, 0xC520567BC65C7831, 0x1773446CFC5FD681, 0x0000000002341F27}; -// Alice's generator values {XPA0 + XPA1*i, XQA0 + xQA1*i, XRA0 + XRA1*i} in GF(p434^2), expressed in Montgomery representation -const uint64_t A_gen[6 * NWORDS64_FIELD] = {0x05ADF455C5C345BF, 0x91935C5CC767AC2B, 0xAFE4E879951F0257, 0x70E792DC89FA27B1, - 0xF797F526BB48C8CD, 0x2181DB6131AF621F, 0x00000A1C08B1ECC4, // XPA0 - 0x74840EB87CDA7788, 0x2971AA0ECF9F9D0B, 0xCB5732BDF41715D5, 0x8CD8E51F7AACFFAA, - 0xA7F424730D7E419F, 0xD671EB919A179E8C, 0x0000FFA26C5A924A, // XPA1 - 0xFEC6E64588B7273B, 0xD2A626D74CBBF1C6, 0xF8F58F07A78098C7, 0xE23941F470841B03, - 0x1B63EDA2045538DD, 0x735CFEB0FFD49215, 0x0001C4CB77542876, // XQA0 - 0xADB0F733C17FFDD6, 0x6AFFBD037DA0A050, 0x680EC43DB144E02F, 0x1E2E5D5FF524E374, - 0xE2DDA115260E2995, 0xA6E4B552E2EDE508, 0x00018ECCDDF4B53E, // XQA1 - 0x01BA4DB518CD6C7D, 0x2CB0251FE3CC0611, 0x259B0C6949A9121B, 0x60E17AC16D2F82AD, - 0x3AA41F1CE175D92D, 0x413FBE6A9B9BC4F3, 0x00022A81D8D55643, // XRA0 - 0xB8ADBC70FC82E54A, 0xEF9CDDB0D5FADDED, 0x5820C734C80096A0, 0x7799994BAA96E0E4, - 0x044961599E379AF8, 0xDB2B94FBF09F27E2, 0x0000B87FC716C0C6}; // XRA1 -// Bob's generator values {XPB0, XQB0, XRB0 + XRB1*i} in GF(p434^2), expressed in Montgomery representation -const uint64_t B_gen[6 * NWORDS64_FIELD] = {0x6E5497556EDD48A3, 0x2A61B501546F1C05, 0xEB919446D049887D, 0x5864A4A69D450C4F, - 0xB883F276A6490D2B, 0x22CC287022D5F5B9, 0x0001BED4772E551F, // XPB0 - 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, - 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, // XPB1 - 0xFAE2A3F93D8B6B8E, 0x494871F51700FE1C, 0xEF1A94228413C27C, 0x498FF4A4AF60BD62, - 0xB00AD2A708267E8A, 0xF4328294E017837F, 0x000034080181D8AE, // XQB0 - 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, - 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, // XQB1 - 0x283B34FAFEFDC8E4, 0x9208F44977C3E647, 0x7DEAE962816F4E9A, 0x68A2BA8AA262EC9D, - 0x8176F112EA43F45B, 0x02106D022634F504, 0x00007E8A50F02E37, // XRB0 - 
0xB378B7C1DA22CCB1, 0x6D089C99AD1D9230, 0xEBE15711813E2369, 0x2B35A68239D48A53, - 0x445F6FD138407C93, 0xBEF93B29A3F6B54B, 0x000173FA910377D3}; // XRB1 -// Montgomery constant Montgomery_R2 = (2^448)^2 mod p434 -const uint64_t Montgomery_R2[NWORDS64_FIELD] = {0x28E55B65DCD69B30, 0xACEC7367768798C2, 0xAB27973F8311688D, 0x175CC6AF8D6C7C0B, - 0xABCD92BF2DDE347E, 0x69E16A61C7686D9A, 0x000025A89BCDD12A}; -// Value one in Montgomery representation -const uint64_t Montgomery_one[NWORDS64_FIELD] = {0x000000000000742C, 0x0000000000000000, 0x0000000000000000, 0xB90FF404FC000000, - 0xD801A4FB559FACD4, 0xE93254545F77410C, 0x0000ECEEA7BD2EDA}; - -// Fixed parameters for isogeny tree computation -const unsigned int strat_Alice[MAX_Alice - 1] = { - 48, 28, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 13, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, - 1, 1, 5, 4, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 21, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1, - 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1}; - -const unsigned int strat_Bob[MAX_Bob - 1] = { - 66, 33, 17, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 1, - 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 32, 16, 8, 4, 3, 1, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, - 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1}; - -// Setting up macro defines and including GF(p), GF(p^2), curve, isogeny and kex functions -#define fpcopy fpcopy434 -#define fpzero fpzero434 -#define fpadd fpadd434 -#define fpsub fpsub434 -#define fpneg fpneg434 -#define fpdiv2 fpdiv2_434 -#define fpcorrection fpcorrection434 -#define fpmul_mont fpmul434_mont -#define fpsqr_mont fpsqr434_mont -#define fpinv_mont fpinv434_mont -#define fpinv_chain_mont fpinv434_chain_mont -#define fp2copy fp2copy434 -#define fp2zero fp2zero434 -#define fp2add fp2add434 -#define fp2sub fp2sub434 -#define fp2neg fp2neg434 -#define fp2div2 fp2div2_434 -#define fp2correction fp2correction434 -#define fp2mul_mont fp2mul434_mont -#define fp2sqr_mont fp2sqr434_mont -#define fp2inv_mont fp2inv434_mont -#define mp_add_asm mp_add434_asm -#define mp_subaddx2_asm mp_subadd434x2_asm -#define mp_dblsubx2_asm mp_dblsub434x2_asm -#define random_mod_order_A oqs_kem_sidh_p434_random_mod_order_A -#define random_mod_order_B oqs_kem_sidh_p434_random_mod_order_B -#define EphemeralKeyGeneration_A oqs_kem_sidh_p434_EphemeralKeyGeneration_A -#define EphemeralKeyGeneration_B oqs_kem_sidh_p434_EphemeralKeyGeneration_B -#define EphemeralSecretAgreement_A oqs_kem_sidh_p434_EphemeralSecretAgreement_A -#define EphemeralSecretAgreement_B oqs_kem_sidh_p434_EphemeralSecretAgreement_B - -#include "fp.c" -#include "fpx.c" -#include "ec_isogeny.c" -#include "sidh.c" -#include "sike_r2_kem.c" diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_api.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_api.h deleted file mode 100644 index bdf3eee8cd..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_api.h +++ /dev/null @@ -1,70 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: API header file for P434 -*********************************************************************************************/ - -#ifndef 
P434_API_H -#define P434_API_H - -#include "P434_internal.h" - -/*********************** Key encapsulation mechanism API ***********************/ - -#define CRYPTO_SECRETKEYBYTES 374 // MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes -#define CRYPTO_PUBLICKEYBYTES 330 -#define CRYPTO_BYTES 16 -#define CRYPTO_CIPHERTEXTBYTES 346 // CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes - -// Encoding of keys for KEM-based isogeny system "SIKEp434" (wire format): -// ---------------------------------------------------------------------- -// Elements over GF(p434) are encoded in 55 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). -// Elements (a+b*i) over GF(p434^2), where a and b are defined over GF(p434), are encoded as {a, b}, with a in the lowest memory portion. -// -// Private keys sk consist of the concatenation of a 16-byte random value, a value in the range [0, 2^217-1] and the public key pk. In the SIKE API, -// private keys are encoded in 374 octets in little endian format. -// Public keys pk consist of 3 elements in GF(p434^2). In the SIKE API, pk is encoded in 330 octets. -// Ciphertexts ct consist of the concatenation of a public key value and a 16-byte value. In the SIKE API, ct is encoded in 330 + 16 = 346 octets. -// Shared keys ss consist of a value of 16 octets. - -/*********************** Ephemeral key exchange API ***********************/ - -// SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. It is NOT secure to use it with static keys. -// See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith, C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016. -// Extended version available at: http://eprint.iacr.org/2016/859 - -// Generation of Alice's secret key -// Outputs random value in [0, 2^216 - 1] to be used as Alice's private key -int oqs_kem_sidh_p434_random_mod_order_A(unsigned char *random_digits); - -// Generation of Bob's secret key -// Outputs random value in [0, 2^Floor(Log(2,3^137)) - 1] to be used as Bob's private key -int oqs_kem_sidh_p434_random_mod_order_B(unsigned char *random_digits); - -// Alice's ephemeral public key generation -// Input: a private key PrivateKeyA in the range [0, 2^216 - 1], stored in 27 bytes. -// Output: the public key PublicKeyA consisting of 3 GF(p434^2) elements encoded in 330 bytes. -int oqs_kem_sidh_p434_EphemeralKeyGeneration_A(const digit_t *PrivateKeyA, unsigned char *PublicKeyA); - -// Bob's ephemeral key-pair generation -// It produces a private key PrivateKeyB and computes the public key PublicKeyB. -// The private key is an integer in the range [0, 2^Floor(Log(2,3^137)) - 1], stored in 28 bytes. -// The public key consists of 3 GF(p434^2) elements encoded in 330 bytes. -int oqs_kem_sidh_p434_EphemeralKeyGeneration_B(const digit_t *PrivateKeyB, unsigned char *PublicKeyB); - -// Alice's ephemeral shared secret computation -// It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB -// Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^216 - 1], stored in 27 bytes. -// Bob's PublicKeyB consists of 3 GF(p434^2) elements encoded in 330 bytes. -// Output: a shared secret SharedSecretA that consists of one element in GF(p434^2) encoded in 110 bytes. 
-int oqs_kem_sidh_p434_EphemeralSecretAgreement_A(const digit_t *PrivateKeyA, const unsigned char *PublicKeyB, unsigned char *SharedSecretA); - -// Bob's ephemeral shared secret computation -// It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA -// Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^137)) - 1], stored in 28 bytes. -// Alice's PublicKeyA consists of 3 GF(p434^2) elements encoded in 330 bytes. -// Output: a shared secret SharedSecretB that consists of one element in GF(p434^2) encoded in 110 bytes. -int oqs_kem_sidh_p434_EphemeralSecretAgreement_B(const digit_t *PrivateKeyB, const unsigned char *PublicKeyA, unsigned char *SharedSecretB); - - -#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_internal.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_internal.h deleted file mode 100644 index 30056d455b..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_internal.h +++ /dev/null @@ -1,225 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: internal header file for P434 -*********************************************************************************************/ - -#ifndef P434_INTERNAL_H -#define P434_INTERNAL_H - -#include "config.h" - -#if (TARGET == TARGET_AMD64) -#define NWORDS_FIELD 7 // Number of words of a 434-bit field element -#define p434_ZERO_WORDS 3 // Number of "0" digits in the least significant part of p434 + 1 -#elif (TARGET == TARGET_x86) -#define NWORDS_FIELD 14 -#define p434_ZERO_WORDS 6 -#elif (TARGET == TARGET_ARM) -#define NWORDS_FIELD 14 -#define p434_ZERO_WORDS 6 -#elif (TARGET == TARGET_ARM64) -#define NWORDS_FIELD 7 -#define p434_ZERO_WORDS 3 -#endif - -// Basic constants - -#define NBITS_FIELD 434 -#define MAXBITS_FIELD 448 -#define MAXWORDS_FIELD ((MAXBITS_FIELD + RADIX - 1) / RADIX) // Max. number of words to represent field elements -#define NWORDS64_FIELD ((NBITS_FIELD + 63) / 64) // Number of 64-bit words of a 434-bit field element -#define NBITS_ORDER 256 -#define NWORDS_ORDER ((NBITS_ORDER + RADIX - 1) / RADIX) // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp. -#define NWORDS64_ORDER ((NBITS_ORDER + 63) / 64) // Number of 64-bit words of a 224-bit element -#define MAXBITS_ORDER NBITS_ORDER -#define ALICE 0 -#define BOB 1 -#define OALICE_BITS 216 -#define OBOB_BITS 218 -#define OBOB_EXPON 137 -#define MASK_ALICE 0xFF -#define MASK_BOB 0x01 -#define PRIME p434 -#define PARAM_A 6 -#define PARAM_C 1 -// Fixed parameters for isogeny tree computation -#define MAX_INT_POINTS_ALICE 7 -#define MAX_INT_POINTS_BOB 8 -#define MAX_Alice 108 -#define MAX_Bob 137 -#define MSG_BYTES 16 -#define SECRETKEY_A_BYTES ((OALICE_BITS + 7) / 8) -#define SECRETKEY_B_BYTES ((OBOB_BITS - 1 + 7) / 8) -#define FP2_ENCODED_BYTES 2 * ((NBITS_FIELD + 7) / 8) - -// SIDH's basic element definitions and point representations - -typedef digit_t felm_t[NWORDS_FIELD]; // Datatype for representing 434-bit field elements (448-bit max.) -typedef digit_t dfelm_t[2 * NWORDS_FIELD]; // Datatype for representing double-precision 2x434-bit field elements (448-bit max.) 
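The deleted P434 headers above spell out the representation being removed: on a 64-bit target a 434-bit field element is held in NWORDS_FIELD = Ceil(434/64) = 7 digits (felm_t), and a double-precision product in twice that (dfelm_t), with elements padded up to 448 bits. A tiny self-check of that arithmetic, using hypothetical local names rather than anything from the deleted code:

#include <assert.h>

int main(void)
{
    /* Hypothetical local mirror of the deleted header's constants. */
    const unsigned nbits_field = 434;                        /* NBITS_FIELD */
    const unsigned radix = 64;                               /* digit width on TARGET_AMD64 */
    const unsigned nwords_field = (nbits_field + radix - 1) / radix;

    assert(nwords_field == 7);           /* NWORDS_FIELD / NWORDS64_FIELD: one felm_t = 7 digits */
    assert(2 * nwords_field == 14);      /* dfelm_t holds a 2x434-bit double-precision value */
    assert(nwords_field * radix == 448); /* matches MAXBITS_FIELD: elements padded to 448 bits */
    return 0;
}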
-typedef struct felm_s { - felm_t e[2]; -} f2elm_t; // Datatype for representing quadratic extension field elements GF(p434^2) - -typedef struct { - f2elm_t X; - f2elm_t Z; -} point_proj; // Point representation in projective XZ Montgomery coordinates. -typedef point_proj point_proj_t[1]; - -/**************** Function prototypes ****************/ -/************* Multiprecision functions **************/ - -// Copy wordsize digits, c = a, where lng(a) = nwords -void copy_words(const digit_t *a, digit_t *c, const unsigned int nwords); - -// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit -unsigned int mp_add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); - -// 434-bit multiprecision addition, c = a+b -void mp_add434_asm(const digit_t *a, const digit_t *b, digit_t *c); - -// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit -unsigned int mp_sub(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); - -// 2x434-bit multiprecision subtraction followed by addition with p434*2^448, c = a-b+(p434*2^448) if a-b < 0, otherwise c=a-b -void mp_subaddx2_asm(const digit_t *a, const digit_t *b, digit_t *c); -void mp_subadd434x2_asm(const digit_t *a, const digit_t *b, digit_t *c); - -// Double 2x434-bit multiprecision subtraction, c = c-a-b, where c > a and c > b -void mp_dblsub434x2_asm(const digit_t *a, const digit_t *b, digit_t *c); - -// Multiprecision right shift by one -void mp_shiftr1(digit_t *x, const unsigned int nwords); - -// Digit multiplication, digit * digit -> 2-digit result -void digit_x_digit(const digit_t a, const digit_t b, digit_t *c); - -// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. -void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); - -/************ Field arithmetic functions *************/ - -// Copy of a field element, c = a -void fpcopy434(const digit_t *a, digit_t *c); - -// Zeroing a field element, a = 0 -void fpzero434(digit_t *a); - -// Modular addition, c = a+b mod p434 -extern void fpadd434(const digit_t *a, const digit_t *b, digit_t *c); -extern void fpadd434_asm(const digit_t *a, const digit_t *b, digit_t *c); - -// Modular subtraction, c = a-b mod p434 -extern void fpsub434(const digit_t *a, const digit_t *b, digit_t *c); -extern void fpsub434_asm(const digit_t *a, const digit_t *b, digit_t *c); - -// Modular negation, a = -a mod p434 -extern void fpneg434(digit_t *a); - -// Modular division by two, c = a/2 mod p434. -void fpdiv2_434(const digit_t *a, digit_t *c); - -// Modular correction to reduce field element a in [0, 2*p434-1] to [0, p434-1]. 
-void fpcorrection434(digit_t *a); - -// 434-bit Montgomery reduction, c = a mod p -void rdc_mont(const digit_t *a, digit_t *c); - -// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p434, where R=2^768 -void fpmul434_mont(const digit_t *a, const digit_t *b, digit_t *c); -void mul434_asm(const digit_t *a, const digit_t *b, digit_t *c); -void rdc434_asm(const digit_t *ma, digit_t *mc); - -// Field squaring using Montgomery arithmetic, c = a*b*R^-1 mod p434, where R=2^768 -void fpsqr434_mont(const digit_t *ma, digit_t *mc); - -// Conversion to Montgomery representation -void to_mont(const digit_t *a, digit_t *mc); - -// Conversion from Montgomery representation to standard representation -void from_mont(const digit_t *ma, digit_t *c); - -// Field inversion, a = a^-1 in GF(p434) -void fpinv434_mont(digit_t *a); - -// Chain to compute (p434-3)/4 using Montgomery arithmetic -void fpinv434_chain_mont(digit_t *a); - -/************ GF(p^2) arithmetic functions *************/ - -// Copy of a GF(p434^2) element, c = a -void fp2copy434(const f2elm_t *a, f2elm_t *c); - -// Zeroing a GF(p434^2) element, a = 0 -void fp2zero434(f2elm_t *a); - -// GF(p434^2) negation, a = -a in GF(p434^2) -void fp2neg434(f2elm_t *a); - -// GF(p434^2) addition, c = a+b in GF(p434^2) -void fp2add434(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); - -// GF(p434^2) subtraction, c = a-b in GF(p434^2) -extern void fp2sub434(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); - -// GF(p434^2) division by two, c = a/2 in GF(p434^2) -void fp2div2_434(const f2elm_t *a, f2elm_t *c); - -// Modular correction, a = a in GF(p434^2) -void fp2correction434(f2elm_t *a); - -// GF(p434^2) squaring using Montgomery arithmetic, c = a^2 in GF(p434^2) -void fp2sqr434_mont(const f2elm_t *a, f2elm_t *c); - -// GF(p434^2) multiplication using Montgomery arithmetic, c = a*b in GF(p434^2) -void fp2mul434_mont(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); - -// Conversion of a GF(p434^2) element to Montgomery representation -void to_fp2mont(const f2elm_t *a, f2elm_t *mc); - -// Conversion of a GF(p434^2) element from Montgomery representation to standard representation -void from_fp2mont(const f2elm_t *ma, f2elm_t *c); - -// GF(p434^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2) -void fp2inv434_mont(f2elm_t *a); - -/************ Elliptic curve and isogeny functions *************/ - -// Computes the j-invariant of a Montgomery curve with projective constant. -void j_inv(const f2elm_t *A, const f2elm_t *C, f2elm_t *jinv); - -// Simultaneous doubling and differential addition. -void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t *xPQ, const f2elm_t *A24); - -// Doubling of a Montgomery point in projective coordinates (X:Z). -void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24); - -// Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. -void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24, const int e); - -// Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. -void get_4_isog(const point_proj_t P, f2elm_t *A24plus, f2elm_t *C24, f2elm_t *coeff); - -// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny. -void eval_4_isog(point_proj_t P, f2elm_t *coeff); - -// Tripling of a Montgomery point in projective coordinates (X:Z). 
-void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus); - -// Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. -void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus, const int e); - -// Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. -void get_3_isog(const point_proj_t P, f2elm_t *A24minus, f2elm_t *A24plus, f2elm_t *coeff); - -// Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and a point P with coefficients given in coeff. -void eval_3_isog(point_proj_t Q, const f2elm_t *coeff); - -// 3-way simultaneous inversion -void inv_3_way(f2elm_t *z1, f2elm_t *z2, f2elm_t *z3); - -// Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. -void get_A(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xR, f2elm_t *A); - -#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/config.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/config.h deleted file mode 100644 index 6199e5a708..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/config.h +++ /dev/null @@ -1,218 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: configuration file and platform-dependent macros -*********************************************************************************************/ - -#ifndef SIKE_CONFIG_H -#define SIKE_CONFIG_H - -#include <stdint.h> -#include <stdbool.h> -#include <stddef.h> - -// Definition of operating system - -#define OS_WIN 1 -#define OS_LINUX 2 - -#if defined(_WIN32) // Microsoft Windows OS -#define OS_TARGET OS_WIN -#else -#define OS_TARGET OS_LINUX // default to Linux -#endif - -// Definition of compiler (removed in OQS) - -#define COMPILER_GCC 1 -#define COMPILER_CLANG 2 - -#if defined(__GNUC__) // GNU GCC compiler -#define COMPILER COMPILER_GCC -#elif defined(__clang__) // Clang compiler -#define COMPILER COMPILER_CLANG -#else -#error -- "Unsupported COMPILER" -#endif - -// Definition of the targeted architecture and basic data types -#define TARGET_AMD64 1 -#define TARGET_x86 2 -#define TARGET_ARM 3 -#define TARGET_ARM64 4 - -#if defined(__x86_64__) -#define TARGET TARGET_AMD64 -#define RADIX 64 -#define LOG2RADIX 6 -typedef uint64_t digit_t; // Unsigned 64-bit digit -typedef uint32_t hdigit_t; // Unsigned 32-bit digit -#elif defined(__i386__) -#define TARGET TARGET_x86 -#define RADIX 32 -#define LOG2RADIX 5 -typedef uint32_t digit_t; // Unsigned 32-bit digit -typedef uint16_t hdigit_t; // Unsigned 16-bit digit -#elif defined(__arm__) -#define TARGET TARGET_ARM -#define RADIX 32 -#define LOG2RADIX 5 -typedef uint32_t digit_t; // Unsigned 32-bit digit -typedef uint16_t hdigit_t; // Unsigned 16-bit digit -#elif defined(__aarch64__) -#define TARGET TARGET_ARM64 -#define RADIX 64 -#define LOG2RADIX 6 -typedef uint64_t digit_t; // Unsigned 64-bit digit -typedef uint32_t hdigit_t; // Unsigned 32-bit digit -#else -#error-- "Unsupported ARCHITECTURE" -#endif - -#define RADIX64 64 - -// Extended datatype support -#if !defined(S2N_SIKEP434R2_ASM) -typedef uint64_t uint128_t[2]; -#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_LINUX) -typedef unsigned uint128_t __attribute__((mode(TI))); -#elif (TARGET == TARGET_ARM64 && OS_TARGET == OS_LINUX) 
-typedef unsigned uint128_t __attribute__((mode(TI))); -#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_WIN) -typedef uint64_t uint128_t[2]; -#endif - -// Macro definitions - -#define NBITS_TO_NBYTES(nbits) (((nbits) + 7) / 8) // Conversion macro from number of bits to number of bytes -#define NBITS_TO_NWORDS(nbits) (((nbits) + (sizeof(digit_t) * 8) - 1) / (sizeof(digit_t) * 8)) // Conversion macro from number of bits to number of computer words -#define NBYTES_TO_NWORDS(nbytes) (((nbytes) + sizeof(digit_t) - 1) / sizeof(digit_t)) // Conversion macro from number of bytes to number of computer words - -// Macro to avoid compiler warnings when detecting unreferenced parameters -#define UNREFERENCED_PARAMETER(PAR) ((void) (PAR)) - -/********************** Constant-time unsigned comparisons ***********************/ - -// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise - -unsigned int is_digit_nonzero_ct(digit_t x) { // Is x != 0? - return (unsigned int) ((x | (0 - x)) >> (RADIX - 1)); -} - -unsigned int is_digit_zero_ct(digit_t x) { // Is x = 0? - return (unsigned int) (1 ^ is_digit_nonzero_ct(x)); -} - -unsigned int is_digit_lessthan_ct(digit_t x, digit_t y) { // Is x < y? - return (unsigned int) ((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX - 1)); -} - -/********************** Macros for platform-dependent operations **********************/ - -#if (!defined(S2N_SIKEP434R2_ASM)) || (TARGET == TARGET_ARM) - -// Digit multiplication -#define MUL(multiplier, multiplicand, hi, lo) \ - digit_x_digit((multiplier), (multiplicand), &(lo)); - -// Digit addition with carry -#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ - { \ - digit_t tempReg = (addend1) + (digit_t)(carryIn); \ - (sumOut) = (addend2) + tempReg; \ - (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); \ - } - -// Digit subtraction with borrow -#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ - { \ - digit_t tempReg = (minuend) - (subtrahend); \ - unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) &is_digit_zero_ct(tempReg))); \ - (differenceOut) = tempReg - (digit_t)(borrowIn); \ - (borrowOut) = borrowReg; \ - } - -// Shift right with flexible datatype -#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (DigitSize - (shift))); - -// Shift left with flexible datatype -#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (DigitSize - (shift))); - -#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_WIN) - -// Digit multiplication -#define MUL(multiplier, multiplicand, hi, lo) \ - (lo) = _umul128((multiplier), (multiplicand), (hi)); - -// Digit addition with carry -#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ - (carryOut) = _addcarry_u64((carryIn), (addend1), (addend2), &(sumOut)); - -// Digit subtraction with borrow -#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ - (borrowOut) = _subborrow_u64((borrowIn), (minuend), (subtrahend), &(differenceOut)); - -// Digit shift right -#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = __shiftright128((lowIn), (highIn), (shift)); - -// Digit shift left -#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = __shiftleft128((lowIn), (highIn), (shift)); - -// 64x64-bit multiplication -#define MUL128(multiplier, multiplicand, product) \ - (product)[0] = 
_umul128((multiplier), (multiplicand), &(product)[1]); - -// 128-bit addition with output carry -#define ADC128(addend1, addend2, carry, addition) \ - (carry) = _addcarry_u64(0, (addend1)[0], (addend2)[0], &(addition)[0]); \ - (carry) = _addcarry_u64((carry), (addend1)[1], (addend2)[1], &(addition)[1]); - -#define MULADD128(multiplier, multiplicand, addend, carry, result) \ - ; \ - { \ - uint128_t product; \ - MUL128(multiplier, multiplicand, product); \ - ADC128(addend, product, carry, result); \ - } - -#elif ((TARGET == TARGET_AMD64 || TARGET == TARGET_ARM64) && OS_TARGET == OS_LINUX) - -// Digit multiplication -#define MUL(multiplier, multiplicand, hi, lo) \ - { \ - uint128_t tempReg = (uint128_t)(multiplier) * (uint128_t)(multiplicand); \ - *(hi) = (digit_t)(tempReg >> RADIX); \ - (lo) = (digit_t) tempReg; \ - } - -// Digit addition with carry -#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ - { \ - uint128_t tempReg = (uint128_t)(addend1) + (uint128_t)(addend2) + (uint128_t)(carryIn); \ - (carryOut) = (digit_t)(tempReg >> RADIX); \ - (sumOut) = (digit_t) tempReg; \ - } - -// Digit subtraction with borrow -#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ - { \ - uint128_t tempReg = (uint128_t)(minuend) - (uint128_t)(subtrahend) - (uint128_t)(borrowIn); \ - (borrowOut) = (digit_t)(tempReg >> (sizeof(uint128_t) * 8 - 1)); \ - (differenceOut) = (digit_t) tempReg; \ - } - -// Digit shift right -#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (RADIX - (shift))); - -// Digit shift left -#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (RADIX - (shift))); - -#endif - -#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/ec_isogeny.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/ec_isogeny.c deleted file mode 100644 index 8a3f85e92b..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/ec_isogeny.c +++ /dev/null @@ -1,313 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: elliptic curve and isogeny functions -*********************************************************************************************/ - -void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24) { // Doubling of a Montgomery point in projective coordinates (X:Z). - // Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C. - // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2). 
- f2elm_t _t0, _t1; - f2elm_t *t0=&_t0, *t1=&_t1; - - fp2sub(&P->X, &P->Z, t0); // t0 = X1-Z1 - fp2add(&P->X, &P->Z, t1); // t1 = X1+Z1 - fp2sqr_mont(t0, t0); // t0 = (X1-Z1)^2 - fp2sqr_mont(t1, t1); // t1 = (X1+Z1)^2 - fp2mul_mont(C24, t0, &Q->Z); // Z2 = C24*(X1-Z1)^2 - fp2mul_mont(t1, &Q->Z, &Q->X); // X2 = C24*(X1-Z1)^2*(X1+Z1)^2 - fp2sub(t1, t0, t1); // t1 = (X1+Z1)^2-(X1-Z1)^2 - fp2mul_mont(A24plus, t1, t0); // t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] - fp2add(&Q->Z, t0, &Q->Z); // Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2 - fp2mul_mont(&Q->Z, t1, &Q->Z); // Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2] -} - -void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24, const int e) { // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. - // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C. - // Output: projective Montgomery x-coordinates Q <- (2^e)*P. - int i; - - copy_words((const digit_t *) P, (digit_t *) Q, 2 * 2 * NWORDS_FIELD); - - for (i = 0; i < e; i++) { - xDBL(Q, Q, A24plus, C24); - } -} - -void get_4_isog(const point_proj_t P, f2elm_t *A24plus, f2elm_t *C24, f2elm_t *coeff) { // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. - // Input: projective point of order four P = (X4:Z4). - // Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients - // that are used to evaluate the isogeny at a point in eval_4_isog(). - - fp2sub(&P->X, &P->Z, &coeff[1]); // coeff[1] = X4-Z4 - fp2add(&P->X, &P->Z, &coeff[2]); // coeff[2] = X4+Z4 - fp2sqr_mont(&P->Z, &coeff[0]); // coeff[0] = Z4^2 - fp2add(&coeff[0], &coeff[0], &coeff[0]); // coeff[0] = 2*Z4^2 - fp2sqr_mont(&coeff[0], C24); // C24 = 4*Z4^4 - fp2add(&coeff[0], &coeff[0], &coeff[0]); // coeff[0] = 4*Z4^2 - fp2sqr_mont(&P->X, A24plus); // A24plus = X4^2 - fp2add(A24plus, A24plus, A24plus); // A24plus = 2*X4^2 - fp2sqr_mont(A24plus, A24plus); // A24plus = 4*X4^4 -} - -void eval_4_isog(point_proj_t P, f2elm_t *coeff) { // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined - // by the 3 coefficients in coeff (computed in the function get_4_isog()). - // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z). - // Output: the projective point P = phi(P) = (X:Z) in the codomain. 
- f2elm_t _t0, _t1; - f2elm_t *t0=&_t0, *t1=&_t1; - - fp2add(&P->X, &P->Z, t0); // t0 = X+Z - fp2sub(&P->X, &P->Z, t1); // t1 = X-Z - fp2mul_mont(t0, &coeff[1], &P->X); // X = (X+Z)*coeff[1] - fp2mul_mont(t1, &coeff[2], &P->Z); // Z = (X-Z)*coeff[2] - fp2mul_mont(t0, t1, t0); // t0 = (X+Z)*(X-Z) - fp2mul_mont(t0, &coeff[0], t0); // t0 = coeff[0]*(X+Z)*(X-Z) - fp2add(&P->X, &P->Z, t1); // t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1] - fp2sub(&P->X, &P->Z, &P->Z); // Z = (X-Z)*coeff[2] - (X+Z)*coeff[1] - fp2sqr_mont(t1, t1); // t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 - fp2sqr_mont(&P->Z, &P->Z); // Z = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - fp2add(t1, t0, &P->X); // X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 - fp2sub(&P->Z, t0, t0); // t0 = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z) - fp2mul_mont(&P->X, t1, &P->X); // Xfinal - fp2mul_mont(&P->Z, t0, &P->Z); // Zfinal -} - -void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus) { // Tripling of a Montgomery point in projective coordinates (X:Z). - // Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. - // Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3). - f2elm_t _t0, _t1, _t2, _t3, _t4, _t5, _t6; - f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3, *t4=&_t4, *t5=&_t5, *t6=&_t6; - - fp2sub(&P->X, &P->Z, t0); // t0 = X-Z - fp2sqr_mont(t0, t2); // t2 = (X-Z)^2 - fp2add(&P->X, &P->Z, t1); // t1 = X+Z - fp2sqr_mont(t1, t3); // t3 = (X+Z)^2 - fp2add(t0, t1, t4); // t4 = 2*X - fp2sub(t1, t0, t0); // t0 = 2*Z - fp2sqr_mont(t4, t1); // t1 = 4*X^2 - fp2sub(t1, t3, t1); // t1 = 4*X^2 - (X+Z)^2 - fp2sub(t1, t2, t1); // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2 - fp2mul_mont(t3, A24plus, t5); // t5 = A24plus*(X+Z)^2 - fp2mul_mont(t3, t5, t3); // t3 = A24plus*(X+Z)^3 - fp2mul_mont(A24minus, t2, t6); // t6 = A24minus*(X-Z)^2 - fp2mul_mont(t2, t6, t2); // t2 = A24minus*(X-Z)^3 - fp2sub(t2, t3, t3); // t3 = A24minus*(X-Z)^3 - coeff*(X+Z)^3 - fp2sub(t5, t6, t2); // t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2 - fp2mul_mont(t1, t2, t1); // t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] - fp2add(t3, t1, t2); // t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^3 - coeff*(X+Z)^3 - fp2sqr_mont(t2, t2); // t2 = t2^2 - fp2mul_mont(t4, t2, &Q->X); // X3 = 2*X*t2 - fp2sub(t3, t1, t1); // t1 = A24minus*(X-Z)^3 - A24plus*(X+Z)^3 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] - fp2sqr_mont(t1, t1); // t1 = t1^2 - fp2mul_mont(t0, t1, &Q->Z); // Z3 = 2*Z*t1 -} - -void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus, const int e) { // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. - // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. - // Output: projective Montgomery x-coordinates Q <- (3^e)*P. - int i; - - copy_words((const digit_t *) P, (digit_t *) Q, 2 * 2 * NWORDS_FIELD); - - for (i = 0; i < e; i++) { - xTPL(Q, Q, A24minus, A24plus); - } -} - -void get_3_isog(const point_proj_t P, f2elm_t *A24minus, f2elm_t *A24plus, f2elm_t *coeff) { // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. - // Input: projective point of order three P = (X3:Z3). - // Output: the 3-isogenous Montgomery curve with projective coefficient A/C. 
- f2elm_t _t0, _t1, _t2, _t3, _t4; - f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3, *t4=&_t4; - - fp2sub(&P->X, &P->Z, &coeff[0]); // coeff0 = X-Z - fp2sqr_mont(&coeff[0], t0); // t0 = (X-Z)^2 - fp2add(&P->X, &P->Z, &coeff[1]); // coeff1 = X+Z - fp2sqr_mont(&coeff[1], t1); // t1 = (X+Z)^2 - fp2add(t0, t1, t2); // t2 = (X+Z)^2 + (X-Z)^2 - fp2add(&coeff[0], &coeff[1], t3); // t3 = 2*X - fp2sqr_mont(t3, t3); // t3 = 4*X^2 - fp2sub(t3, t2, t3); // t3 = 4*X^2 - (X+Z)^2 - (X-Z)^2 - fp2add(t1, t3, t2); // t2 = 4*X^2 - (X-Z)^2 - fp2add(t3, t0, t3); // t3 = 4*X^2 - (X+Z)^2 - fp2add(t0, t3, t4); // t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2 - fp2add(t4, t4, t4); // t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2) - fp2add(t1, t4, t4); // t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 - fp2mul_mont(t2, t4, A24minus); // A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] - fp2add(t1, t2, t4); // t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2 - fp2add(t4, t4, t4); // t4 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2) - fp2add(t0, t4, t4); // t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2 - fp2mul_mont(t3, t4, A24plus); // A24plus = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] -} - -void eval_3_isog(point_proj_t Q, const f2elm_t *coeff) { // Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and - // a point P with 2 coefficients in coeff (computed in the function get_3_isog()). - // Inputs: projective points P = (X3:Z3) and Q = (X:Z). - // Output: the projective point Q <- phi(Q) = (X3:Z3). - f2elm_t _t0, _t1, _t2; - f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2; - - fp2add(&Q->X, &Q->Z, t0); // t0 = X+Z - fp2sub(&Q->X, &Q->Z, t1); // t1 = X-Z - fp2mul_mont(t0, &coeff[0], t0); // t0 = coeff0*(X+Z) - fp2mul_mont(t1, &coeff[1], t1); // t1 = coeff1*(X-Z) - fp2add(t0, t1, t2); // t2 = coeff0*(X+Z) + coeff1*(X-Z) - fp2sub(t1, t0, t0); // t0 = coeff1*(X-Z) - coeff0*(X+Z) - fp2sqr_mont(t2, t2); // t2 = [coeff0*(X+Z) + coeff1*(X-Z)]^2 - fp2sqr_mont(t0, t0); // t0 = [coeff1*(X-Z) - coeff0*(X+Z)]^2 - fp2mul_mont(&Q->X, t2, &Q->X); // X3final = X*[coeff0*(X+Z) + coeff1*(X-Z)]^2 - fp2mul_mont(&Q->Z, t0, &Q->Z); // Z3final = Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2 -} - -void inv_3_way(f2elm_t *z1, f2elm_t *z2, f2elm_t *z3) { // 3-way simultaneous inversion - // Input: z1,z2,z3 - // Output: 1/z1,1/z2,1/z3 (override inputs). - f2elm_t _t0, _t1, _t2, _t3; - f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3; - - fp2mul_mont(z1, z2, t0); // t0 = z1*z2 - fp2mul_mont(z3, t0, t1); // t1 = z1*z2*z3 - fp2inv_mont(t1); // t1 = 1/(z1*z2*z3) - fp2mul_mont(z3, t1, t2); // t2 = 1/(z1*z2) - fp2mul_mont(t2, z2, t3); // t3 = 1/z1 - fp2mul_mont(t2, z1, z2); // z2 = 1/z2 - fp2mul_mont(t0, t1, z3); // z3 = 1/z3 - fp2copy(t3, z1); // z1 = 1/z1 -} - -void get_A(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xR, f2elm_t *A) { // Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. - // Input: the x-coordinates xP, xQ, and xR of the points P, Q and R. - // Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x. 
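inv_3_way in the removed ec_isogeny.c above is Montgomery's simultaneous-inversion trick: invert the single product z1*z2*z3 once, then recover 1/z1, 1/z2 and 1/z3 with a handful of multiplications; the key-generation code further down uses it to normalize three projective points with one field inversion. Below is a minimal sketch of the same bookkeeping over a small illustrative prime, with the one inversion done by Fermat exponentiation; the prime, helper names and test values are not the library's.

#include <stdint.h>
#include <assert.h>

#define Q 1000003ull                          /* small toy prime, not p434 */

static uint64_t powmod(uint64_t b, uint64_t e) {
    uint64_t r = 1;
    b %= Q;
    while (e) {
        if (e & 1) r = r * b % Q;
        b = b * b % Q;
        e >>= 1;
    }
    return r;
}

/* Invert z1, z2, z3 in place using a single inversion, as inv_3_way does. */
static void inv_3_way_toy(uint64_t *z1, uint64_t *z2, uint64_t *z3) {
    uint64_t t0 = *z1 * *z2 % Q;              /* t0 = z1*z2          */
    uint64_t t1 = t0 * *z3 % Q;               /* t1 = z1*z2*z3       */
    t1 = powmod(t1, Q - 2);                   /* t1 = 1/(z1*z2*z3)   */
    uint64_t t2 = *z3 * t1 % Q;               /* t2 = 1/(z1*z2)      */
    uint64_t t3 = t2 * *z2 % Q;               /* t3 = 1/z1           */
    *z2 = t2 * *z1 % Q;                       /* z2 = 1/z2           */
    *z3 = t0 * t1 % Q;                        /* z3 = 1/z3           */
    *z1 = t3;                                 /* z1 = 1/z1           */
}

int main(void) {
    uint64_t a = 1234, b = 56789, c = 999999;
    uint64_t ia = a, ib = b, ic = c;
    inv_3_way_toy(&ia, &ib, &ic);
    assert(a * ia % Q == 1 && b * ib % Q == 1 && c * ic % Q == 1);
    return 0;
}

The temporaries mirror the removed function: only one expensive inversion is paid no matter how many values are normalized.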
- f2elm_t _t0, _t1, one = {0}; - f2elm_t *t0=&_t0, *t1=&_t1; - - fpcopy((const digit_t *) &Montgomery_one, one.e[0]); - fp2add(xP, xQ, t1); // t1 = xP+xQ - fp2mul_mont(xP, xQ, t0); // t0 = xP*xQ - fp2mul_mont(xR, t1, A); // A = xR*t1 - fp2add(t0, A, A); // A = A+t0 - fp2mul_mont(t0, xR, t0); // t0 = t0*xR - fp2sub(A, &one, A); // A = A-1 - fp2add(t0, t0, t0); // t0 = t0+t0 - fp2add(t1, xR, t1); // t1 = t1+xR - fp2add(t0, t0, t0); // t0 = t0+t0 - fp2sqr_mont(A, A); // A = A^2 - fp2inv_mont(t0); // t0 = 1/t0 - fp2mul_mont(A, t0, A); // A = A*t0 - fp2sub(A, t1, A); // Afinal = A-t1 -} - -void j_inv(const f2elm_t *A, const f2elm_t *C, f2elm_t *jinv) { // Computes the j-invariant of a Montgomery curve with projective constant. - // Input: A,C in GF(p^2). - // Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery curve B*y^2=x^3+(A/C)*x^2+x or (equivalently) j-invariant of B'*y^2=C*x^3+A*x^2+C*x. - f2elm_t _t0, _t1; - f2elm_t *t0=&_t0, *t1=&_t1; - - fp2sqr_mont(A, jinv); // jinv = A^2 - fp2sqr_mont(C, t1); // t1 = C^2 - fp2add(t1, t1, t0); // t0 = t1+t1 - fp2sub(jinv, t0, t0); // t0 = jinv-t0 - fp2sub(t0, t1, t0); // t0 = t0-t1 - fp2sub(t0, t1, jinv); // jinv = t0-t1 - fp2sqr_mont(t1, t1); // t1 = t1^2 - fp2mul_mont(jinv, t1, jinv); // jinv = jinv*t1 - fp2add(t0, t0, t0); // t0 = t0+t0 - fp2add(t0, t0, t0); // t0 = t0+t0 - fp2sqr_mont(t0, t1); // t1 = t0^2 - fp2mul_mont(t0, t1, t0); // t0 = t0*t1 - fp2add(t0, t0, t0); // t0 = t0+t0 - fp2add(t0, t0, t0); // t0 = t0+t0 - fp2inv_mont(jinv); // jinv = 1/jinv - fp2mul_mont(jinv, t0, jinv); // jinv = t0*jinv -} - -void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t *xPQ, const f2elm_t *A24) { // Simultaneous doubling and differential addition. - // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4. - // Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and Q <- P+Q = (XQP:ZQP) such that = x(Q+P)=XQP/ZQP. - f2elm_t _t0, _t1, _t2; - f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2; - - fp2add(&P->X, &P->Z, t0); // t0 = XP+ZP - fp2sub(&P->X, &P->Z, t1); // t1 = XP-ZP - fp2sqr_mont(t0, &P->X); // XP = (XP+ZP)^2 - fp2sub(&Q->X, &Q->Z, t2); // t2 = XQ-ZQ - fp2correction(t2); - fp2add(&Q->X, &Q->Z, &Q->X); // XQ = XQ+ZQ - fp2mul_mont(t0, t2, t0); // t0 = (XP+ZP)*(XQ-ZQ) - fp2sqr_mont(t1, &P->Z); // ZP = (XP-ZP)^2 - fp2mul_mont(t1, &Q->X, t1); // t1 = (XP-ZP)*(XQ+ZQ) - fp2sub(&P->X, &P->Z, t2); // t2 = (XP+ZP)^2-(XP-ZP)^2 - fp2mul_mont(&P->X, &P->Z, &P->X); // XP = (XP+ZP)^2*(XP-ZP)^2 - fp2mul_mont(t2, A24, &Q->X); // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2] - fp2sub(t0, t1, &Q->Z); // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ) - fp2add(&Q->X, &P->Z, &P->Z); // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2 - fp2add(t0, t1, &Q->X); // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ) - fp2mul_mont(&P->Z, t2, &P->Z); // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2] - fp2sqr_mont(&Q->Z, &Q->Z); // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 - fp2sqr_mont(&Q->X, &Q->X); // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2 - fp2mul_mont(&Q->Z, xPQ, &Q->Z); // ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 -} - -static void swap_points(point_proj_t P, point_proj_t Q, const digit_t option) { // Swap points. 
- // If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P - for (unsigned int i = 0; i < NWORDS_FIELD; i++) { - digit_t temp = option & (P->X.e[0][i] ^ Q->X.e[0][i]); - P->X.e[0][i] = temp ^ P->X.e[0][i]; - Q->X.e[0][i] = temp ^ Q->X.e[0][i]; - temp = option & (P->Z.e[0][i] ^ Q->Z.e[0][i]); - P->Z.e[0][i] = temp ^ P->Z.e[0][i]; - Q->Z.e[0][i] = temp ^ Q->Z.e[0][i]; - temp = option & (P->X.e[1][i] ^ Q->X.e[1][i]); - P->X.e[1][i] = temp ^ P->X.e[1][i]; - Q->X.e[1][i] = temp ^ Q->X.e[1][i]; - temp = option & (P->Z.e[1][i] ^ Q->Z.e[1][i]); - P->Z.e[1][i] = temp ^ P->Z.e[1][i]; - Q->Z.e[1][i] = temp ^ Q->Z.e[1][i]; - } -} - -void LADDER3PT(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xPQ, const digit_t *m, const unsigned int AliceOrBob, point_proj_t R, const f2elm_t *A) { - point_proj_t R0 = {0}, R2 = {0}; - f2elm_t _A24 = {0}; - f2elm_t *A24=&_A24; - digit_t mask; - int i, nbits, swap, prevbit = 0; - - if (AliceOrBob == ALICE) { - nbits = OALICE_BITS; - } else { - nbits = OBOB_BITS - 1; - } - - // Initializing constant - fpcopy((const digit_t *) &Montgomery_one, A24->e[0]); - fp2add(A24, A24, A24); - fp2add(A, A24, A24); - fp2div2(A24, A24); - fp2div2(A24, A24); // A24 = (A+2)/4 - - // Initializing points - fp2copy(xQ, &R0->X); - fpcopy((const digit_t *) &Montgomery_one, (digit_t *) R0->Z.e); - fp2copy(xPQ, &R2->X); - fpcopy((const digit_t *) &Montgomery_one, (digit_t *) R2->Z.e); - fp2copy(xP, &R->X); - fpcopy((const digit_t *) &Montgomery_one, (digit_t *) R->Z.e); - fpzero((digit_t *) (R->Z.e)[1]); - - // Main loop - for (i = 0; i < nbits; i++) { - int bit = (m[i >> LOG2RADIX] >> (i & (RADIX - 1))) & 1; - swap = bit ^ prevbit; - prevbit = bit; - mask = 0 - (digit_t) swap; - - swap_points(R, R2, mask); - xDBLADD(R0, R2, &R->X, A24); - fp2mul_mont(&R2->X, &R->Z, &R2->X); - } - swap = 0 ^ prevbit; - mask = 0 - (digit_t) swap; - swap_points(R, R2, mask); -} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.h deleted file mode 100644 index 1196bff2c0..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef FIPS202_H -#define FIPS202_H - -#define SHAKE256_RATE 136 - -/** Data structure for the state of the SHAKE-256 non-incremental hashing API. */ -typedef struct { -/** Internal state. */ - uint64_t ctx[25]; -} shake256_ctx; - -void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen); - -#endif // FIPS202_H diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fp.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fp.c deleted file mode 100644 index 0e09ce25a0..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fp.c +++ /dev/null @@ -1,241 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: Portable C and x86_64 ASM functions for modular arithmetic for P434 -*********************************************************************************************/ - -#include "P434_internal.h" - -// Modular addition, c = a+b mod p434. 
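swap_points and the LADDER3PT loop in the removed ec_isogeny.c above select between two points without branching on the secret scalar: the current bit is stretched into an all-zero or all-one mask (mask = 0 - swap) and applied with XOR, so the instruction sequence and memory accesses are identical whichever value the bit has. A minimal, self-contained sketch of that idiom follows; the word type and test values are illustrative only.

#include <stdint.h>
#include <assert.h>

typedef uint64_t digit_t;                      /* stand-in for the library's digit type */

/* Swap *a and *b iff mask is all ones; mask must be 0 or ~0. */
static void cswap(digit_t *a, digit_t *b, digit_t mask) {
    digit_t t = mask & (*a ^ *b);              /* t is 0 when mask is 0 */
    *a ^= t;
    *b ^= t;
}

int main(void) {
    digit_t x = 0x1111, y = 0x2222;

    cswap(&x, &y, 0);                          /* bit = 0: nothing moves */
    assert(x == 0x1111 && y == 0x2222);

    digit_t bit = 1;                           /* secret bit             */
    cswap(&x, &y, (digit_t)0 - bit);           /* bit = 1: values swap   */
    assert(x == 0x2222 && y == 0x1111);
    return 0;
}

The removed routine applies the same three XORs to every word of both GF(p^2) coordinates of X and Z.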
-// Inputs: a, b in [0, 2*p434-1] -// Output: c in [0, 2*p434-1] -void fpadd434(const digit_t *a, const digit_t *b, digit_t *c) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - fpadd434_asm(a, b, c); - return; - } -#endif - - unsigned int i, carry = 0; - digit_t mask; - - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(carry, a[i], b[i], carry, c[i]); - } - - carry = 0; - for (i = 0; i < NWORDS_FIELD; i++) { - SUBC(carry, c[i], ((const digit_t *) p434x2)[i], carry, c[i]); - } - mask = 0 - (digit_t) carry; - - carry = 0; - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(carry, c[i], ((const digit_t *) p434x2)[i] & mask, carry, c[i]); - } -} - -// Modular subtraction, c = a-b mod p434. -// Inputs: a, b in [0, 2*p434-1] -// Output: c in [0, 2*p434-1] -void fpsub434(const digit_t *a, const digit_t *b, digit_t *c) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - fpsub434_asm(a, b, c); - return; - } -#endif - - unsigned int i, borrow = 0; - digit_t mask; - - for (i = 0; i < NWORDS_FIELD; i++) { - SUBC(borrow, a[i], b[i], borrow, c[i]); - } - mask = 0 - (digit_t) borrow; - - borrow = 0; - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(borrow, c[i], ((const digit_t *) p434x2)[i] & mask, borrow, c[i]); - } -} - -// Modular negation, a = -a mod p434. -// Input/output: a in [0, 2*p434-1] -void fpneg434(digit_t *a) { - unsigned int i, borrow = 0; - - for (i = 0; i < NWORDS_FIELD; i++) { - SUBC(borrow, ((const digit_t *) p434x2)[i], a[i], borrow, a[i]); - } -} - -// Modular division by two, c = a/2 mod p434. -// Input : a in [0, 2*p434-1] -// Output: c in [0, 2*p434-1] -void fpdiv2_434(const digit_t *a, digit_t *c) { - unsigned int i, carry = 0; - digit_t mask; - - mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p434 - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(carry, a[i], ((const digit_t *) p434)[i] & mask, carry, c[i]); - } - - mp_shiftr1(c, NWORDS_FIELD); -} - -// Modular correction to reduce field element a in [0, 2*p434-1] to [0, p434-1]. -void fpcorrection434(digit_t *a) { - unsigned int i, borrow = 0; - digit_t mask; - - for (i = 0; i < NWORDS_FIELD; i++) { - SUBC(borrow, a[i], ((const digit_t *) p434)[i], borrow, a[i]); - } - mask = 0 - (digit_t) borrow; - - borrow = 0; - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(borrow, a[i], ((const digit_t *) p434)[i] & mask, borrow, a[i]); - } -} - -// Digit multiplication, digit * digit -> 2-digit result -void digit_x_digit(const digit_t a, const digit_t b, digit_t *c) { - register digit_t al, ah, bl, bh, temp; - digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; - digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t) * 4), mask_high = (digit_t)(-1) << (sizeof(digit_t) * 4); - - al = a & mask_low; // Low part - ah = a >> (sizeof(digit_t) * 4); // High part - bl = b & mask_low; - bh = b >> (sizeof(digit_t) * 4); - - albl = al * bl; - albh = al * bh; - ahbl = ah * bl; - ahbh = ah * bh; - c[0] = albl & mask_low; // C00 - - res1 = albl >> (sizeof(digit_t) * 4); - res2 = ahbl & mask_low; - res3 = albh & mask_low; - temp = res1 + res2 + res3; - carry = temp >> (sizeof(digit_t) * 4); - c[0] ^= temp << (sizeof(digit_t) * 4); // C01 - - res1 = ahbl >> (sizeof(digit_t) * 4); - res2 = albh >> (sizeof(digit_t) * 4); - res3 = ahbh & mask_low; - temp = res1 + res2 + res3 + carry; - c[1] = temp & mask_low; // C10 - carry = temp & mask_high; - c[1] ^= (ahbh & mask_high) + carry; // C11 -} - -// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. 
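The field routines above (fpadd434, fpsub434 and fpcorrection434) avoid secret-dependent branches in the same way: do the raw add or subtract, turn the final carry or borrow bit into a mask with 0 - (digit_t) borrow, and add the masked modulus back in one unconditional pass. A one-word sketch of that correction, assuming an illustrative 32-bit modulus rather than the multi-word p434:

#include <stdint.h>
#include <assert.h>

#define TOY_P 0xFFFFFFFBu                      /* toy modulus, not p434 */

/* c = a + b mod TOY_P for a, b in [0, TOY_P-1], with no data-dependent branch. */
static uint32_t modadd_ct(uint32_t a, uint32_t b) {
    uint64_t t = (uint64_t)a + b;              /* raw sum, at most 2*TOY_P - 2      */
    uint64_t d = t - TOY_P;                    /* provisional subtraction           */
    uint32_t borrow = (uint32_t)(d >> 63);     /* 1 iff the subtraction underflowed */
    uint32_t mask = 0 - borrow;                /* all ones iff TOY_P must go back   */
    return (uint32_t)(d + (TOY_P & mask));
}

int main(void) {
    assert(modadd_ct(5, 7) == 12);
    assert(modadd_ct(TOY_P - 1, 1) == 0);
    assert(modadd_ct(TOY_P - 1, TOY_P - 1) == TOY_P - 2);
    return 0;
}

The removed code does exactly this across NWORDS_FIELD words, chaining the carries with the ADDC/SUBC macros.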
-void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - UNREFERENCED_PARAMETER(nwords); - mul434_asm(a, b, c); - return; - } -#endif - - unsigned int i, j, carry; - digit_t t = 0, u = 0, v = 0, UV[2]; - - for (i = 0; i < nwords; i++) { - for (j = 0; j <= i; j++) { - MUL(a[j], b[i - j], UV + 1, UV[0]); - ADDC(0, UV[0], v, carry, v); - ADDC(carry, UV[1], u, carry, u); - t += carry; - } - c[i] = v; - v = u; - u = t; - t = 0; - } - - for (i = nwords; i < 2 * nwords - 1; i++) { - for (j = i - nwords + 1; j < nwords; j++) { - MUL(a[j], b[i - j], UV + 1, UV[0]); - ADDC(0, UV[0], v, carry, v); - ADDC(carry, UV[1], u, carry, u); - t += carry; - } - c[i] = v; - v = u; - u = t; - t = 0; - } - c[2 * nwords - 1] = v; -} - -// Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. -// mc = ma*R^-1 mod p434x2, where R = 2^448. -// If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. -// ma is assumed to be in Montgomery representation. -void rdc_mont(const digit_t *ma, digit_t *mc) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - rdc434_asm(ma, mc); - return; - } -#endif - - unsigned int i, j, carry, count = p434_ZERO_WORDS; - digit_t UV[2], t = 0, u = 0, v = 0; - - for (i = 0; i < NWORDS_FIELD; i++) { - mc[i] = 0; - } - - for (i = 0; i < NWORDS_FIELD; i++) { - for (j = 0; j < i; j++) { - if (j < (i - p434_ZERO_WORDS + 1)) { - MUL(mc[j], ((const digit_t *) p434p1)[i - j], UV + 1, UV[0]); - ADDC(0, UV[0], v, carry, v); - ADDC(carry, UV[1], u, carry, u); - t += carry; - } - } - ADDC(0, v, ma[i], carry, v); - ADDC(carry, u, 0, carry, u); - t += carry; - mc[i] = v; - v = u; - u = t; - t = 0; - } - - for (i = NWORDS_FIELD; i < 2 * NWORDS_FIELD - 1; i++) { - if (count > 0) { - count -= 1; - } - for (j = i - NWORDS_FIELD + 1; j < NWORDS_FIELD; j++) { - if (j < (NWORDS_FIELD - count)) { - MUL(mc[j], ((const digit_t *) p434p1)[i - j], UV + 1, UV[0]); - ADDC(0, UV[0], v, carry, v); - ADDC(carry, UV[1], u, carry, u); - t += carry; - } - } - ADDC(0, v, ma[i], carry, v); - ADDC(carry, u, 0, carry, u); - t += carry; - mc[i - NWORDS_FIELD] = v; - v = u; - u = t; - t = 0; - } - - /* `carry` isn't read after this, but it's still a necessary argument to the macro */ - /* cppcheck-suppress unreadVariable */ - ADDC(0, v, ma[2 * NWORDS_FIELD - 1], carry, v); - mc[NWORDS_FIELD - 1] = v; -} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fpx.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fpx.c deleted file mode 100644 index e5b356b93b..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fpx.c +++ /dev/null @@ -1,387 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: core functions over GF(p) and GF(p^2) -*********************************************************************************************/ - -// Conversion of GF(p^2) element from Montgomery to standard representation, and encoding by removing leading 0 bytes -void fp2_encode(const f2elm_t *x, unsigned char *enc) { - unsigned int i; - f2elm_t t; - - from_fp2mont(x, &t); - for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) { - enc[i] = ((unsigned char *) t.e)[i]; - enc[i + FP2_ENCODED_BYTES / 2] = ((unsigned char *) t.e)[i + MAXBITS_FIELD / 8]; - } -} - -// Parse byte sequence back into GF(p^2) element, and conversion to Montgomery 
representation -void fp2_decode(const unsigned char *enc, f2elm_t *x) { - unsigned int i; - - for (i = 0; i < 2 * (MAXBITS_FIELD / 8); i++) - ((unsigned char *) x->e)[i] = 0; - for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) { - ((unsigned char *) x->e)[i] = enc[i]; - ((unsigned char *) x->e)[i + MAXBITS_FIELD / 8] = enc[i + FP2_ENCODED_BYTES / 2]; - } - to_fp2mont(x, x); -} - -// Copy a field element, c = a. -__inline void fpcopy(const felm_t a, felm_t c) { - unsigned int i; - - for (i = 0; i < NWORDS_FIELD; i++) - c[i] = a[i]; -} - -// Zero a field element, a = 0. -__inline void fpzero(felm_t a) { - unsigned int i; - - for (i = 0; i < NWORDS_FIELD; i++) - a[i] = 0; -} - -// Conversion to Montgomery representation, -// mc = a*R^2*R^(-1) mod p = a*R mod p, where a in [0, p-1]. -// The Montgomery constant R^2 mod p is the global value "Montgomery_R2". -void to_mont(const felm_t a, felm_t mc) { - fpmul_mont(a, (const digit_t *) &Montgomery_R2, mc); -} - -// Conversion from Montgomery representation to standard representation, -// c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1]. -void from_mont(const felm_t ma, felm_t c) { - digit_t one[NWORDS_FIELD] = {0}; - - one[0] = 1; - fpmul_mont(ma, one, c); - fpcorrection(c); -} - -// Copy wordsize digits, c = a, where lng(a) = nwords. -void copy_words(const digit_t *a, digit_t *c, const unsigned int nwords) { - unsigned int i; - - for (i = 0; i < nwords; i++) - c[i] = a[i]; -} - -// Multiprecision multiplication, c = a*b mod p. -void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc) { - dfelm_t temp = {0}; - - mp_mul(ma, mb, temp, NWORDS_FIELD); - rdc_mont(temp, mc); -} - -// Multiprecision squaring, c = a^2 mod p. -void fpsqr_mont(const felm_t ma, felm_t mc) { - dfelm_t temp = {0}; - - mp_mul(ma, ma, temp, NWORDS_FIELD); - rdc_mont(temp, mc); -} - -// Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p. -void fpinv_mont(felm_t a) { - felm_t tt; - - fpcopy(a, tt); - fpinv_chain_mont(tt); - fpsqr_mont(tt, tt); - fpsqr_mont(tt, tt); - fpmul_mont(a, tt, a); -} - -// Copy a GF(p^2) element, c = a. -void fp2copy(const f2elm_t *a, f2elm_t *c) { - fpcopy(a->e[0], c->e[0]); - fpcopy(a->e[1], c->e[1]); -} - -// Zero a GF(p^2) element, a = 0. -void fp2zero(f2elm_t *a) { - fpzero(a->e[0]); - fpzero(a->e[1]); -} - -// GF(p^2) negation, a = -a in GF(p^2). -void fp2neg(f2elm_t *a) { - fpneg(a->e[0]); - fpneg(a->e[1]); -} - -// GF(p^2) addition, c = a+b in GF(p^2). -__inline void fp2add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) { - fpadd(a->e[0], b->e[0], c->e[0]); - fpadd(a->e[1], b->e[1], c->e[1]); -} - -// GF(p^2) subtraction, c = a-b in GF(p^2). -__inline void fp2sub(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) { - fpsub(a->e[0], b->e[0], c->e[0]); - fpsub(a->e[1], b->e[1], c->e[1]); -} - -// GF(p^2) division by two, c = a/2 in GF(p^2). -void fp2div2(const f2elm_t *a, f2elm_t *c) { - fpdiv2(a->e[0], c->e[0]); - fpdiv2(a->e[1], c->e[1]); -} - -// Modular correction, a = a in GF(p^2). -void fp2correction(f2elm_t *a) { - fpcorrection(a->e[0]); - fpcorrection(a->e[1]); -} - -// Multiprecision addition, c = a+b. -__inline static void mp_addfast(const digit_t *a, const digit_t *b, digit_t *c) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - mp_add_asm(a, b, c); - return; - } -#endif - - mp_add(a, b, c, NWORDS_FIELD); -} - -// GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2). 
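fpmul_mont, to_mont and from_mont above keep every field element in Montgomery representation (a*R mod p with R = 2^448) so that rdc_mont can reduce a double-width product with word multiplications and shifts only, exploiting the special shape of p434+1. The single-word sketch below shows the underlying REDC step with an illustrative 32-bit modulus and R = 2^32; the names, the Newton-iteration inverse and the 128-bit intermediate are toy choices, not the library's code.

#include <stdint.h>
#include <assert.h>

#define TOY_P 0xFFFFFFFBu                      /* toy odd modulus (2^32 - 5); R = 2^32 */

/* -TOY_P^(-1) mod 2^32 by Newton iteration; each step doubles the correct bits. */
static uint32_t neg_pinv(void) {
    uint32_t inv = 1;
    for (int i = 0; i < 5; i++)
        inv *= 2u - TOY_P * inv;
    return 0u - inv;
}

/* REDC: for T < TOY_P * 2^32, return T * 2^(-32) mod TOY_P without dividing by TOY_P. */
static uint32_t redc(uint64_t T) {
    uint32_t m = (uint32_t)T * neg_pinv();     /* m = T * (-P^-1) mod R */
    uint64_t t = (uint64_t)((((unsigned __int128)m * TOY_P) + T) >> 32);
    return (uint32_t)(t >= TOY_P ? t - TOY_P : t);   /* one final correction */
}

int main(void) {
    uint32_t a = 123456789u, b = 987654321u;
    /* R^2 mod P; 2^32 mod P equals 2^32 - P because TOY_P > 2^31. */
    uint32_t R2 = (uint32_t)(((uint64_t)(0u - TOY_P) * (0u - TOY_P)) % TOY_P);

    uint32_t aR = redc((uint64_t)a * R2);      /* to Montgomery form, like to_mont      */
    uint32_t bR = redc((uint64_t)b * R2);
    uint32_t cR = redc((uint64_t)aR * bR);     /* Montgomery product, like fpmul_mont   */
    uint32_t c  = redc(cR);                    /* back to standard form, like from_mont */

    assert(c == (uint32_t)(((uint64_t)a * b) % TOY_P));
    return 0;
}

In the removed code the interleaved reduction additionally skips the p434_ZERO_WORDS low words of p434+1, which are all zero.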
-// Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1] -// Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] -void fp2sqr_mont(const f2elm_t *a, f2elm_t *c) { - felm_t t1, t2, t3; - - mp_addfast(a->e[0], a->e[1], t1); // t1 = a0+a1 - fpsub(a->e[0], a->e[1], t2); // t2 = a0-a1 - mp_addfast(a->e[0], a->e[0], t3); // t3 = 2a0 - fpmul_mont(t1, t2, c->e[0]); // c0 = (a0+a1)(a0-a1) - fpmul_mont(t3, a->e[1], c->e[1]); // c1 = 2a0*a1 -} - -// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit. -unsigned int mp_sub(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) { - unsigned int i, borrow = 0; - - for (i = 0; i < nwords; i++) - SUBC(borrow, a[i], b[i], borrow, c[i]); - - return borrow; -} - -// Multiprecision subtraction followed by addition with p*2^MAXBITS_FIELD, c = a-b+(p*2^MAXBITS_FIELD) if a-b < 0, otherwise c=a-b. -__inline static void mp_subaddfast(const digit_t *a, const digit_t *b, digit_t *c) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - mp_subaddx2_asm(a, b, c); - return; - } -#endif - - felm_t t1; - - digit_t mask = 0 - (digit_t) mp_sub(a, b, c, 2 * NWORDS_FIELD); - for (int i = 0; i < NWORDS_FIELD; i++) - t1[i] = ((const digit_t *) PRIME)[i] & mask; - mp_addfast((digit_t *) &c[NWORDS_FIELD], t1, (digit_t *) &c[NWORDS_FIELD]); -} - -// Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. -__inline static void mp_dblsubfast(const digit_t *a, const digit_t *b, digit_t *c) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - mp_dblsubx2_asm(a, b, c); - return; - } -#endif - - mp_sub(c, a, c, 2 * NWORDS_FIELD); - mp_sub(c, b, c, 2 * NWORDS_FIELD); -} - -// GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2). -// Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1] -// Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] -void fp2mul_mont(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) { - felm_t t1, t2; - dfelm_t tt1, tt2, tt3; - - mp_addfast(a->e[0], a->e[1], t1); // t1 = a0+a1 - mp_addfast(b->e[0], b->e[1], t2); // t2 = b0+b1 - mp_mul(a->e[0], b->e[0], tt1, NWORDS_FIELD); // tt1 = a0*b0 - mp_mul(a->e[1], b->e[1], tt2, NWORDS_FIELD); // tt2 = a1*b1 - mp_mul(t1, t2, tt3, NWORDS_FIELD); // tt3 = (a0+a1)*(b0+b1) - mp_dblsubfast(tt1, tt2, tt3); // tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 - mp_subaddfast(tt1, tt2, tt1); // tt1 = a0*b0 - a1*b1 + p*2^MAXBITS_FIELD if a0*b0 - a1*b1 < 0, else tt1 = a0*b0 - a1*b1 - rdc_mont(tt3, c->e[1]); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 - rdc_mont(tt1, c->e[0]); // c[0] = a0*b0 - a1*b1 -} - -// Chain to compute a^(p-3)/4 using Montgomery arithmetic. 
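fp2mul_mont above multiplies in GF(p^2) = F_p(i), i^2 = -1, with three base-field multiplications instead of four: c0 = a0*b0 - a1*b1 and c1 = (a0+a1)(b0+b1) - a0*b0 - a1*b1. The toy below reproduces that identity over a small prime with plain % reduction; the prime, struct and test values are illustrative, not the P434 Montgomery arithmetic.

#include <stdint.h>
#include <assert.h>

#define Q 1000003ull                           /* small toy prime, not p434 */

typedef struct { uint64_t e[2]; } toy_fp2;     /* a = e[0] + e[1]*i, with i^2 = -1 */

/* Three-multiplication product, mirroring fp2mul_mont's structure. */
static toy_fp2 fp2mul_toy(toy_fp2 a, toy_fp2 b) {
    uint64_t t0 = a.e[0] * b.e[0] % Q;                          /* a0*b0          */
    uint64_t t1 = a.e[1] * b.e[1] % Q;                          /* a1*b1          */
    uint64_t t2 = (a.e[0] + a.e[1]) * (b.e[0] + b.e[1]) % Q;    /* (a0+a1)(b0+b1) */
    toy_fp2 c;
    c.e[0] = (t0 + Q - t1) % Q;                /* c0 = a0*b0 - a1*b1 */
    c.e[1] = (t2 + 2 * Q - t0 - t1) % Q;       /* c1 = a0*b1 + a1*b0 */
    return c;
}

int main(void) {
    toy_fp2 a = {{123456, 654321}}, b = {{77777, 99999}};
    toy_fp2 c = fp2mul_toy(a, b);

    /* Compare against the schoolbook four-multiplication result. */
    uint64_t c0 = (a.e[0] * b.e[0] % Q + Q - a.e[1] * b.e[1] % Q) % Q;
    uint64_t c1 = (a.e[0] * b.e[1] % Q + a.e[1] * b.e[0] % Q) % Q;
    assert(c.e[0] == c0 && c.e[1] == c1);
    return 0;
}

The removed routine performs the same three products with mp_mul and repairs the subtraction that may go negative with mp_subaddfast.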
-void fpinv_chain_mont(felm_t a) { - unsigned int i, j; - - felm_t t[31], tt; - - // Precomputed table - fpsqr_mont(a, tt); - fpmul_mont(a, tt, t[0]); - for (i = 0; i <= 29; i++) - fpmul_mont(t[i], tt, t[i + 1]); - - fpcopy(a, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[5], tt, tt); - for (i = 0; i < 10; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[14], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[3], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[23], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[13], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[24], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[7], tt, tt); - for (i = 0; i < 8; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[12], tt, tt); - for (i = 0; i < 8; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[30], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[1], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[30], tt, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[21], tt, tt); - for (i = 0; i < 9; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[2], tt, tt); - for (i = 0; i < 9; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[19], tt, tt); - for (i = 0; i < 9; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[1], tt, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[24], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[26], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[16], tt, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[10], tt, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[6], tt, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[0], tt, tt); - for (i = 0; i < 9; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[20], tt, tt); - for (i = 0; i < 8; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[9], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[25], tt, tt); - for (i = 0; i < 9; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[30], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[26], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(a, tt, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[28], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[6], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[10], tt, tt); - for (i = 0; i < 9; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[22], tt, tt); - for (j = 0; j < 35; j++) { - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[30], tt, tt); - } - fpcopy(tt, a); -} - -// GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2). -void fp2inv_mont(f2elm_t *a) { - f2elm_t t1; - - fpsqr_mont(a->e[0], t1.e[0]); // t10 = a0^2 - fpsqr_mont(a->e[1], t1.e[1]); // t11 = a1^2 - fpadd(t1.e[0], t1.e[1], t1.e[0]); // t10 = a0^2+a1^2 - fpinv_mont(t1.e[0]); // t10 = (a0^2+a1^2)^-1 - fpneg(a->e[1]); // a = a0-i*a1 - fpmul_mont(a->e[0], t1.e[0], a->e[0]); - fpmul_mont(a->e[1], t1.e[0], a->e[1]); // a = (a0-i*a1)*(a0^2+a1^2)^-1 -} - -// Conversion of a GF(p^2) element to Montgomery representation, -// mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2). 
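fp2inv_mont above inverts a = a0 + a1*i by multiplying with the conjugate and inverting the norm: 1/a = (a0 - a1*i)/(a0^2 + a1^2), so a single base-field inversion suffices. Below is a toy version over a small prime q with q mod 4 == 3 (so i^2 = -1 stays a non-square and F_q(i) is a field); the base-field inverse is taken as a^(q-2), where the removed code uses the dedicated fpinv_chain_mont addition chain instead.

#include <stdint.h>
#include <assert.h>

#define Q 1000003ull                           /* toy prime with Q % 4 == 3 */

static uint64_t powmod(uint64_t b, uint64_t e) {
    uint64_t r = 1;
    b %= Q;
    while (e) {
        if (e & 1) r = r * b % Q;
        b = b * b % Q;
        e >>= 1;
    }
    return r;
}

typedef struct { uint64_t e[2]; } toy_fp2;     /* a = e[0] + e[1]*i */

/* 1/a = (a0 - a1*i) / (a0^2 + a1^2), mirroring fp2inv_mont. */
static toy_fp2 fp2inv_toy(toy_fp2 a) {
    uint64_t norm = (a.e[0] * a.e[0] % Q + a.e[1] * a.e[1] % Q) % Q;
    uint64_t ninv = powmod(norm, Q - 2);       /* one inversion in the base field */
    toy_fp2 r;
    r.e[0] = a.e[0] * ninv % Q;
    r.e[1] = (Q - a.e[1]) % Q * ninv % Q;      /* negate a1, then scale by 1/norm */
    return r;
}

int main(void) {
    toy_fp2 a = {{42, 98765}};
    toy_fp2 v = fp2inv_toy(a);

    /* a * (1/a) must equal 1 + 0*i (schoolbook product, i^2 = -1). */
    uint64_t c0 = (a.e[0] * v.e[0] % Q + Q - a.e[1] * v.e[1] % Q) % Q;
    uint64_t c1 = (a.e[0] * v.e[1] % Q + a.e[1] * v.e[0] % Q) % Q;
    assert(c0 == 1 && c1 == 0);
    return 0;
}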
-void to_fp2mont(const f2elm_t *a, f2elm_t *mc) { - to_mont(a->e[0], mc->e[0]); - to_mont(a->e[1], mc->e[1]); -} - -// Conversion of a GF(p^2) element from Montgomery representation to standard representation, -// c_i = ma_i*R^(-1) = a_i in GF(p^2). -void from_fp2mont(const f2elm_t *ma, f2elm_t *c) { - from_mont(ma->e[0], c->e[0]); - from_mont(ma->e[1], c->e[1]); -} - -// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit. -unsigned int mp_add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) { - unsigned int i, carry = 0; - - for (i = 0; i < nwords; i++) { - /* cppcheck-suppress shiftTooManyBits */ - /* cppcheck-suppress unmatchedSuppression */ - ADDC(carry, a[i], b[i], carry, c[i]); - } - - return carry; -} - -// Multiprecision right shift by one. -void mp_shiftr1(digit_t *x, const unsigned int nwords) { - unsigned int i; - - for (i = 0; i < nwords - 1; i++) { - SHIFTR(x[i + 1], x[i], 1, x[i], RADIX); - } - x[nwords - 1] >>= 1; -} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sidh.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sidh.c deleted file mode 100644 index d3fdbe722c..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sidh.c +++ /dev/null @@ -1,286 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH) -*********************************************************************************************/ - -#include "../s2n_pq_random.h" -#include "utils/s2n_safety.h" - -static void init_basis(const digit_t *gen, f2elm_t *XP, f2elm_t *XQ, f2elm_t *XR) { // Initialization of basis points - - fpcopy(gen, XP->e[0]); - fpcopy(gen + NWORDS_FIELD, XP->e[1]); - fpcopy(gen + 2 * NWORDS_FIELD, XQ->e[0]); - fpcopy(gen + 3 * NWORDS_FIELD, XQ->e[1]); - fpcopy(gen + 4 * NWORDS_FIELD, XR->e[0]); - fpcopy(gen + 5 * NWORDS_FIELD, XR->e[1]); -} - -int random_mod_order_A(unsigned char *random_digits) { // Generation of Alice's secret key - // Outputs random value in [0, 2^eA - 1] - GUARD_AS_POSIX(s2n_get_random_bytes(random_digits, SECRETKEY_A_BYTES)); - random_digits[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; // Masking last byte - return S2N_SUCCESS; -} - -int random_mod_order_B(unsigned char *random_digits) { // Generation of Bob's secret key - // Outputs random value in [0, 2^Floor(Log(2, oB)) - 1] - GUARD_AS_POSIX(s2n_get_random_bytes(random_digits, SECRETKEY_B_BYTES)); - random_digits[SECRETKEY_B_BYTES - 1] &= MASK_BOB; // Masking last byte - return S2N_SUCCESS; -} - -int EphemeralKeyGeneration_A(const digit_t *PrivateKeyA, unsigned char *PublicKeyA) { // Alice's ephemeral public key generation - // Input: a private key PrivateKeyA in the range [0, 2^eA - 1]. - // Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. 
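The key-generation and shared-secret routines below feed the private key, stored as a little-endian array of machine words, into the LADDER3PT loop removed earlier, which extracts one secret bit per iteration with (m[i >> LOG2RADIX] >> (i & (RADIX - 1))) & 1. A standalone sketch of that lookup, assuming 64-bit digits (RADIX = 64, LOG2RADIX = 6), purely for illustration:

#include <stdint.h>
#include <assert.h>

#define RADIX     64
#define LOG2RADIX 6

typedef uint64_t digit_t;

/* Return bit i of a multiword little-endian integer m. */
static int scalar_bit(const digit_t *m, unsigned int i) {
    return (int)((m[i >> LOG2RADIX] >> (i & (RADIX - 1))) & 1);
}

int main(void) {
    /* value = 2^0 + 2^65 + 2^70: bit 0 lives in word 0, bits 65 and 70 in word 1 */
    digit_t m[2] = { 1ull, (1ull << 1) | (1ull << 6) };
    assert(scalar_bit(m, 0) == 1);
    assert(scalar_bit(m, 1) == 0);
    assert(scalar_bit(m, 65) == 1);
    assert(scalar_bit(m, 70) == 1);
    assert(scalar_bit(m, 71) == 0);
    return 0;
}

random_mod_order_A and random_mod_order_B above keep such a scalar in range simply by masking the top byte (MASK_ALICE / MASK_BOB) after drawing random bytes.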
- point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_ALICE]; - f2elm_t _XPA, _XQA, _XRA, coeff[3], _A24plus = {0}, _C24 = {0}, _A = {0}; - f2elm_t *XPA=&_XPA, *XQA=&_XQA, *XRA=&_XRA, *A24plus=&_A24plus, *C24=&_C24, *A=&_A; - unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; - - // Initialize basis points - init_basis((const digit_t *) A_gen, XPA, XQA, XRA); - init_basis((const digit_t *) B_gen, &phiP->X, &phiQ->X, &phiR->X); - fpcopy((const digit_t *) &Montgomery_one, (phiP->Z.e)[0]); - fpcopy((const digit_t *) &Montgomery_one, (phiQ->Z.e)[0]); - fpcopy((const digit_t *) &Montgomery_one, (phiR->Z.e)[0]); - - // Initialize constants: A24plus = A+2C, C24 = 4C, where A=6, C=1 - fpcopy((const digit_t *) &Montgomery_one, A24plus->e[0]); - fp2add(A24plus, A24plus, A24plus); - fp2add(A24plus, A24plus, C24); - fp2add(A24plus, C24, A); - fp2add(C24, C24, A24plus); - - // Retrieve kernel point - LADDER3PT(XPA, XQA, XRA, PrivateKeyA, ALICE, R, A); - - // Traverse tree - index = 0; - for (row = 1; row < MAX_Alice; row++) { - while (index < MAX_Alice - row) { - fp2copy(&R->X, &pts[npts]->X); - fp2copy(&R->Z, &pts[npts]->Z); - pts_index[npts++] = index; - m = strat_Alice[ii++]; - xDBLe(R, R, A24plus, C24, (int) (2 * m)); - index += m; - } - get_4_isog(R, A24plus, C24, coeff); - - for (i = 0; i < npts; i++) { - eval_4_isog(pts[i], coeff); - } - eval_4_isog(phiP, coeff); - eval_4_isog(phiQ, coeff); - eval_4_isog(phiR, coeff); - - fp2copy(&pts[npts - 1]->X, &R->X); - fp2copy(&pts[npts - 1]->Z, &R->Z); - index = pts_index[npts - 1]; - npts -= 1; - } - - get_4_isog(R, A24plus, C24, coeff); - eval_4_isog(phiP, coeff); - eval_4_isog(phiQ, coeff); - eval_4_isog(phiR, coeff); - - inv_3_way(&phiP->Z, &phiQ->Z, &phiR->Z); - fp2mul_mont(&phiP->X, &phiP->Z, &phiP->X); - fp2mul_mont(&phiQ->X, &phiQ->Z, &phiQ->X); - fp2mul_mont(&phiR->X, &phiR->Z, &phiR->X); - - // Format public key - fp2_encode(&phiP->X, PublicKeyA); - fp2_encode(&phiQ->X, PublicKeyA + FP2_ENCODED_BYTES); - fp2_encode(&phiR->X, PublicKeyA + 2 * FP2_ENCODED_BYTES); - - return 0; -} - -int EphemeralKeyGeneration_B(const digit_t *PrivateKeyB, unsigned char *PublicKeyB) { // Bob's ephemeral public key generation - // Input: a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1]. - // Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. 
- point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_BOB]; - f2elm_t _XPB, _XQB, _XRB, coeff[3], _A24plus = {0}, _A24minus = {0}, _A = {0}; - f2elm_t *XPB=&_XPB, *XQB=&_XQB, *XRB=&_XRB, *A24plus=&_A24plus, *A24minus=&_A24minus, *A=&_A; - unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; - - // Initialize basis points - init_basis((const digit_t *) B_gen, XPB, XQB, XRB); - init_basis((const digit_t *) A_gen, &phiP->X, &phiQ->X, &phiR->X); - fpcopy((const digit_t *) &Montgomery_one, (phiP->Z.e)[0]); - fpcopy((const digit_t *) &Montgomery_one, (phiQ->Z.e)[0]); - fpcopy((const digit_t *) &Montgomery_one, (phiR->Z.e)[0]); - - // Initialize constants: A24minus = A-2C, A24plus = A+2C, where A=6, C=1 - fpcopy((const digit_t *) &Montgomery_one, A24plus->e[0]); - fp2add(A24plus, A24plus, A24plus); - fp2add(A24plus, A24plus, A24minus); - fp2add(A24plus, A24minus, A); - fp2add(A24minus, A24minus, A24plus); - - // Retrieve kernel point - LADDER3PT(XPB, XQB, XRB, PrivateKeyB, BOB, R, A); - - // Traverse tree - index = 0; - for (row = 1; row < MAX_Bob; row++) { - while (index < MAX_Bob - row) { - fp2copy(&R->X, &pts[npts]->X); - fp2copy(&R->Z, &pts[npts]->Z); - pts_index[npts++] = index; - m = strat_Bob[ii++]; - xTPLe(R, R, A24minus, A24plus, (int) m); - index += m; - } - get_3_isog(R, A24minus, A24plus, coeff); - - for (i = 0; i < npts; i++) { - eval_3_isog(pts[i], coeff); - } - eval_3_isog(phiP, coeff); - eval_3_isog(phiQ, coeff); - eval_3_isog(phiR, coeff); - - fp2copy(&pts[npts - 1]->X, &R->X); - fp2copy(&pts[npts - 1]->Z, &R->Z); - index = pts_index[npts - 1]; - npts -= 1; - } - - get_3_isog(R, A24minus, A24plus, coeff); - eval_3_isog(phiP, coeff); - eval_3_isog(phiQ, coeff); - eval_3_isog(phiR, coeff); - - inv_3_way(&phiP->Z, &phiQ->Z, &phiR->Z); - fp2mul_mont(&phiP->X, &phiP->Z, &phiP->X); - fp2mul_mont(&phiQ->X, &phiQ->Z, &phiQ->X); - fp2mul_mont(&phiR->X, &phiR->Z, &phiR->X); - - // Format public key - fp2_encode(&phiP->X, PublicKeyB); - fp2_encode(&phiQ->X, PublicKeyB + FP2_ENCODED_BYTES); - fp2_encode(&phiR->X, PublicKeyB + 2 * FP2_ENCODED_BYTES); - - return 0; -} - -int EphemeralSecretAgreement_A(const digit_t *PrivateKeyA, const unsigned char *PublicKeyB, unsigned char *SharedSecretA) { // Alice's ephemeral shared secret computation - // It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB - // Inputs: Alice's PrivateKeyA is an integer in the range [0, oA-1]. - // Bob's PublicKeyB consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes. - // Output: a shared secret SharedSecretA that consists of one element in GF(p^2) encoded by removing leading 0 bytes. 
- point_proj_t R, pts[MAX_INT_POINTS_ALICE]; - f2elm_t coeff[3], PKB[3], _jinv; - f2elm_t _A24plus = {0}, _C24 = {0}, _A = {0}; - f2elm_t *jinv=&_jinv, *A24plus=&_A24plus, *C24=&_C24, *A=&_A; - unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; - - // Initialize images of Bob's basis - fp2_decode(PublicKeyB, &PKB[0]); - fp2_decode(PublicKeyB + FP2_ENCODED_BYTES, &PKB[1]); - fp2_decode(PublicKeyB + 2 * FP2_ENCODED_BYTES, &PKB[2]); - - // Initialize constants: A24plus = A+2C, C24 = 4C, where C=1 - get_A(&PKB[0], &PKB[1], &PKB[2], A); - fpadd((const digit_t *) &Montgomery_one, (const digit_t *) &Montgomery_one, C24->e[0]); - fp2add(A, C24, A24plus); - fpadd(C24->e[0], C24->e[0], C24->e[0]); - - // Retrieve kernel point - LADDER3PT(&PKB[0], &PKB[1], &PKB[2], PrivateKeyA, ALICE, R, A); - - // Traverse tree - index = 0; - for (row = 1; row < MAX_Alice; row++) { - while (index < MAX_Alice - row) { - fp2copy(&R->X, &pts[npts]->X); - fp2copy(&R->Z, &pts[npts]->Z); - pts_index[npts++] = index; - m = strat_Alice[ii++]; - xDBLe(R, R, A24plus, C24, (int) (2 * m)); - index += m; - } - get_4_isog(R, A24plus, C24, coeff); - - for (i = 0; i < npts; i++) { - eval_4_isog(pts[i], coeff); - } - - fp2copy(&pts[npts - 1]->X, &R->X); - fp2copy(&pts[npts - 1]->Z, &R->Z); - index = pts_index[npts - 1]; - npts -= 1; - } - - get_4_isog(R, A24plus, C24, coeff); - fp2add(A24plus, A24plus, A24plus); - fp2sub(A24plus, C24, A24plus); - fp2add(A24plus, A24plus, A24plus); - j_inv(A24plus, C24, jinv); - fp2_encode(jinv, SharedSecretA); // Format shared secret - - return 0; -} - -int EphemeralSecretAgreement_B(const digit_t *PrivateKeyB, const unsigned char *PublicKeyA, unsigned char *SharedSecretB) { // Bob's ephemeral shared secret computation - // It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA - // Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,oB)) - 1]. - // Alice's PublicKeyA consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes. - // Output: a shared secret SharedSecretB that consists of one element in GF(p^2) encoded by removing leading 0 bytes. 
- point_proj_t R, pts[MAX_INT_POINTS_BOB]; - f2elm_t coeff[3], PKB[3], _jinv; - f2elm_t _A24plus = {0}, _A24minus = {0}, _A = {0}; - f2elm_t *jinv=&_jinv, *A24plus=&_A24plus, *A24minus=&_A24minus, *A=&_A; - unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; - - // Initialize images of Alice's basis - fp2_decode(PublicKeyA, &PKB[0]); - fp2_decode(PublicKeyA + FP2_ENCODED_BYTES, &PKB[1]); - fp2_decode(PublicKeyA + 2 * FP2_ENCODED_BYTES, &PKB[2]); - - // Initialize constants: A24plus = A+2C, A24minus = A-2C, where C=1 - get_A(&PKB[0], &PKB[1], &PKB[2], A); - fpadd((const digit_t *) &Montgomery_one, (const digit_t *) &Montgomery_one, A24minus->e[0]); - fp2add(A, A24minus, A24plus); - fp2sub(A, A24minus, A24minus); - - // Retrieve kernel point - LADDER3PT(&PKB[0], &PKB[1], &PKB[2], PrivateKeyB, BOB, R, A); - - // Traverse tree - index = 0; - for (row = 1; row < MAX_Bob; row++) { - while (index < MAX_Bob - row) { - fp2copy(&R->X, &pts[npts]->X); - fp2copy(&R->Z, &pts[npts]->Z); - pts_index[npts++] = index; - m = strat_Bob[ii++]; - xTPLe(R, R, A24minus, A24plus, (int) m); - index += m; - } - get_3_isog(R, A24minus, A24plus, coeff); - - for (i = 0; i < npts; i++) { - eval_3_isog(pts[i], coeff); - } - - fp2copy(&pts[npts - 1]->X, &R->X); - fp2copy(&pts[npts - 1]->Z, &R->Z); - index = pts_index[npts - 1]; - npts -= 1; - } - - get_3_isog(R, A24minus, A24plus, coeff); - fp2add(A24plus, A24minus, A); - fp2add(A, A, A); - fp2sub(A24plus, A24minus, A24plus); - j_inv(A, A24plus, jinv); - fp2_encode(jinv, SharedSecretB); // Format shared secret - - return 0; -} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sike_r2_kem.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sike_r2_kem.c deleted file mode 100644 index 7768ad3650..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sike_r2_kem.c +++ /dev/null @@ -1,120 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: supersingular isogeny key encapsulation (SIKE) protocol -*********************************************************************************************/ - -#include <string.h> -#include "../s2n_pq_random.h" -#include "fips202.h" -#include "utils/s2n_safety.h" -#include "tls/s2n_kem.h" -#include "pq-crypto/s2n_pq.h" - -int SIKE_P434_r2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { - // SIKE's key generation - // Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes) - // public key pk (CRYPTO_PUBLICKEYBYTES bytes) - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); - - digit_t _sk[(SECRETKEY_B_BYTES / sizeof(digit_t)) + 1]; - - // Generate lower portion of secret key sk <- s||SK - GUARD_AS_POSIX(s2n_get_random_bytes(sk, MSG_BYTES)); - GUARD(random_mod_order_B((unsigned char *)_sk)); - - // Generate public key pk - EphemeralKeyGeneration_B(_sk, pk); - - memcpy(sk + MSG_BYTES, _sk, SECRETKEY_B_BYTES); - - // Append public key pk to secret key sk - memcpy(&sk[MSG_BYTES + SECRETKEY_B_BYTES], pk, CRYPTO_PUBLICKEYBYTES); - - return 0; -} - -int SIKE_P434_r2_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) { - // SIKE's encapsulation - // Input: public key pk (CRYPTO_PUBLICKEYBYTES bytes) - // Outputs: shared secret ss (CRYPTO_BYTES bytes) - // ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes) - ENSURE_POSIX(s2n_pq_is_enabled(), 
S2N_ERR_PQ_DISABLED); - - union { - unsigned char b[SECRETKEY_A_BYTES]; - digit_t d[SECRETKEY_A_BYTES/sizeof(digit_t)]; - } ephemeralsk; - unsigned char jinvariant[FP2_ENCODED_BYTES]; - unsigned char h[MSG_BYTES]; - unsigned char temp[CRYPTO_CIPHERTEXTBYTES + MSG_BYTES]; - - // Generate ephemeralsk <- G(m||pk) mod oA - GUARD_AS_POSIX(s2n_get_random_bytes(temp, MSG_BYTES)); - memcpy(&temp[MSG_BYTES], pk, CRYPTO_PUBLICKEYBYTES); - shake256(ephemeralsk.b, SECRETKEY_A_BYTES, temp, CRYPTO_PUBLICKEYBYTES + MSG_BYTES); - - /* ephemeralsk is a union; the memory set here through .b will get accessed through the .d member later */ - /* cppcheck-suppress unreadVariable */ - /* cppcheck-suppress unmatchedSuppression */ - ephemeralsk.b[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; - - // Encrypt - EphemeralKeyGeneration_A(ephemeralsk.d, ct); - EphemeralSecretAgreement_A(ephemeralsk.d, pk, jinvariant); - shake256(h, MSG_BYTES, jinvariant, FP2_ENCODED_BYTES); - for (int i = 0; i < MSG_BYTES; i++) { - ct[i + CRYPTO_PUBLICKEYBYTES] = temp[i] ^ h[i]; - } - // Generate shared secret ss <- H(m||ct) - memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES); - shake256(ss, CRYPTO_BYTES, temp, CRYPTO_CIPHERTEXTBYTES + MSG_BYTES); - - return 0; -} - -int SIKE_P434_r2_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) { - // SIKE's decapsulation - // Input: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes) - // ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes) - // Outputs: shared secret ss (CRYPTO_BYTES bytes) - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); - - union { - unsigned char b[SECRETKEY_A_BYTES]; - digit_t d[SECRETKEY_A_BYTES/sizeof(digit_t)]; - } ephemeralsk_; - unsigned char jinvariant_[FP2_ENCODED_BYTES]; - unsigned char h_[MSG_BYTES]; - unsigned char c0_[CRYPTO_PUBLICKEYBYTES]; - unsigned char temp[CRYPTO_CIPHERTEXTBYTES + MSG_BYTES]; - - digit_t _sk[(SECRETKEY_B_BYTES / sizeof(digit_t)) + 1]; - memcpy(_sk, sk + MSG_BYTES, SECRETKEY_B_BYTES); - - // Decrypt - EphemeralSecretAgreement_B(_sk, ct, jinvariant_); - shake256(h_, MSG_BYTES, jinvariant_, FP2_ENCODED_BYTES); - for (int i = 0; i < MSG_BYTES; i++) { - temp[i] = ct[i + CRYPTO_PUBLICKEYBYTES] ^ h_[i]; - } - // Generate ephemeralsk_ <- G(m||pk) mod oA - memcpy(&temp[MSG_BYTES], &sk[MSG_BYTES + SECRETKEY_B_BYTES], CRYPTO_PUBLICKEYBYTES); - shake256(ephemeralsk_.b, SECRETKEY_A_BYTES, temp, CRYPTO_PUBLICKEYBYTES + MSG_BYTES); - - /* ephemeralsk_ is a union; the memory set here through .b will get accessed through the .d member later */ - /* cppcheck-suppress unreadVariable */ - /* cppcheck-suppress uninitvar */ - /* cppcheck-suppress unmatchedSuppression */ - ephemeralsk_.b[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; - - // Generate shared secret ss <- H(m||ct) or output ss <- H(s||ct) - EphemeralKeyGeneration_A(ephemeralsk_.d, c0_); - if (memcmp(c0_, ct, CRYPTO_PUBLICKEYBYTES) != 0) { - memcpy(temp, sk, MSG_BYTES); - } - memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES); - shake256(ss, CRYPTO_BYTES, temp, CRYPTO_CIPHERTEXTBYTES + MSG_BYTES); - - return 0; -} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sikep434r2_fp_x64_asm.S b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sikep434r2_fp_x64_asm.S deleted file mode 100644 index 831fc1b7fb..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sikep434r2_fp_x64_asm.S +++ /dev/null @@ -1,962 +0,0 @@ 
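The sike_r2_kem.c removed just above follows the re-encryption check of a Fujisaki-Okamoto style KEM: decapsulation decrypts the ciphertext, re-derives the ephemeral secret from the recovered message, re-encrypts, and compares the re-derived ephemeral public key with the first CRYPTO_PUBLICKEYBYTES of the received ciphertext; on a mismatch it hashes the stored random value s instead of the message, so an invalid ciphertext still yields a well-defined shared secret (implicit rejection). The toy below mirrors only that control flow; the one-byte "hash" and the trivial "trapdoor" are deliberate stand-ins, not SHAKE-256 or the SIDH routines.

#include <stdint.h>
#include <string.h>
#include <assert.h>

#define MSG_BYTES 8
#define CT_BYTES  (1 + MSG_BYTES)

/* Toy one-byte hash of a || b, standing in for SHAKE-256. */
static uint8_t toy_hash2(const uint8_t *a, size_t alen, const uint8_t *b, size_t blen) {
    uint8_t buf[32], h = 0x5a;
    memcpy(buf, a, alen);
    memcpy(buf + alen, b, blen);
    for (size_t i = 0; i < alen + blen; i++) h = (uint8_t)(h * 31u + buf[i]);
    return h;
}

/* Toy deterministic "encryption": c0 carries the ephemeral value r, c1 masks m. */
static void toy_encrypt(uint8_t pk, uint8_t r, const uint8_t *m, uint8_t *ct) {
    ct[0] = r;
    for (int i = 0; i < MSG_BYTES; i++)
        ct[1 + i] = m[i] ^ (uint8_t)(pk + r + (uint8_t)i);
}

static void toy_decrypt(uint8_t sk, const uint8_t *ct, uint8_t *m) {
    for (int i = 0; i < MSG_BYTES; i++)
        m[i] = ct[1 + i] ^ (uint8_t)(sk + ct[0] + (uint8_t)i);
}

/* Encapsulation: the ephemeral value is derived from the message and pk. */
static uint8_t toy_encaps(uint8_t pk, const uint8_t *m, uint8_t *ct) {
    uint8_t r = toy_hash2(m, MSG_BYTES, &pk, 1);          /* r  = G(m || pk) */
    toy_encrypt(pk, r, m, ct);
    return toy_hash2(m, MSG_BYTES, ct, CT_BYTES);         /* ss = H(m || ct) */
}

/* Decapsulation with the re-encryption check and implicit rejection. */
static uint8_t toy_decaps(uint8_t sk, uint8_t pk, const uint8_t *s, const uint8_t *ct) {
    uint8_t m[MSG_BYTES], ct2[CT_BYTES];
    toy_decrypt(sk, ct, m);                               /* m'  = Dec(sk, ct)   */
    uint8_t r = toy_hash2(m, MSG_BYTES, &pk, 1);          /* r'  = G(m' || pk)   */
    toy_encrypt(pk, r, m, ct2);                           /* re-encrypt          */
    if (memcmp(ct2, ct, CT_BYTES) != 0)                   /* ciphertext invalid: */
        memcpy(m, s, MSG_BYTES);                          /* hash s instead      */
    return toy_hash2(m, MSG_BYTES, ct, CT_BYTES);         /* ss = H(m' or s || ct) */
}

int main(void) {
    uint8_t pk = 0x42, sk = 0x42;                         /* trivial toy key pair   */
    uint8_t s[MSG_BYTES] = {9, 8, 7, 6, 5, 4, 3, 2};      /* stored rejection value */
    uint8_t m[MSG_BYTES] = {1, 2, 3, 4, 5, 6, 7, 8};
    uint8_t ct[CT_BYTES];

    uint8_t ss1 = toy_encaps(pk, m, ct);
    assert(toy_decaps(sk, pk, s, ct) == ss1);             /* honest case agrees */

    ct[3] ^= 0x80;                                        /* tamper with c1 */
    (void)toy_decaps(sk, pk, s, ct);                      /* invalid ciphertext still yields a defined secret */
    return 0;
}

Note that the removed code compares only the re-derived c0 (the ephemeral public key) against the start of the ciphertext; the toy compares the whole ciphertext for brevity.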
-//******************************************************************************************* -// SIDH: an efficient supersingular isogeny cryptography library -// -// Abstract: field arithmetic in x64 assembly for P434 on Linux -//******************************************************************************************* - -.intel_syntax noprefix - -/* Requires bmi2 instruction set for mulx. adx instructions are optional, but preferred. */ - -// Registers that are used for parameter passing: -#define reg_p1 rdi -#define reg_p2 rsi -#define reg_p3 rdx - -// Define addition instructions -#ifdef S2N_ADX - -#define ADD1 adox -#define ADC1 adox -#define ADD2 adcx -#define ADC2 adcx - -#else // S2N_ADX - -#define ADD1 add -#define ADC1 adc -#define ADD2 add -#define ADC2 adc - -#endif // S2N_ADX - -// The constants below (asm_p434, asm_p434p1, and asm_p434x2) are duplicated from -// P434.c, and correspond to the arrays p434, p434p1, and p434x2. The values are -// idenctical; they are just represented here as standard (base 10) ints, instead -// of hex. If, for any reason, the constants are changed in one file, they should be -// updated in the other file as well. - -.text -.align 32 -.type asm_p434, @object -.size asm_p434, 56 -asm_p434: - .quad -1 - .quad -1 - .quad -1 - .quad -161717841442111489 - .quad 8918917783347572387 - .quad 7853257225132122198 - .quad 620258357900100 -.align 32 -.type asm_p434p1, @object -.size asm_p434p1, 56 -asm_p434p1: - .quad 0 - .quad 0 - .quad 0 - .quad -161717841442111488 - .quad 8918917783347572387 - .quad 7853257225132122198 - .quad 620258357900100 -.align 32 -.type asm_p434x2, @object -.size asm_p434x2, 56 -asm_p434x2: - .quad -2 - .quad -1 - .quad -1 - .quad -323435682884222977 - .quad -608908507014406841 - .quad -2740229623445307220 - .quad 1240516715800200 - -//*********************************************************************** -// Field addition -// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] -//*********************************************************************** -.global fpadd434_asm -fpadd434_asm: - push r12 - push r13 - push r14 - push r15 - push rbx - push rbp - - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - mov r13, [reg_p1+40] - mov r14, [reg_p1+48] - add r8, [reg_p2] - adc r9, [reg_p2+8] - adc r10, [reg_p2+16] - adc r11, [reg_p2+24] - adc r12, [reg_p2+32] - adc r13, [reg_p2+40] - adc r14, [reg_p2+48] - - mov rbx, [rip+asm_p434x2] - sub r8, rbx - mov rcx, [rip+asm_p434x2+8] - sbb r9, rcx - sbb r10, rcx - mov rdi, [rip+asm_p434x2+24] - sbb r11, rdi - mov rsi, [rip+asm_p434x2+32] - sbb r12, rsi - mov rbp, [rip+asm_p434x2+40] - sbb r13, rbp - mov r15, [rip+asm_p434x2+48] - sbb r14, r15 - sbb rax, 0 - - and rbx, rax - and rcx, rax - and rdi, rax - and rsi, rax - and rbp, rax - and r15, rax - - add r8, rbx - adc r9, rcx - adc r10, rcx - adc r11, rdi - adc r12, rsi - adc r13, rbp - adc r14, r15 - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], r14 - - pop rbp - pop rbx - pop r15 - pop r14 - pop r13 - pop r12 - ret - -//*********************************************************************** -// Field subtraction -// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] -//*********************************************************************** -.global fpsub434_asm -fpsub434_asm: - push r12 - push r13 - push r14 - - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov 
r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - mov r13, [reg_p1+40] - mov r14, [reg_p1+48] - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - sbb r13, [reg_p2+40] - sbb r14, [reg_p2+48] - sbb rax, 0 - - mov rcx, [rip+asm_p434x2] - mov rdi, [rip+asm_p434x2+8] - mov rsi, [rip+asm_p434x2+24] - and rcx, rax - and rdi, rax - and rsi, rax - add r8, rcx - adc r9, rdi - adc r10, rdi - adc r11, rsi - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - setc cl - - mov r8, [rip+asm_p434x2+32] - mov rdi, [rip+asm_p434x2+40] - mov rsi, [rip+asm_p434x2+48] - and r8, rax - and rdi, rax - and rsi, rax - bt rcx, 0 - adc r12, r8 - adc r13, rdi - adc r14, rsi - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], r14 - - pop r14 - pop r13 - pop r12 - ret - -///////////////////////////////////////////////////////////////// MACRO -// Schoolbook integer multiplication, a full row at a time -// Inputs: memory pointers M0 and M1 -// Outputs: memory pointer C -// Temps: regs T0:T9 -///////////////////////////////////////////////////////////////// - -#ifdef S2N_ADX -.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 - xor rax, rax - adox \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adox \T1, \T3 - - mov rdx, 8\M0 - mulx \T3, \T4, \M1 // T3:T4 = A1*B0 - adox \T2, rax - xor rax, rax - mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1 - adox \T4, \T0 - mov 8\C, \T4 // C1_final - adcx \T3, \T6 - mulx \T6, \T0, 16\M1 // T6:T0 = A1*B2 - adox \T3, \T1 - adcx \T5, \T0 - adcx \T6, rax - adox \T5, \T2 - - mov rdx, 16\M0 - mulx \T1, \T0, \M1 // T1:T0 = A2*B0 - adox \T6, rax - xor rax, rax - mulx \T4, \T2, 8\M1 // T4:T2 = A2*B1 - adox \T0, \T3 - mov 16\C, \T0 // C2_final - adcx \T1, \T5 - mulx \T0, \T3, 16\M1 // T0:T3 = A2*B2 - adcx \T4, \T6 - adcx \T0, rax - adox \T1, \T2 - adox \T3, \T4 - adox \T0, rax - mov 24\C, \T1 // C3_final - mov 32\C, \T3 // C4_final - mov 40\C, \T0 // C5_final -.endm - -.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 - xor rax, rax - adox \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adox \T1, \T3 - mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 - adox \T2, \T4 - - mov rdx, 8\M0 - mulx \T5, \T4, \M1 // T5:T4 = A1*B0 - adox \T3, rax - xor rax, rax - mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 - adox \T4, \T0 - mov 8\C, \T4 // C1_final - adcx \T5, \T7 - mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 - adcx \T6, \T8 - adox \T5, \T1 - mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 - adcx \T7, \T9 - adcx \T8, rax - adox \T6, \T2 - - mov rdx, 16\M0 - mulx \T1, \T0, \M1 // T1:T0 = A2*B0 - adox \T7, \T3 - adox \T8, rax - xor rax, rax - mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 - adox \T0, \T5 - mov 16\C, \T0 // C2_final - adcx \T1, \T3 - mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 - adcx \T2, \T4 - adox \T1, \T6 - mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 - adcx \T3, \T9 - mov rdx, 24\M0 - adcx \T4, rax - - adox \T2, \T7 - adox \T3, \T8 - adox \T4, rax - - mulx \T5, \T0, \M1 // T5:T0 = A3*B0 - xor rax, rax - mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 - adcx \T5, \T7 - adox \T1, \T0 - mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 - adcx \T6, \T8 - adox \T2, \T5 - mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 - adcx \T7, \T9 - adcx \T8, rax - - adox \T3, \T6 - adox \T4, \T7 - adox \T8, rax - mov 24\C, 
\T1 // C3_final - mov 32\C, \T2 // C4_final - mov 40\C, \T3 // C5_final - mov 48\C, \T4 // C6_final - mov 56\C, \T8 // C7_final -.endm - -#else // S2N_ADX - -.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 - add \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adc \T1, \T3 - - mov rdx, 8\M0 - mulx \T3, \T4, \M1 // T3:T4 = A1*B0 - adc \T2, 0 - mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1 - add \T4, \T0 - mov 8\C, \T4 // C1_final - adc \T3, \T1 - adc \T5, \T2 - mulx \T0, \T1, 16\M1 // T0:T1 = A1*B2 - adc \T0, 0 - - add \T3, \T6 - adc \T5, \T1 - adc \T0, 0 - - mov rdx, 16\M0 - mulx \T1, \T2, \M1 // T1:T2 = A2*B0 - add \T2, \T3 - mov 16\C, \T2 // C2_final - mulx \T4, \T6, 8\M1 // T4:T6 = A2*B1 - adc \T1, \T5 - adc \T0, \T4 - mulx \T2, \T3, 16\M1 // T0:T3 = A2*B2 - adc \T2, 0 - add \T1, \T6 - adc \T0, \T3 - adc \T2, 0 - mov 24\C, \T1 // C3_final - mov 32\C, \T0 // C4_final - mov 40\C, \T2 // C5_final -.endm - -.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 - add \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adc \T1, \T3 - mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 - adc \T2, \T4 - mov rdx, 8\M0 - adc \T3, 0 - - mulx \T5, \T4, \M1 // T5:T4 = A1*B0 - mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 - add \T5, \T7 - mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 - adc \T6, \T8 - mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 - adc \T7, \T9 - adc \T8, 0 - - add \T4, \T0 - mov 8\C, \T4 // C1_final - adc \T5, \T1 - adc \T6, \T2 - adc \T7, \T3 - mov rdx, 16\M0 - adc \T8, 0 - - mulx \T1, \T0, \M1 // T1:T0 = A2*B0 - mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 - add \T1, \T3 - mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 - adc \T2, \T4 - mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 - adc \T3, \T9 - mov rdx, 24\M0 - adc \T4, 0 - - add \T0, \T5 - mov 16\C, \T0 // C2_final - adc \T1, \T6 - adc \T2, \T7 - adc \T3, \T8 - adc \T4, 0 - - mulx \T5, \T0, \M1 // T5:T0 = A3*B0 - mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 - add \T5, \T7 - mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 - adc \T6, \T8 - mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 - adc \T7, \T9 - adc \T8, 0 - - add \T1, \T0 - mov 24\C, \T1 // C3_final - adc \T2, \T5 - mov 32\C, \T2 // C4_final - adc \T3, \T6 - mov 40\C, \T3 // C5_final - adc \T4, \T7 - mov 48\C, \T4 // C6_final - adc \T8, 0 - mov 56\C, \T8 // C7_final -.endm -#endif // S2N_ADX - -//***************************************************************************** -// 434-bit multiplication using Karatsuba (one level), schoolbook (one level) -//***************************************************************************** -.global mul434_asm -mul434_asm: - push r12 - push r13 - push r14 - push r15 - mov rcx, reg_p3 - - // r8-r11 <- AH + AL, rax <- mask - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - push rbx - push rbp - sub rsp, 96 - add r8, [reg_p1+32] - adc r9, [reg_p1+40] - adc r10, [reg_p1+48] - adc r11, 0 - sbb rax, 0 - mov [rsp], r8 - mov [rsp+8], r9 - mov [rsp+16], r10 - mov [rsp+24], r11 - - // r12-r15 <- BH + BL, rbx <- mask - xor rbx, rbx - mov r12, [reg_p2] - mov r13, [reg_p2+8] - mov r14, [reg_p2+16] - mov r15, [reg_p2+24] - add r12, [reg_p2+32] - adc r13, [reg_p2+40] - adc r14, [reg_p2+48] - adc r15, 0 - sbb rbx, 0 - mov [rsp+32], r12 - mov [rsp+40], r13 - mov [rsp+48], r14 - mov [rsp+56], r15 - - // r12-r15 <- masked (BH + BL) - and 
r12, rax - and r13, rax - and r14, rax - and r15, rax - - // r8-r11 <- masked (AH + AL) - and r8, rbx - and r9, rbx - and r10, rbx - and r11, rbx - - // r8-r11 <- masked (AH + AL) + masked (BH + BL) - add r8, r12 - adc r9, r13 - adc r10, r14 - adc r11, r15 - mov [rsp+64], r8 - mov [rsp+72], r9 - mov [rsp+80], r10 - mov [rsp+88], r11 - - // [rsp] <- (AH+AL) x (BH+BL), low part - MUL256_SCHOOL [rsp], [rsp+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp - - // [rcx] <- AL x BL - MUL256_SCHOOL [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp // Result C0-C3 - - // [rcx+64] <- AH x BH - MUL192_SCHOOL [reg_p1+32], [reg_p2+32], [rcx+64], r8, r9, r10, r11, r12, r13, r14 - - // r8-r11 <- (AH+AL) x (BH+BL), final step - mov r8, [rsp+64] - mov r9, [rsp+72] - mov r10, [rsp+80] - mov r11, [rsp+88] - mov rax, [rsp+32] - add r8, rax - mov rax, [rsp+40] - adc r9, rax - mov rax, [rsp+48] - adc r10, rax - mov rax, [rsp+56] - adc r11, rax - - // [rsp], x3-x5 <- (AH+AL) x (BH+BL) - ALxBL - mov r12, [rsp] - mov r13, [rsp+8] - mov r14, [rsp+16] - mov r15, [rsp+24] - sub r12, [rcx] - sbb r13, [rcx+8] - sbb r14, [rcx+16] - sbb r15, [rcx+24] - sbb r8, [rcx+32] - sbb r9, [rcx+40] - sbb r10, [rcx+48] - sbb r11, [rcx+56] - - // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH - sub r12, [rcx+64] - sbb r13, [rcx+72] - sbb r14, [rcx+80] - sbb r15, [rcx+88] - sbb r8, [rcx+96] - sbb r9, [rcx+104] - sbb r10, 0 - sbb r11, 0 - - add r12, [rcx+32] - mov [rcx+32], r12 // Result C4-C7 - adc r13, [rcx+40] - mov [rcx+40], r13 - adc r14, [rcx+48] - mov [rcx+48], r14 - adc r15, [rcx+56] - mov [rcx+56], r15 - adc r8, [rcx+64] - mov [rcx+64], r8 // Result C8-C15 - adc r9, [rcx+72] - mov [rcx+72], r9 - adc r10, [rcx+80] - mov [rcx+80], r10 - adc r11, [rcx+88] - mov [rcx+88], r11 - mov r12, [rcx+96] - adc r12, 0 - mov [rcx+96], r12 - mov r13, [rcx+104] - adc r13, 0 - mov [rcx+104], r13 - - add rsp, 96 - pop rbp - pop rbx - pop r15 - pop r14 - pop r13 - pop r12 - ret - -///////////////////////////////////////////////////////////////// MACRO -// Schoolbook integer multiplication -// Inputs: memory pointers M0 and M1 -// Outputs: regs T0:T5 -// Temps: regs T7:T6 -///////////////////////////////////////////////////////////////// -.macro MUL64x256_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5 - mov rdx, \M0 - mulx \T1, \T0, \M1 // T0 <- C0_final - mulx \T2, \T4, 8\M1 - xor rax, rax - mulx \T3, \T5, 16\M1 - ADD1 \T1, \T4 // T1 <- C1_final - ADC1 \T2, \T5 // T2 <- C2_final - mulx \T4, \T5, 24\M1 - ADC1 \T3, \T5 // T3 <- C3_final - ADC1 \T4, rax // T4 <- C4_final -.endm - -#ifdef S2N_ADX -.macro MUL128x256_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5, T6 - mov rdx, \M0 - mulx \T1, \T0, \M1 // T0 <- C0_final - mulx \T2, \T4, 8\M1 - xor rax, rax - mulx \T3, \T5, 16\M1 - ADD1 \T1, \T4 - ADC1 \T2, \T5 - mulx \T4, \T5, 24\M1 - ADC1 \T3, \T5 - ADC1 \T4, rax - - xor rax, rax - mov rdx, 8\M0 - mulx \T6, \T5, \M1 - ADD2 \T1, \T5 // T1 <- C1_final - ADC2 \T2, \T6 - mulx \T5, \T6, 8\M1 - ADC2 \T3, \T5 - ADD1 \T2, \T6 - mulx \T5, \T6, 16\M1 - ADC2 \T4, \T5 - ADC1 \T3, \T6 - mulx \T5, \T6, 24\M1 - ADC2 \T5, rax - ADC1 \T4, \T6 - ADC1 \T5, rax -.endm - -#else // S2N_ADX - -.macro MUL128x256_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5, T6 - mov rdx, \M0 - mulx \T1, \T0, \M1 // T0 <- C0_final - mulx \T2, \T4, 8\M1 - mulx \T3, \T5, 16\M1 - add \T1, \T4 - adc \T2, \T5 - mulx \T4, \T5, 24\M1 - adc \T3, \T5 - adc \T4, 0 - - mov rdx, 8\M0 - mulx \T6, \T5, \M1 - add \T1, \T5 // T1 <- C1_final - adc \T2, \T6 - mulx \T5, \T6, 8\M1 - adc \T3, \T5 - mulx 
\T5, rax, 16\M1 - adc \T4, \T5 - mulx \T5, rdx, 24\M1 - adc \T5, 0 - add \T2, \T6 - adc \T3, rax - adc \T4, rdx - adc \T5, 0 -.endm -#endif // S2N_ADX - -//************************************************************************************** -// Montgomery reduction -// Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015 -// Operation: c [reg_p2] = a [reg_p1] -// NOTE: a=c is not allowed -//************************************************************************************** -.global rdc434_asm -rdc434_asm: - push r12 - push r13 - - // a[0-1] x p434p1_nz --> result: r8:r13 - MUL128x256_SCHOOL [reg_p1], [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13, rcx - - xor rcx, rcx - add r8, [reg_p1+24] - adc r9, [reg_p1+32] - adc r10, [reg_p1+40] - adc r11, [reg_p1+48] - adc r12, [reg_p1+56] - adc r13, [reg_p1+64] - adc rcx, [reg_p1+72] - mov [reg_p1+24], r8 - mov [reg_p1+32], r9 - mov [reg_p1+40], r10 - mov [reg_p1+48], r11 - mov [reg_p1+56], r12 - mov [reg_p1+64], r13 - mov [reg_p1+72], rcx - mov r8, [reg_p1+80] - mov r9, [reg_p1+88] - mov r10, [reg_p1+96] - mov r11, [reg_p1+104] - adc r8, 0 - adc r9, 0 - adc r10, 0 - adc r11, 0 - mov [reg_p1+80], r8 - mov [reg_p1+88], r9 - mov [reg_p1+96], r10 - mov [reg_p1+104], r11 - - // a[2-3] x p434p1_nz --> result: r8:r13 - MUL128x256_SCHOOL [reg_p1+16], [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13, rcx - - xor rcx, rcx - add r8, [reg_p1+40] - adc r9, [reg_p1+48] - adc r10, [reg_p1+56] - adc r11, [reg_p1+64] - adc r12, [reg_p1+72] - adc r13, [reg_p1+80] - adc rcx, [reg_p1+88] - mov [reg_p1+40], r8 - mov [reg_p1+48], r9 - mov [reg_p1+56], r10 - mov [reg_p1+64], r11 - mov [reg_p1+72], r12 - mov [reg_p1+80], r13 - mov [reg_p1+88], rcx - mov r8, [reg_p1+96] - mov r9, [reg_p1+104] - adc r8, 0 - adc r9, 0 - mov [reg_p1+96], r8 - mov [reg_p1+104], r9 - - // a[4-5] x p434p1_nz --> result: r8:r13 - MUL128x256_SCHOOL [reg_p1+32], [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13, rcx - - xor rcx, rcx - add r8, [reg_p1+56] - adc r9, [reg_p1+64] - adc r10, [reg_p1+72] - adc r11, [reg_p1+80] - adc r12, [reg_p1+88] - adc r13, [reg_p1+96] - adc rcx, [reg_p1+104] - mov [reg_p2], r8 // Final result c0-c1 - mov [reg_p2+8], r9 - mov [reg_p1+72], r10 - mov [reg_p1+80], r11 - mov [reg_p1+88], r12 - mov [reg_p1+96], r13 - mov [reg_p1+104], rcx - - // a[6-7] x p434p1_nz --> result: r8:r12 - MUL64x256_SCHOOL [reg_p1+48], [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13 - - // Final result c2:c6 - add r8, [reg_p1+72] - adc r9, [reg_p1+80] - adc r10, [reg_p1+88] - adc r11, [reg_p1+96] - adc r12, [reg_p1+104] - mov [reg_p2+16], r8 - mov [reg_p2+24], r9 - mov [reg_p2+32], r10 - mov [reg_p2+40], r11 - mov [reg_p2+48], r12 - - pop r13 - pop r12 - ret - -//*********************************************************************** -// 434-bit multiprecision addition -// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] -//*********************************************************************** -.global mp_add434_asm -mp_add434_asm: - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - add r8, [reg_p2] - adc r9, [reg_p2+8] - adc r10, [reg_p2+16] - adc r11, [reg_p2+24] - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - - mov r8, [reg_p1+32] - mov r9, [reg_p1+40] - mov r10, [reg_p1+48] - adc r8, [reg_p2+32] - adc r9, [reg_p2+40] - adc r10, [reg_p2+48] - mov [reg_p3+32], r8 - mov [reg_p3+40], r9 - mov [reg_p3+48], r10 - ret - 
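Note on the mp_add434_asm routine in the assembly diff above: it is a plain seven-word add-with-carry chain (one add followed by six adc), and the final carry is simply dropped, which is safe as long as both operands stay below 2*p434 (their sum then fits comfortably in 448 bits). For readers following the diff without x86-64 background, a minimal C sketch of the same chain is given below. It is an illustration only, not code from this commit; the identifiers mp_add7 and NWORDS are invented for the sketch, but the carry handling mirrors the S2N_SIKE_P434_R3_ADDC macro introduced later in this diff in sikep434r3.h.

#include <stdint.h>

#define NWORDS 7          /* 7 x 64-bit digits hold a 434-bit field element (448 bits of storage) */
typedef uint64_t digit_t;

/* c = a + b over NWORDS digits, returning the carry out of the top digit.
 * Each iteration corresponds to one add/adc step of mp_add434_asm. */
unsigned int mp_add7(const digit_t *a, const digit_t *b, digit_t *c)
{
    unsigned int carry = 0;
    for (int i = 0; i < NWORDS; i++) {
        digit_t t = a[i] + (digit_t)carry;          /* fold in the incoming carry      */
        c[i] = b[i] + t;                            /* add the second operand          */
        carry = (t < (digit_t)carry) | (c[i] < t);  /* carry out if either add wrapped */
    }
    return carry;                                   /* mp_add434_asm discards this bit */
}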
-//*********************************************************************** -// 2x434-bit multiprecision subtraction/addition -// Operation: c [x2] = a [x0] - b [x1]. If c < 0, add p434*2^448 -//*********************************************************************** -.global mp_subadd434x2_asm -mp_subadd434x2_asm: - push r12 - push r13 - push r14 - push r15 - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - - mov r8, [reg_p1+40] - mov r9, [reg_p1+48] - mov r10, [reg_p1+56] - mov r11, [reg_p1+64] - mov r12, [reg_p1+72] - sbb r8, [reg_p2+40] - sbb r9, [reg_p2+48] - sbb r10, [reg_p2+56] - sbb r11, [reg_p2+64] - sbb r12, [reg_p2+72] - mov [reg_p3+40], r8 - mov [reg_p3+48], r9 - mov [reg_p3+56], r10 - - mov r13, [reg_p1+80] - mov r14, [reg_p1+88] - mov r15, [reg_p1+96] - mov rcx, [reg_p1+104] - sbb r13, [reg_p2+80] - sbb r14, [reg_p2+88] - sbb r15, [reg_p2+96] - sbb rcx, [reg_p2+104] - sbb rax, 0 - - // Add p434 anded with the mask in rax - mov r8, [rip+asm_p434] - mov r9, [rip+asm_p434+24] - mov r10, [rip+asm_p434+32] - mov rdi, [rip+asm_p434+40] - mov rsi, [rip+asm_p434+48] - and r8, rax - and r9, rax - and r10, rax - and rdi, rax - and rsi, rax - mov rax, [reg_p3+56] - add rax, r8 - adc r11, r8 - adc r12, r8 - adc r13, r9 - adc r14, r10 - adc r15, rdi - adc rcx, rsi - - mov [reg_p3+56], rax - mov [reg_p3+64], r11 - mov [reg_p3+72], r12 - mov [reg_p3+80], r13 - mov [reg_p3+88], r14 - mov [reg_p3+96], r15 - mov [reg_p3+104], rcx - pop r15 - pop r14 - pop r13 - pop r12 - ret - -//*********************************************************************** -// Double 2x434-bit multiprecision subtraction -// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2] -//*********************************************************************** -.global mp_dblsub434x2_asm -mp_dblsub434x2_asm: - push r12 - push r13 - - xor rax, rax - mov r8, [reg_p3] - mov r9, [reg_p3+8] - mov r10, [reg_p3+16] - mov r11, [reg_p3+24] - mov r12, [reg_p3+32] - mov r13, [reg_p3+40] - mov rcx, [reg_p3+48] - sub r8, [reg_p1] - sbb r9, [reg_p1+8] - sbb r10, [reg_p1+16] - sbb r11, [reg_p1+24] - sbb r12, [reg_p1+32] - sbb r13, [reg_p1+40] - sbb rcx, [reg_p1+48] - adc rax, 0 - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - sbb r13, [reg_p2+40] - sbb rcx, [reg_p2+48] - adc rax, 0 - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], rcx - - mov r8, [reg_p3+56] - mov r9, [reg_p3+64] - mov r10, [reg_p3+72] - mov r11, [reg_p3+80] - mov r12, [reg_p3+88] - mov r13, [reg_p3+96] - mov rcx, [reg_p3+104] - sub r8, rax - sbb r8, [reg_p1+56] - sbb r9, [reg_p1+64] - sbb r10, [reg_p1+72] - sbb r11, [reg_p1+80] - sbb r12, [reg_p1+88] - sbb r13, [reg_p1+96] - sbb rcx, [reg_p1+104] - sub r8, [reg_p2+56] - sbb r9, [reg_p2+64] - sbb r10, [reg_p2+72] - sbb r11, [reg_p2+80] - sbb r12, [reg_p2+88] - sbb r13, [reg_p2+96] - sbb rcx, [reg_p2+104] - mov [reg_p3+56], r8 - mov [reg_p3+64], r9 - mov [reg_p3+72], r10 - mov [reg_p3+80], r11 - mov [reg_p3+88], r12 - mov [reg_p3+96], r13 - mov [reg_p3+104], rcx - - pop r13 - pop r12 - ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.c 
b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.c new file mode 100644 index 0000000000..7ce71ae3d3 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.c @@ -0,0 +1,146 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: supersingular isogeny parameters and generation of functions for P434 +*********************************************************************************************/ + +#include "sikep434r3.h" + +/* Encoding of field elements, elements over Z_order, elements over GF(p^2) and elliptic curve points: + * + * Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at + * the leftmost position (i.e., little endian format). Elements (a+b*i) over GF(p^2), where a and b are + * defined over GF(p), are encoded as {a, b}, with a in the least significant position. Elliptic curve + * points P = (x,y) are encoded as {x, y}, with x in the least significant position. Internally, the + * number of digits used to represent all these elements is obtained by approximating the number of bits + * to the immediately greater multiple of 32. For example, a 434-bit field element is represented with + * Ceil(434 / 64) = 7 64-bit digits or Ceil(434 / 32) = 14 32-bit digits. + * + * Curve isogeny system "SIDHp434". Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over + * GF(p434^2), where A=6, B=1, C=1 and p434 = 2^216*3^137-1 */ + +const uint64_t p434[S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, + 0xFDC1767AE2FFFFFF, 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, + 0x0002341F27177344 +}; + +const uint64_t p434x2[S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, + 0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47, 0xD9F8BFAD038A40AC, + 0x0004683E4E2EE688 +}; + +const uint64_t p434x4[S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, + 0xF705D9EB8BFFFFFF, 0xEF1971E0C562BA8F, 0xB3F17F5A07148159, + 0x0008D07C9C5DCD11 +}; + +const uint64_t p434p1[S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0xFDC1767AE3000000, 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, + 0x0002341F27177344 +}; + +/* Alice's generator values {XPA0 + XPA1*i, XQA0 + xQA1*i, XRA0 + XRA1*i} in GF(p434^2), + * expressed in Montgomery representation */ +const uint64_t A_gen[6*S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0x05ADF455C5C345BF, 0x91935C5CC767AC2B, 0xAFE4E879951F0257, + 0x70E792DC89FA27B1, 0xF797F526BB48C8CD, 0x2181DB6131AF621F, + 0x00000A1C08B1ECC4, /* XPA0 */ + + 0x74840EB87CDA7788, 0x2971AA0ECF9F9D0B, 0xCB5732BDF41715D5, + 0x8CD8E51F7AACFFAA, 0xA7F424730D7E419F, 0xD671EB919A179E8C, + 0x0000FFA26C5A924A, /* XPA1 */ + + 0xFEC6E64588B7273B, 0xD2A626D74CBBF1C6, 0xF8F58F07A78098C7, + 0xE23941F470841B03, 0x1B63EDA2045538DD, 0x735CFEB0FFD49215, + 0x0001C4CB77542876, /* XQA0 */ + + 0xADB0F733C17FFDD6, 0x6AFFBD037DA0A050, 0x680EC43DB144E02F, + 0x1E2E5D5FF524E374, 0xE2DDA115260E2995, 0xA6E4B552E2EDE508, + 0x00018ECCDDF4B53E, /* XQA1 */ + + 0x01BA4DB518CD6C7D, 0x2CB0251FE3CC0611, 0x259B0C6949A9121B, + 0x60E17AC16D2F82AD, 0x3AA41F1CE175D92D, 0x413FBE6A9B9BC4F3, + 0x00022A81D8D55643, /* XRA0 */ + + 0xB8ADBC70FC82E54A, 0xEF9CDDB0D5FADDED, 0x5820C734C80096A0, + 0x7799994BAA96E0E4, 0x044961599E379AF8, 0xDB2B94FBF09F27E2, + 0x0000B87FC716C0C6 /* XRA1 */ +}; + +/* Bob's 
generator values {XPB0, XQB0, XRB0 + XRB1*i} in GF(p434^2), expressed in Montgomery representation */ +const uint64_t B_gen[6*S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0x6E5497556EDD48A3, 0x2A61B501546F1C05, 0xEB919446D049887D, + 0x5864A4A69D450C4F, 0xB883F276A6490D2B, 0x22CC287022D5F5B9, + 0x0001BED4772E551F, /* XPB0 */ + + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, /* XPB1 */ + + 0xFAE2A3F93D8B6B8E, 0x494871F51700FE1C, 0xEF1A94228413C27C, + 0x498FF4A4AF60BD62, 0xB00AD2A708267E8A, 0xF4328294E017837F, + 0x000034080181D8AE, /* XQB0 */ + + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, /* XQB1 */ + + 0x283B34FAFEFDC8E4, 0x9208F44977C3E647, 0x7DEAE962816F4E9A, + 0x68A2BA8AA262EC9D, 0x8176F112EA43F45B, 0x02106D022634F504, + 0x00007E8A50F02E37, /* XRB0 */ + + 0xB378B7C1DA22CCB1, 0x6D089C99AD1D9230, 0xEBE15711813E2369, + 0x2B35A68239D48A53, 0x445F6FD138407C93, 0xBEF93B29A3F6B54B, + 0x000173FA910377D3 /* XRB1 */ +}; + +/* Montgomery constant Montgomery_R2 = (2^448)^2 mod p434 */ +const uint64_t Montgomery_R2[S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0x28E55B65DCD69B30, 0xACEC7367768798C2, 0xAB27973F8311688D, + 0x175CC6AF8D6C7C0B, 0xABCD92BF2DDE347E, 0x69E16A61C7686D9A, + 0x000025A89BCDD12A +}; + +/* Value one in Montgomery representation */ +const uint64_t Montgomery_one[S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0x000000000000742C, 0x0000000000000000, 0x0000000000000000, + 0xB90FF404FC000000, 0xD801A4FB559FACD4, 0xE93254545F77410C, + 0x0000ECEEA7BD2EDA +}; + +/* Fixed parameters for isogeny tree computation */ +const unsigned int strat_Alice[S2N_SIKE_P434_R3_MAX_ALICE-1] = { + 48, 28, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 13, 7, 4, + 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 4, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 21, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, + 1, 1, 1, 1, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1 +}; + +const unsigned int strat_Bob[S2N_SIKE_P434_R3_MAX_BOB-1] = { + 66, 33, 17, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, + 2, 1, 1, 16, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 32, + 16, 8, 4, 3, 1, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, + 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1 +}; + +/* Returns true if the machine is big endian */ +bool is_big_endian() +{ + uint16_t i = 1; + uint8_t *ptr = (uint8_t *)&i; + return !(*ptr); +} + +uint32_t bswap32(uint32_t x) +{ + uint32_t i = (x >> 16) | (x << 16); + return ((i & UINT32_C(0xff00ff00)) >> 8) | ((i & UINT32_C(0x00ff00ff)) << 8); +} + +uint64_t bswap64(uint64_t x) +{ + return bswap32(x >> 32) | (((uint64_t)bswap32(x)) << 32); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.h new file mode 100644 index 0000000000..5b797b1d7f --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.h @@ -0,0 +1,181 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: supersingular isogeny parameters, generation of functions for P434; +* configuration and 
platform-dependent macros +*********************************************************************************************/ + +#pragma once + +#include <stdint.h> +#include <stdbool.h> +#include <stddef.h> + +/* All sikep434r3 functions and global variables in the pq-crypto/sike_r3 directory + * should be defined using this namespace macro to avoid symbol collisions. For example, + * in foo.h, declare a function as follows: + * + * #define foo_function S2N_SIKE_P434_R3_NAMESPACE(foo_function) + * int foo_function(int foo_argument); */ +#define S2N_SIKE_P434_R3_NAMESPACE(s) s2n_sike_p434_r3_##s + +/* Endian-related functionality */ +/* Returns true if the machine is big endian */ +#define is_big_endian S2N_SIKE_P434_R3_NAMESPACE(is_big_endian) +bool is_big_endian(void); + +#define bswap32 S2N_SIKE_P434_R3_NAMESPACE(bswap32) +uint32_t bswap32(uint32_t x); + +#define bswap64 S2N_SIKE_P434_R3_NAMESPACE(bswap64) +uint64_t bswap64(uint64_t x); + +/* Arch specific definitions */ +#define digit_t S2N_SIKE_P434_R3_NAMESPACE(digit_t) +#define hdigit_t S2N_SIKE_P434_R3_NAMESPACE(hdigit_t) +#if defined(_AMD64_) || defined(__x86_64) || defined(__x86_64__) || defined(__aarch64__) || defined(_S390X_) || defined(_ARM64_) || defined(__powerpc64__) || (defined(__riscv) && (__riscv_xlen == 64)) + #define S2N_SIKE_P434_R3_NWORDS_FIELD 7 /* Number of words of a 434-bit field element */ + #define S2N_SIKE_P434_R3_ZERO_WORDS 3 /* Number of "0" digits in the least significant part of p434 + 1 */ + #define S2N_SIKE_P434_R3_RADIX 64 + #define S2N_SIKE_P434_R3_LOG2RADIX 6 + #define S2N_SIKE_P434_R3_BSWAP_DIGIT(i) bswap64((i)) + typedef uint64_t digit_t; + typedef uint32_t hdigit_t; +#elif defined(_X86_) || defined(_ARM_) || defined(__arm__) || defined(__i386__) + #define S2N_SIKE_P434_R3_NWORDS_FIELD 14 /* Number of words of a 434-bit field element */ + #define S2N_SIKE_P434_R3_ZERO_WORDS 6 /* Number of "0" digits in the least significant part of p434 + 1 */ + #define S2N_SIKE_P434_R3_RADIX 32 + #define S2N_SIKE_P434_R3_LOG2RADIX 5 + #define S2N_SIKE_P434_R3_BSWAP_DIGIT(i) bswap32((i)) + typedef uint32_t digit_t; + typedef uint16_t hdigit_t; +#else + #error -- "Unsupported ARCHITECTURE" +#endif + +/* Basic constants */ +#define S2N_SIKE_P434_R3_NBITS_FIELD 434 +#define S2N_SIKE_P434_R3_MAXBITS_FIELD 448 +/* Number of 64-bit words of a 434-bit field element */ +#define S2N_SIKE_P434_R3_NWORDS64_FIELD ((S2N_SIKE_P434_R3_NBITS_FIELD+63)/64) +#define S2N_SIKE_P434_R3_NBITS_ORDER 256 +/* Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp. 
*/ +#define S2N_SIKE_P434_R3_NWORDS_ORDER ((S2N_SIKE_P434_R3_NBITS_ORDER+S2N_SIKE_P434_R3_RADIX-1)/S2N_SIKE_P434_R3_RADIX) +#define S2N_SIKE_P434_R3_ALICE 0 +#define S2N_SIKE_P434_R3_BOB 1 +#define S2N_SIKE_P434_R3_OALICE_BITS 216 +#define S2N_SIKE_P434_R3_OBOB_BITS 218 +#define S2N_SIKE_P434_R3_MASK_ALICE 0xFF +#define S2N_SIKE_P434_R3_MASK_BOB 0x01 + +/* Fixed parameters for isogeny tree computation */ +#define S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE 7 +#define S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB 8 +#define S2N_SIKE_P434_R3_MAX_ALICE 108 +#define S2N_SIKE_P434_R3_MAX_BOB 137 +#define S2N_SIKE_P434_R3_MSG_BYTES 16 +#define S2N_SIKE_P434_R3_SECRETKEY_A_BYTES ((S2N_SIKE_P434_R3_OALICE_BITS + 7) / 8) +#define S2N_SIKE_P434_R3_SECRETKEY_B_BYTES ((S2N_SIKE_P434_R3_OBOB_BITS - 1 + 7) / 8) +#define S2N_SIKE_P434_R3_FP2_ENCODED_BYTES (2 * ((S2N_SIKE_P434_R3_NBITS_FIELD + 7) / 8)) + +/* SIDH's basic element definitions and point representations */ +/* Datatype for representing 434-bit field elements (448-bit max.) */ +#define felm_t S2N_SIKE_P434_R3_NAMESPACE(felm_t) +typedef digit_t felm_t[S2N_SIKE_P434_R3_NWORDS_FIELD]; + +/* Datatype for representing double-precision 2x434-bit field elements (2x448-bit max.) */ +#define dfelm_t S2N_SIKE_P434_R3_NAMESPACE(dfelm_t) +typedef digit_t dfelm_t[2*S2N_SIKE_P434_R3_NWORDS_FIELD]; + +/* Datatype for representing quadratic extension field elements GF(p434^2) */ +#define f2elm_t S2N_SIKE_P434_R3_NAMESPACE(f2elm_t) +#define felm_s S2N_SIKE_P434_R3_NAMESPACE(felm_s) +typedef struct felm_s { + felm_t e[2]; +} f2elm_t; + +/* Point representation in projective XZ Montgomery coordinates. */ +#define point_proj S2N_SIKE_P434_R3_NAMESPACE(point_proj) +typedef struct { f2elm_t X; f2elm_t Z; } point_proj; +#define point_proj_t S2N_SIKE_P434_R3_NAMESPACE(point_proj_t) +typedef point_proj point_proj_t[1]; + +/* Macro to avoid compiler warnings when detecting unreferenced parameters */ +#define S2N_SIKE_P434_R3_UNREFERENCED_PARAMETER(PAR) ((void)(PAR)) + +/********************** Constant-time unsigned comparisons ***********************/ +/* The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise */ + +/* Is x != 0? */ +static __inline unsigned int is_digit_nonzero_ct(const digit_t x) +{ + return (unsigned int)((x | (0-x)) >> (S2N_SIKE_P434_R3_RADIX-1)); +} + +/* Is x = 0? */ +static __inline unsigned int is_digit_zero_ct(const digit_t x) +{ + return (unsigned int)(1 ^ is_digit_nonzero_ct(x)); +} + +/* Is x < y? 
*/ +static __inline unsigned int is_digit_lessthan_ct(const digit_t x, const digit_t y) +{ + return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (S2N_SIKE_P434_R3_RADIX-1)); +} + +/* Definitions for generic C implementation */ + +typedef uint64_t uint128_t[2]; + +/* Digit multiplication */ +#define S2N_SIKE_P434_R3_MUL(multiplier, multiplicand, hi, lo) \ + digit_x_digit((multiplier), (multiplicand), &(lo)); + +/* Digit addition with carry */ +#define S2N_SIKE_P434_R3_ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ + { digit_t tempReg = (addend1) + (digit_t)(carryIn); \ + (sumOut) = (addend2) + tempReg; \ + (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); } + +/* Digit subtraction with borrow */ +#define S2N_SIKE_P434_R3_SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ + { digit_t tempReg = (minuend) - (subtrahend); \ + unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg))); \ + (differenceOut) = tempReg - (digit_t)(borrowIn); \ + (borrowOut) = borrowReg; } + +/* Shift right with flexible datatype */ +#define S2N_SIKE_P434_R3_SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ + (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << ((DigitSize) - (shift))); + +/* Fixed parameters for computation */ +#define p434 S2N_SIKE_P434_R3_NAMESPACE(p434) +extern const uint64_t p434[S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define p434x2 S2N_SIKE_P434_R3_NAMESPACE(p434x2) +extern const uint64_t p434x2[S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define p434x4 S2N_SIKE_P434_R3_NAMESPACE(p434x4) +extern const uint64_t p434x4[S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define p434p1 S2N_SIKE_P434_R3_NAMESPACE(p434p1) +extern const uint64_t p434p1[S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define A_gen S2N_SIKE_P434_R3_NAMESPACE(A_gen) +extern const uint64_t A_gen[6*S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define B_gen S2N_SIKE_P434_R3_NAMESPACE(B_gen) +extern const uint64_t B_gen[6*S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define Montgomery_R2 S2N_SIKE_P434_R3_NAMESPACE(Montgomery_R2) +extern const uint64_t Montgomery_R2[S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define Montgomery_one S2N_SIKE_P434_R3_NAMESPACE(Montgomery_one) +extern const uint64_t Montgomery_one[S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define strat_Alice S2N_SIKE_P434_R3_NAMESPACE(strat_Alice) +extern const unsigned int strat_Alice[S2N_SIKE_P434_R3_MAX_ALICE-1]; + +#define strat_Bob S2N_SIKE_P434_R3_NAMESPACE(strat_Bob) +extern const unsigned int strat_Bob[S2N_SIKE_P434_R3_MAX_BOB-1]; diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_api.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_api.h new file mode 100644 index 0000000000..cf3c4feb85 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_api.h @@ -0,0 +1,78 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: API header file for P434 +*********************************************************************************************/ + +#pragma once + +#include "sikep434r3.h" + +/*********************** Key encapsulation mechanism API ***********************/ +/* Encoding of keys for KEM-based isogeny system "SIKEp434" (wire format): + * + * Elements over GF(p434) are encoded in 55 octets in little endian format (i.e., the least + * significant octet is located in the lowest memory address). 
Elements (a+b*i) over GF(p434^2), + * where a and b are defined over GF(p434), are encoded as {a, b}, with a in the lowest memory portion. + * + * Private keys sk consist of the concatenation of a 16-byte random value, a value in the range + * [0, 2^217-1] and the public key pk. In the SIKE API, private keys are encoded in 374 octets in + * little endian format. Public keys pk consist of 3 elements in GF(p434^2). In the SIKE API, pk + * is encoded in 330 octets. Ciphertexts ct consist of the concatenation of a public key value + * and a 16-byte value. In the SIKE API, ct is encoded in 330 + 16 = 346 octets. Shared keys ss + * consist of a value of 16 octets. */ + +/*********************** Ephemeral key exchange API ***********************/ + +/* SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. It is NOT secure to use + * it with static keys. See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith, + * C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016. Extended version available at: + * http://eprint.iacr.org/2016/859 */ + +/* Generation of Bob's secret key + * Outputs random value in [0, 2^Floor(Log(2,3^137)) - 1] to be used as Bob's private key */ +#define random_mod_order_B S2N_SIKE_P434_R3_NAMESPACE(random_mod_order_B) +int random_mod_order_B(unsigned char* random_digits); + +/* Alice's ephemeral public key generation + * Input: a private key PrivateKeyA in the range [0, 2^216 - 1], stored in 27 bytes. + * Output: the public key PublicKeyA consisting of 3 GF(p434^2) elements encoded in 330 bytes. */ +#define EphemeralKeyGeneration_A S2N_SIKE_P434_R3_NAMESPACE(EphemeralKeyGeneration_A) +int EphemeralKeyGeneration_A(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA); + +/* Bob's ephemeral key-pair generation + * It produces a private key PrivateKeyB and computes the public key PublicKeyB. + * The private key is an integer in the range [0, 2^Floor(Log(2,3^137)) - 1], stored in 28 bytes. + * The public key consists of 3 GF(p434^2) elements encoded in 330 bytes. */ +#define EphemeralKeyGeneration_B S2N_SIKE_P434_R3_NAMESPACE(EphemeralKeyGeneration_B) +int EphemeralKeyGeneration_B(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB); + +/* Alice's ephemeral shared secret computation + * It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB + * Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^216 - 1], stored in 27 bytes. + * Bob's PublicKeyB consists of 3 GF(p434^2) elements encoded in 330 bytes. + * Output: a shared secret SharedSecretA that consists of one element in GF(p434^2) encoded in 110 bytes. */ +#define EphemeralSecretAgreement_A S2N_SIKE_P434_R3_NAMESPACE(EphemeralSecretAgreement_A) +int EphemeralSecretAgreement_A(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB, unsigned char* SharedSecretA); + +/* Bob's ephemeral shared secret computation + * It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA + * Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^137)) - 1], stored in 28 bytes. + * Alice's PublicKeyA consists of 3 GF(p434^2) elements encoded in 330 bytes. + * Output: a shared secret SharedSecretB that consists of one element in GF(p434^2) encoded in 110 bytes. 
*/ +#define EphemeralSecretAgreement_B S2N_SIKE_P434_R3_NAMESPACE(EphemeralSecretAgreement_B) +int EphemeralSecretAgreement_B(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA, unsigned char* SharedSecretB); + +/* Encoding of keys for KEX-based isogeny system "SIDHp434" (wire format): + * + * Elements over GF(p434) are encoded in 55 octets in little endian format (i.e., the + * least significant octet is located in the lowest memory address). Elements (a+b*i) + * over GF(p434^2), where a and b are defined over GF(p434), are encoded as {a, b}, with + * a in the lowest memory portion. + * + * Private keys PrivateKeyA and PrivateKeyB can have values in the range [0, 2^216-1] and + * [0, 2^Floor(Log(2,3^137)) - 1], resp. In the SIDH API, Alice's and Bob's private keys + * are encoded in 27 and 28 octets, resp., in little endian format. Public keys PublicKeyA + * and PublicKeyB consist of 3 elements in GF(p434^2). In the SIDH API, they are encoded in + * 330 octets. Shared keys SharedSecretA and SharedSecretB consist of one element in GF(p434^2). + * In the SIDH API, they are encoded in 110 octets. */ diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.c new file mode 100644 index 0000000000..e5ae4e7c7e --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.c @@ -0,0 +1,348 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: elliptic curve and isogeny functions +*********************************************************************************************/ + +#include "sikep434r3.h" +#include "sikep434r3_fpx.h" +#include "sikep434r3_ec_isogeny.h" + +/* Doubling of a Montgomery point in projective coordinates (X:Z). + * Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C. + * Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2). */ +void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24) +{ + f2elm_t _t0, _t1; + f2elm_t *t0=&_t0, *t1=&_t1; + + mp2_sub_p2(&P->X, &P->Z, t0); /* t0 = X1-Z1 */ + mp2_add(&P->X, &P->Z, t1); /* t1 = X1+Z1 */ + fp2sqr_mont(t0, t0); /* t0 = (X1-Z1)^2 */ + fp2sqr_mont(t1, t1); /* t1 = (X1+Z1)^2 */ + fp2mul_mont(C24, t0, &Q->Z); /* Z2 = C24*(X1-Z1)^2 */ + fp2mul_mont(t1, &Q->Z, &Q->X); /* X2 = C24*(X1-Z1)^2*(X1+Z1)^2 */ + mp2_sub_p2(t1, t0, t1); /* t1 = (X1+Z1)^2-(X1-Z1)^2 */ + fp2mul_mont(A24plus, t1, t0); /* t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] */ + mp2_add(&Q->Z, t0, &Q->Z); /* Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2 */ + fp2mul_mont(&Q->Z, t1, &Q->Z); /* Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2] */ +} + +/* Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. + * Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C. + * Output: projective Montgomery x-coordinates Q <- (2^e)*P. */ +void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24, const int e) +{ + int i; + + copy_words((const digit_t*)P, (digit_t*)Q, 2*2*S2N_SIKE_P434_R3_NWORDS_FIELD); + + for (i = 0; i < e; i++) { + xDBL(Q, Q, A24plus, C24); + } +} + +/* Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. 
+ * Input: projective point of order four P = (X4:Z4). + * Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients + * that are used to evaluate the isogeny at a point in eval_4_isog(). */ +void get_4_isog(const point_proj_t P, f2elm_t *A24plus, f2elm_t *C24, f2elm_t *coeff) +{ + mp2_sub_p2(&P->X, &P->Z, &coeff[1]); /* coeff[1] = X4-Z4 */ + mp2_add(&P->X, &P->Z, &coeff[2]); /* coeff[2] = X4+Z4 */ + fp2sqr_mont(&P->Z, &coeff[0]); /* coeff[0] = Z4^2 */ + mp2_add(&coeff[0], &coeff[0], &coeff[0]); /* coeff[0] = 2*Z4^2 */ + fp2sqr_mont(&coeff[0], C24); /* C24 = 4*Z4^4 */ + mp2_add(&coeff[0], &coeff[0], &coeff[0]); /* coeff[0] = 4*Z4^2 */ + fp2sqr_mont(&P->X, A24plus); /* A24plus = X4^2 */ + mp2_add(A24plus, A24plus, A24plus); /* A24plus = 2*X4^2 */ + fp2sqr_mont(A24plus, A24plus); /* A24plus = 4*X4^4 */ +} + +/* Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined + * by the 3 coefficients in coeff (computed in the function get_4_isog()). + * Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z). + * Output: the projective point P = phi(P) = (X:Z) in the codomain. */ +void eval_4_isog(point_proj_t P, f2elm_t *coeff) +{ + f2elm_t _t0, _t1; + f2elm_t *t0=&_t0, *t1=&_t1; + + mp2_add(&P->X, &P->Z, t0); /* t0 = X+Z */ + mp2_sub_p2(&P->X, &P->Z, t1); /* t1 = X-Z */ + fp2mul_mont(t0, &coeff[1], &P->X); /* X = (X+Z)*coeff[1] */ + fp2mul_mont(t1, &coeff[2], &P->Z); /* Z = (X-Z)*coeff[2] */ + fp2mul_mont(t0, t1, t0); /* t0 = (X+Z)*(X-Z) */ + fp2mul_mont(&coeff[0], t0, t0); /* t0 = coeff[0]*(X+Z)*(X-Z) */ + mp2_add(&P->X, &P->Z, t1); /* t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1] */ + mp2_sub_p2(&P->X, &P->Z, &P->Z); /* Z = (X-Z)*coeff[2] - (X+Z)*coeff[1] */ + fp2sqr_mont(t1, t1); /* t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 */ + fp2sqr_mont(&P->Z, &P->Z); /* Z = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 */ + mp2_add(t1, t0, &P->X); /* X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 */ + mp2_sub_p2(&P->Z, t0, t0); /* t0 = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z) */ + fp2mul_mont(&P->X, t1, &P->X); /* Xfinal */ + fp2mul_mont(&P->Z, t0, &P->Z); /* Zfinal */ +} + +/* Tripling of a Montgomery point in projective coordinates (X:Z). + * Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. + * Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3). 
*/ +void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus) +{ + f2elm_t _t0, _t1, _t2, _t3, _t4, _t5, _t6; + f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3, *t4=&_t4, *t5=&_t5, *t6=&_t6; + + mp2_sub_p2(&P->X, &P->Z, t0); /* t0 = X-Z */ + fp2sqr_mont(t0, t2); /* t2 = (X-Z)^2 */ + mp2_add(&P->X, &P->Z, t1); /* t1 = X+Z */ + fp2sqr_mont(t1, t3); /* t3 = (X+Z)^2 */ + mp2_add(&P->X, &P->X, t4); /* t4 = 2*X */ + mp2_add(&P->Z, &P->Z, t0); /* t0 = 2*Z */ + fp2sqr_mont(t4, t1); /* t1 = 4*X^2 */ + mp2_sub_p2(t1, t3, t1); /* t1 = 4*X^2 - (X+Z)^2 */ + mp2_sub_p2(t1, t2, t1); /* t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2 */ + fp2mul_mont(A24plus, t3, t5); /* t5 = A24plus*(X+Z)^2 */ + fp2mul_mont(t3, t5, t3); /* t3 = A24plus*(X+Z)^4 */ + fp2mul_mont(A24minus, t2, t6); /* t6 = A24minus*(X-Z)^2 */ + fp2mul_mont(t2, t6, t2); /* t2 = A24minus*(X-Z)^4 */ + mp2_sub_p2(t2, t3, t3); /* t3 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4 */ + mp2_sub_p2(t5, t6, t2); /* t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2 */ + fp2mul_mont(t1, t2, t1); /* t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] */ + fp2add(t3, t1, t2); /* t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^4 - A24plus*(X+Z)^4 */ + fp2sqr_mont(t2, t2); /* t2 = t2^2 */ + fp2mul_mont(t4, t2, &Q->X); /* X3 = 2*X*t2 */ + fp2sub(t3, t1, t1); /* t1 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] */ + fp2sqr_mont(t1, t1); /* t1 = t1^2 */ + fp2mul_mont(t0, t1, &Q->Z); /* Z3 = 2*Z*t1 */ +} + +/* Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. + * Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. + * Output: projective Montgomery x-coordinates Q <- (3^e)*P. */ +void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus, const int e) +{ + int i; + + copy_words((const digit_t*)P, (digit_t*)Q, 2*2*S2N_SIKE_P434_R3_NWORDS_FIELD); + + for (i = 0; i < e; i++) { + xTPL(Q, Q, A24minus, A24plus); + } +} + +/* Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. + * Input: projective point of order three P = (X3:Z3). + * Output: the 3-isogenous Montgomery curve with projective coefficient A/C. 
*/ +void get_3_isog(const point_proj_t P, f2elm_t *A24minus, f2elm_t *A24plus, f2elm_t *coeff) +{ + f2elm_t _t0, _t1, _t2, _t3, _t4; + f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3, *t4=&_t4; + + mp2_sub_p2(&P->X, &P->Z, &coeff[0]); /* coeff0 = X-Z */ + fp2sqr_mont(&coeff[0], t0); /* t0 = (X-Z)^2 */ + mp2_add(&P->X, &P->Z, &coeff[1]); /* coeff1 = X+Z */ + fp2sqr_mont(&coeff[1], t1); /* t1 = (X+Z)^2 */ + mp2_add(&P->X, &P->X, t3); /* t3 = 2*X */ + fp2sqr_mont(t3, t3); /* t3 = 4*X^2 */ + fp2sub(t3, t0, t2); /* t2 = 4*X^2 - (X-Z)^2 */ + fp2sub(t3, t1, t3); /* t3 = 4*X^2 - (X+Z)^2 */ + mp2_add(t0, t3, t4); /* t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2 */ + mp2_add(t4, t4, t4); /* t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2) */ + mp2_add(t1, t4, t4); /* t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 */ + fp2mul_mont(t2, t4, A24minus); /* A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] */ + mp2_add(t1, t2, t4); /* t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2 */ + mp2_add(t4, t4, t4); /* t4 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2) */ + mp2_add(t0, t4, t4); /* t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2 */ + fp2mul_mont(t3, t4, A24plus); /* A24plus = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] */ +} + +/* Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and + * a point P with 2 coefficients in coeff (computed in the function get_3_isog()). + * Inputs: projective points P = (X3:Z3) and Q = (X:Z). + * Output: the projective point Q <- phi(Q) = (X3:Z3). */ +void eval_3_isog(point_proj_t Q, const f2elm_t *coeff) +{ + f2elm_t _t0, _t1, _t2; + f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2; + + mp2_add(&Q->X, &Q->Z, t0); /* t0 = X+Z */ + mp2_sub_p2(&Q->X, &Q->Z, t1); /* t1 = X-Z */ + fp2mul_mont(&coeff[0], t0, t0); /* t0 = coeff0*(X+Z) */ + fp2mul_mont(&coeff[1], t1, t1); /* t1 = coeff1*(X-Z) */ + mp2_add(t0, t1, t2); /* t2 = coeff0*(X+Z) + coeff1*(X-Z) */ + mp2_sub_p2(t1, t0, t0); /* t0 = coeff1*(X-Z) - coeff0*(X+Z) */ + fp2sqr_mont(t2, t2); /* t2 = [coeff0*(X+Z) + coeff1*(X-Z)]^2 */ + fp2sqr_mont(t0, t0); /* t0 = [coeff1*(X-Z) - coeff0*(X+Z)]^2 */ + fp2mul_mont(&Q->X, t2, &Q->X); /* X3final = X*[coeff0*(X+Z) + coeff1*(X-Z)]^2 */ + fp2mul_mont(&Q->Z, t0, &Q->Z); /* Z3final = Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2 */ +} + +/* 3-way simultaneous inversion + * Input: z1,z2,z3 + * Output: 1/z1,1/z2,1/z3 (override inputs). */ +void inv_3_way(f2elm_t *z1, f2elm_t *z2, f2elm_t *z3) +{ + f2elm_t _t0, _t1, _t2, _t3; + f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3; + + fp2mul_mont(z1, z2, t0); /* t0 = z1*z2 */ + fp2mul_mont(z3, t0, t1); /* t1 = z1*z2*z3 */ + fp2inv_mont(t1); /* t1 = 1/(z1*z2*z3) */ + fp2mul_mont(z3, t1, t2); /* t2 = 1/(z1*z2) */ + fp2mul_mont(t2, z2, t3); /* t3 = 1/z1 */ + fp2mul_mont(t2, z1, z2); /* z2 = 1/z2 */ + fp2mul_mont(t0, t1, z3); /* z3 = 1/z3 */ + fp2copy(t3, z1); /* z1 = 1/z1 */ +} + +/* Given the x-coordinates of P, Q, and R, returns the value A corresponding to the + * Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. + * Input: the x-coordinates xP, xQ, and xR of the points P, Q and R. + * Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x. 
*/ +void get_A(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xR, f2elm_t *A) +{ + f2elm_t _t0, _t1, one = {0}; + f2elm_t *t0=&_t0, *t1=&_t1; + + + fpcopy((const digit_t*)&Montgomery_one,one.e[0]); + fp2add(xP, xQ, t1); /* t1 = xP+xQ */ + fp2mul_mont(xP, xQ, t0); /* t0 = xP*xQ */ + fp2mul_mont(xR, t1, A); /* A = xR*t1 */ + fp2add(t0, A, A); /* A = A+t0 */ + fp2mul_mont(t0, xR, t0); /* t0 = t0*xR */ + fp2sub(A, &one, A); /* A = A-1 */ + fp2add(t0, t0, t0); /* t0 = t0+t0 */ + fp2add(t1, xR, t1); /* t1 = t1+xR */ + fp2add(t0, t0, t0); /* t0 = t0+t0 */ + fp2sqr_mont(A, A); /* A = A^2 */ + fp2inv_mont(t0); /* t0 = 1/t0 */ + fp2mul_mont(A, t0, A); /* A = A*t0 */ + fp2sub(A, t1, A); /* Afinal = A-t1 */ +} + +/* Computes the j-invariant of a Montgomery curve with projective constant. + * Input: A,C in GF(p^2). + * Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery curve + * B*y^2=x^3+(A/C)*x^2+x or (equivalently) j-invariant of B'*y^2=C*x^3+A*x^2+C*x. */ +void j_inv(const f2elm_t *A, const f2elm_t *C, f2elm_t *jinv) +{ + f2elm_t _t0, _t1; + f2elm_t *t0=&_t0, *t1=&_t1; + + fp2sqr_mont(A, jinv); /* jinv = A^2 */ + fp2sqr_mont(C, t1); /* t1 = C^2 */ + fp2add(t1, t1, t0); /* t0 = t1+t1 */ + fp2sub(jinv, t0, t0); /* t0 = jinv-t0 */ + fp2sub(t0, t1, t0); /* t0 = t0-t1 */ + fp2sub(t0, t1, jinv); /* jinv = t0-t1 */ + fp2sqr_mont(t1, t1); /* t1 = t1^2 */ + fp2mul_mont(jinv, t1, jinv); /* jinv = jinv*t1 */ + fp2add(t0, t0, t0); /* t0 = t0+t0 */ + fp2add(t0, t0, t0); /* t0 = t0+t0 */ + fp2sqr_mont(t0, t1); /* t1 = t0^2 */ + fp2mul_mont(t0, t1, t0); /* t0 = t0*t1 */ + fp2add(t0, t0, t0); /* t0 = t0+t0 */ + fp2add(t0, t0, t0); /* t0 = t0+t0 */ + fp2inv_mont(jinv); /* jinv = 1/jinv */ + fp2mul_mont(jinv, t0, jinv); /* jinv = t0*jinv */ +} + +/* Simultaneous doubling and differential addition. + * Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, + * affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4. + * Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, + * and Q <- P+Q = (XQP:ZQP) such that = x(Q+P)=XQP/ZQP. */ +static void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t *xPQ, const f2elm_t *A24) +{ + f2elm_t _t0, _t1, _t2; + f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2; + + mp2_add(&P->X, &P->Z, t0); /* t0 = XP+ZP */ + mp2_sub_p2(&P->X, &P->Z, t1); /* t1 = XP-ZP */ + fp2sqr_mont(t0, &P->X); /* XP = (XP+ZP)^2 */ + mp2_sub_p2(&Q->X, &Q->Z, t2); /* t2 = XQ-ZQ */ + mp2_add(&Q->X, &Q->Z, &Q->X); /* XQ = XQ+ZQ */ + fp2mul_mont(t0, t2, t0); /* t0 = (XP+ZP)*(XQ-ZQ) */ + fp2sqr_mont(t1, &P->Z); /* ZP = (XP-ZP)^2 */ + fp2mul_mont(t1, &Q->X, t1); /* t1 = (XP-ZP)*(XQ+ZQ) */ + mp2_sub_p2(&P->X, &P->Z, t2); /* t2 = (XP+ZP)^2-(XP-ZP)^2 */ + fp2mul_mont(&P->X, &P->Z, &P->X); /* XP = (XP+ZP)^2*(XP-ZP)^2 */ + fp2mul_mont(A24, t2, &Q->X); /* XQ = A24*[(XP+ZP)^2-(XP-ZP)^2] */ + mp2_sub_p2(t0, t1, &Q->Z); /* ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ) */ + mp2_add(&Q->X, &P->Z, &P->Z); /* ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2 */ + mp2_add(t0, t1, &Q->X); /* XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ) */ + fp2mul_mont(&P->Z, t2, &P->Z); /* ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2] */ + fp2sqr_mont(&Q->Z, &Q->Z); /* ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 */ + fp2sqr_mont(&Q->X, &Q->X); /* XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2 */ + fp2mul_mont(&Q->Z, xPQ, &Q->Z); /* ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 */ +} + +/* Swap points. 
+ * If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P */ +static void swap_points(point_proj_t P, point_proj_t Q, const digit_t option) +{ + unsigned int i; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + digit_t temp = option & (P->X.e[0][i] ^ Q->X.e[0][i]); + P->X.e[0][i] = temp ^ P->X.e[0][i]; + Q->X.e[0][i] = temp ^ Q->X.e[0][i]; + temp = option & (P->X.e[1][i] ^ Q->X.e[1][i]); + P->X.e[1][i] = temp ^ P->X.e[1][i]; + Q->X.e[1][i] = temp ^ Q->X.e[1][i]; + temp = option & (P->Z.e[0][i] ^ Q->Z.e[0][i]); + P->Z.e[0][i] = temp ^ P->Z.e[0][i]; + Q->Z.e[0][i] = temp ^ Q->Z.e[0][i]; + temp = option & (P->Z.e[1][i] ^ Q->Z.e[1][i]); + P->Z.e[1][i] = temp ^ P->Z.e[1][i]; + Q->Z.e[1][i] = temp ^ Q->Z.e[1][i]; + } +} + +void LADDER3PT(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xPQ, const digit_t* m, + const unsigned int AliceOrBob, point_proj_t R, const f2elm_t *A) +{ + point_proj_t R0 = {0}, R2 = {0}; + f2elm_t _A24 = {0}; + f2elm_t *A24 = &_A24; + digit_t mask; + int i, nbits, swap, prevbit = 0; + + if (AliceOrBob == S2N_SIKE_P434_R3_ALICE) { + nbits = S2N_SIKE_P434_R3_OALICE_BITS; + } else { + nbits = S2N_SIKE_P434_R3_OBOB_BITS - 1; + } + + /* Initializing constant */ + fpcopy((const digit_t*)&Montgomery_one, A24->e[0]); + mp2_add(A24, A24, A24); + mp2_add(A, A24, A24); + fp2div2(A24, A24); + fp2div2(A24, A24); /* A24 = (A+2)/4 */ + + /* Initializing points */ + fp2copy(xQ, &R0->X); + fpcopy((const digit_t*)&Montgomery_one, (digit_t*)&R0->Z); + fp2copy(xPQ, &R2->X); + fpcopy((const digit_t*)&Montgomery_one, (digit_t*)&R2->Z); + fp2copy(xP, &R->X); + fpcopy((const digit_t*)&Montgomery_one, (digit_t*)&R->Z); + fpzero((digit_t*)(R->Z.e)[1]); + + /* Main loop */ + for (i = 0; i < nbits; i++) { + int bit = (m[i >> S2N_SIKE_P434_R3_LOG2RADIX] >> (i & (S2N_SIKE_P434_R3_RADIX-1))) & 1; + swap = bit ^ prevbit; + prevbit = bit; + mask = 0 - (digit_t)swap; + + swap_points(R, R2, mask); + xDBLADD(R0, R2, &R->X, A24); + fp2mul_mont(&R2->X, &R->Z, &R2->X); + } + swap = 0 ^ prevbit; + mask = 0 - (digit_t)swap; + swap_points(R, R2, mask); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.h new file mode 100644 index 0000000000..44245ec726 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.h @@ -0,0 +1,46 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: elliptic curve and isogeny functions +*********************************************************************************************/ + +#pragma once + +#include "sikep434r3.h" + +#define xDBL S2N_SIKE_P434_R3_NAMESPACE(xDBL) +void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24); + +#define xDBLe S2N_SIKE_P434_R3_NAMESPACE(xDBLe) +void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24, const int e); + +#define get_4_isog S2N_SIKE_P434_R3_NAMESPACE(get_4_isog) +void get_4_isog(const point_proj_t P, f2elm_t *A24plus, f2elm_t *C24, f2elm_t *coeff); + +#define eval_4_isog S2N_SIKE_P434_R3_NAMESPACE(eval_4_isog) +void eval_4_isog(point_proj_t P, f2elm_t* coeff); + +#define xTPL S2N_SIKE_P434_R3_NAMESPACE(xTPL) +void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus); + +#define xTPLe S2N_SIKE_P434_R3_NAMESPACE(xTPLe) +void xTPLe(const 
point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus, const int e); + +#define get_3_isog S2N_SIKE_P434_R3_NAMESPACE(get_3_isog) +void get_3_isog(const point_proj_t P, f2elm_t *A24minus, f2elm_t *A24plus, f2elm_t *coeff); + +#define eval_3_isog S2N_SIKE_P434_R3_NAMESPACE(eval_3_isog) +void eval_3_isog(point_proj_t Q, const f2elm_t *coeff); + +#define inv_3_way S2N_SIKE_P434_R3_NAMESPACE(inv_3_way) +void inv_3_way(f2elm_t *z1, f2elm_t *z2, f2elm_t *z3); + +#define get_A S2N_SIKE_P434_R3_NAMESPACE(get_A) +void get_A(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xR, f2elm_t *A); + +#define j_inv S2N_SIKE_P434_R3_NAMESPACE(j_inv) +void j_inv(const f2elm_t *A, const f2elm_t *C, f2elm_t *jinv); + +#define LADDER3PT S2N_SIKE_P434_R3_NAMESPACE(LADDER3PT) +void LADDER3PT(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xPQ, const digit_t *m, + const unsigned int AliceOrBob, point_proj_t R, const f2elm_t *A); diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.c new file mode 100644 index 0000000000..413cb2b8e4 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.c @@ -0,0 +1,417 @@ +/******************************************************************************************** +* SHA3-derived function SHAKE +* +* Based on the public domain implementation in crypto_hash/keccakc512/simple/ +* from http://bench.cr.yp.to/supercop.html by Ronny Van Keer +* and the public domain "TweetFips202" implementation from https://twitter.com/tweetfips202 +* by Gilles Van Assche, Daniel J. Bernstein, and Peter Schwabe +* +* See NIST Special Publication 800-185 for more information: +* http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-185.pdf +* +*********************************************************************************************/ + +#include <stdint.h> +#include <stddef.h> +#include "sikep434r3.h" +#include "sikep434r3_fips202.h" + +#define NROUNDS 24 +#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset))) + +/************************************************* + * Name: load64 + * + * Description: Load 8 bytes into uint64_t in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns the loaded 64-bit unsigned integer + **************************************************/ +static uint64_t load64(const uint8_t *x) { + uint64_t r = 0; + for (size_t i = 0; i < 8; ++i) { + r |= (uint64_t)x[i] << 8 * i; + } + + return r; +} + +/************************************************* + * Name: store64 + * + * Description: Store a 64-bit integer to a byte array in little-endian order + * + * Arguments: - uint8_t *x: pointer to the output byte array + * - uint64_t u: input 64-bit unsigned integer + **************************************************/ +static void store64(uint8_t *x, uint64_t u) { + for (size_t i = 0; i < 8; ++i) { + x[i] = (uint8_t) (u >> 8 * i); + } +} + +static const uint64_t KeccakF_RoundConstants[NROUNDS] = { + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + 
(uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL, +}; + +static void KeccakF1600_StatePermute(uint64_t * state) +{ + int round; + uint64_t Aba, Abe, Abi, Abo, Abu; + uint64_t Aga, Age, Agi, Ago, Agu; + uint64_t Aka, Ake, Aki, Ako, Aku; + uint64_t Ama, Ame, Ami, Amo, Amu; + uint64_t Asa, Ase, Asi, Aso, Asu; + + /* copyFromState(A, state) */ + Aba = state[ 0]; + Abe = state[ 1]; + Abi = state[ 2]; + Abo = state[ 3]; + Abu = state[ 4]; + Aga = state[ 5]; + Age = state[ 6]; + Agi = state[ 7]; + Ago = state[ 8]; + Agu = state[ 9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for( round = 0; round < NROUNDS; round += 2 ) { + uint64_t BCa, BCe, BCi, BCo, BCu; + uint64_t Da, De, Di, Do, Du; + uint64_t Eba, Ebe, Ebi, Ebo, Ebu; + uint64_t Ega, Ege, Egi, Ego, Egu; + uint64_t Eka, Eke, Eki, Eko, Eku; + uint64_t Ema, Eme, Emi, Emo, Emu; + uint64_t Esa, Ese, Esi, Eso, Esu; + + /* prepareTheta */ + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + /* thetaRhoPiChiIotaPrepareTheta(round , A, E) */ + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= (uint64_t)KeccakF_RoundConstants[round]; + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + /* prepareTheta */ + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = 
Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + /* thetaRhoPiChiIotaPrepareTheta(round+1, E, A) */ + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (uint64_t)KeccakF_RoundConstants[round+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + } + + /* copyToState(state, A) */ + state[ 0] = Aba; + state[ 1] = Abe; + state[ 2] = Abi; + state[ 3] = Abo; + state[ 4] = Abu; + state[ 5] = Aga; + state[ 6] = Age; + state[ 7] = Agi; + state[ 8] = Ago; + state[ 9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; +} + +static void keccak_absorb(uint64_t *s, unsigned int r, const unsigned char *m, unsigned long long int mlen, + unsigned char p) +{ + unsigned long long i; + unsigned char t[200]; + + while (mlen >= r) { + for (i = 0; i < r / 8; ++i) + s[i] ^= load64(m + 8 * i); + + KeccakF1600_StatePermute(s); + mlen -= r; + m += r; + } + + for (i = 0; i < r; ++i) { + t[i] = 0; + } + for (i = 0; i < mlen; ++i) { + t[i] = m[i]; + } + + t[i] = p; + t[r - 1] |= 128; + + for (i = 0; i < r / 8; ++i) { + s[i] ^= load64(t + 8 * i); + } +} + +static void keccak_squeezeblocks(unsigned char *h, unsigned long long int nblocks, uint64_t *s, unsigned int r) +{ + unsigned int i; + + while(nblocks > 0) { + KeccakF1600_StatePermute(s); + for (i = 0; i < (r>>3); i++) { + store64(h+8*i, s[i]); + } + + h += r; + nblocks--; + } +} + +void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen) +{ + uint64_t s[25]; + unsigned char t[SHAKE256_RATE]; + unsigned long long nblocks = outlen / SHAKE256_RATE; + size_t i; + + for (i = 0; i < 25; ++i) { + s[i] = 0; + } + + /* Absorb input */ + keccak_absorb(s, SHAKE256_RATE, input, 
inlen, 0x1F); + + /* Squeeze output */ + keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); + + output += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; + + if (outlen) { + keccak_squeezeblocks(t, 1, s, SHAKE256_RATE); + + for (i = 0; i < outlen; i++) { + output[i] = t[i]; + } + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.h new file mode 100644 index 0000000000..9dd237a491 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.h @@ -0,0 +1,23 @@ +/******************************************************************************************** +* SHA3-derived function SHAKE +* +* Based on the public domain implementation in crypto_hash/keccakc512/simple/ +* from http://bench.cr.yp.to/supercop.html by Ronny Van Keer +* and the public domain "TweetFips202" implementation from https://twitter.com/tweetfips202 +* by Gilles Van Assche, Daniel J. Bernstein, and Peter Schwabe +* +* See NIST Special Publication 800-185 for more information: +* http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-185.pdf +* +*********************************************************************************************/ + +#pragma once + +#include <stdint.h> +#include "sikep434r3.h" + +#define SHAKE128_RATE 168 +#define SHAKE256_RATE 136 + +#define shake256 S2N_SIKE_P434_R3_NAMESPACE(shake256) +void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen); diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.c new file mode 100644 index 0000000000..867ac0f6c1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.c @@ -0,0 +1,297 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: modular arithmetic for P434 +*********************************************************************************************/ + +#include "sikep434r3.h" +#include "pq-crypto/s2n_pq.h" +#include "sikep434r3_fp.h" +#include "sikep434r3_fpx.h" +#include "sikep434r3_fp_x64_asm.h" + +/* Multiprecision subtraction with correction with 2*p, c = a-b+2p. */ +void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + mp_sub434_p2_asm(a, b, c); + return; + } +#endif + + unsigned int i, borrow = 0; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_SUBC(borrow, a[i], b[i], borrow, c[i]); + } + + borrow = 0; + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(borrow, c[i], ((const digit_t*)p434x2)[i], borrow, c[i]); + } +} + +/* Multiprecision subtraction with correction with 4*p, c = a-b+4p. */ +void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + mp_sub434_p4_asm(a, b, c); + return; + } +#endif + + unsigned int i, borrow = 0; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_SUBC(borrow, a[i], b[i], borrow, c[i]); + } + + borrow = 0; + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(borrow, c[i], ((const digit_t*)p434x4)[i], borrow, c[i]); + } +} + +/* Modular addition, c = a+b mod p434. 
+ * Inputs: a, b in [0, 2*p434-1] + * Output: c in [0, 2*p434-1] */ +void fpadd434(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + fpadd434_asm(a, b, c); + return; + } +#endif + unsigned int i, carry = 0; + digit_t mask; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(carry, a[i], b[i], carry, c[i]); + } + + carry = 0; + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_SUBC(carry, c[i], ((const digit_t*)p434x2)[i], carry, c[i]); + } + mask = 0 - (digit_t)carry; + + carry = 0; + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(carry, c[i], ((const digit_t*)p434x2)[i] & mask, carry, c[i]); + } +} + +/* Modular subtraction, c = a-b mod p434. + * Inputs: a, b in [0, 2*p434-1] + * Output: c in [0, 2*p434-1] */ +void fpsub434(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + fpsub434_asm(a, b, c); + return; + } +#endif + + unsigned int i, borrow = 0; + digit_t mask; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_SUBC(borrow, a[i], b[i], borrow, c[i]); + } + mask = 0 - (digit_t)borrow; + + borrow = 0; + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(borrow, c[i], ((const digit_t*)p434x2)[i] & mask, borrow, c[i]); + } +} + +/* Modular negation, a = -a mod p434. + * Input/output: a in [0, 2*p434-1] */ +void fpneg434(digit_t* a) +{ + unsigned int i, borrow = 0; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_SUBC(borrow, ((const digit_t*)p434x2)[i], a[i], borrow, a[i]); + } +} + +/* Modular division by two, c = a/2 mod p434. + * Input : a in [0, 2*p434-1] + * Output: c in [0, 2*p434-1] */ +void fpdiv2_434(const digit_t* a, digit_t* c) +{ + unsigned int i, carry = 0; + digit_t mask; + + mask = 0 - (digit_t)(a[0] & 1); /* If a is odd compute a+p434 */ + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(carry, a[i], ((const digit_t*)p434)[i] & mask, carry, c[i]); + } + + mp_shiftr1(c, S2N_SIKE_P434_R3_NWORDS_FIELD); +} + +/* Modular correction to reduce field element a in [0, 2*p434-1] to [0, p434-1]. 
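+ * The correction is branch-free: p434 is subtracted unconditionally, and if the
+ * subtraction borrows (i.e. a was already below p434) the same p434 is added back
+ * under an all-ones mask derived from the borrow.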
*/ +void fpcorrection434(digit_t* a) +{ + unsigned int i, borrow = 0; + digit_t mask; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_SUBC(borrow, a[i], ((const digit_t*)p434)[i], borrow, a[i]); + } + mask = 0 - (digit_t)borrow; + + borrow = 0; + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(borrow, a[i], ((const digit_t*)p434)[i] & mask, borrow, a[i]); + } +} + +/* Digit multiplication, digit * digit -> 2-digit result */ +void digit_x_digit(const digit_t a, const digit_t b, digit_t* c) +{ + register digit_t al, ah, bl, bh, temp; + digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; + digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4); + + al = a & mask_low; /* Low part */ + ah = a >> (sizeof(digit_t) * 4); /* High part */ + bl = b & mask_low; + bh = b >> (sizeof(digit_t) * 4); + + albl = al*bl; + albh = al*bh; + ahbl = ah*bl; + ahbh = ah*bh; + c[0] = albl & mask_low; /* C00 */ + + res1 = albl >> (sizeof(digit_t) * 4); + res2 = ahbl & mask_low; + res3 = albh & mask_low; + temp = res1 + res2 + res3; + carry = temp >> (sizeof(digit_t) * 4); + c[0] ^= temp << (sizeof(digit_t) * 4); /* C01 */ + + res1 = ahbl >> (sizeof(digit_t) * 4); + res2 = albh >> (sizeof(digit_t) * 4); + res3 = ahbh & mask_low; + temp = res1 + res2 + res3 + carry; + c[1] = temp & mask_low; /* C10 */ + carry = temp & mask_high; + c[1] ^= (ahbh & mask_high) + carry; /* C11 */ +} + +/* Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. */ +void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + S2N_SIKE_P434_R3_UNREFERENCED_PARAMETER(nwords); + mul434_asm(a, b, c); + return; + } +#endif + + unsigned int i, j; + digit_t t = 0, u = 0, v = 0, UV[2]; + unsigned int carry; + + for (i = 0; i < nwords; i++) { + for (j = 0; j <= i; j++) { + S2N_SIKE_P434_R3_MUL(a[j], b[i-j], UV+1, UV[0]); + S2N_SIKE_P434_R3_ADDC(0, UV[0], v, carry, v); + S2N_SIKE_P434_R3_ADDC(carry, UV[1], u, carry, u); + t += carry; + } + c[i] = v; + v = u; + u = t; + t = 0; + } + + for (i = nwords; i < 2*nwords-1; i++) { + for (j = i-nwords+1; j < nwords; j++) { + S2N_SIKE_P434_R3_MUL(a[j], b[i-j], UV+1, UV[0]); + S2N_SIKE_P434_R3_ADDC(0, UV[0], v, carry, v); + S2N_SIKE_P434_R3_ADDC(carry, UV[1], u, carry, u); + t += carry; + } + c[i] = v; + v = u; + u = t; + t = 0; + } + c[2*nwords-1] = v; +} + +/* Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. + * mc = ma*R^-1 mod p434x2, where R = 2^448. + * If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. + * ma is assumed to be in Montgomery representation. 
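+ * The speed-up comes from the low words of p434+1 being zero (see asm_p434p1 in
+ * sikep434r3_fp_x64_asm.S): the inner loops below skip the partial products that
+ * would involve those zero words and only multiply by the upper words of p434p1.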
*/ +void rdc_mont(digit_t* ma, digit_t* mc) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + rdc434_asm(ma, mc); + return; + } +#endif + + unsigned int i, j, carry, count = S2N_SIKE_P434_R3_ZERO_WORDS; + digit_t UV[2], t = 0, u = 0, v = 0; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + mc[i] = 0; + } + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + for (j = 0; j < i; j++) { + if (j < (i-S2N_SIKE_P434_R3_ZERO_WORDS+1)) { + S2N_SIKE_P434_R3_MUL(mc[j], ((const digit_t*)p434p1)[i-j], UV+1, UV[0]); + S2N_SIKE_P434_R3_ADDC(0, UV[0], v, carry, v); + S2N_SIKE_P434_R3_ADDC(carry, UV[1], u, carry, u); + t += carry; + } + } + S2N_SIKE_P434_R3_ADDC(0, v, ma[i], carry, v); + S2N_SIKE_P434_R3_ADDC(carry, u, 0, carry, u); + t += carry; + mc[i] = v; + v = u; + u = t; + t = 0; + } + + for (i = S2N_SIKE_P434_R3_NWORDS_FIELD; i < 2*S2N_SIKE_P434_R3_NWORDS_FIELD-1; i++) { + if (count > 0) { + count -= 1; + } + for (j = i-S2N_SIKE_P434_R3_NWORDS_FIELD+1; j < S2N_SIKE_P434_R3_NWORDS_FIELD; j++) { + if (j < (S2N_SIKE_P434_R3_NWORDS_FIELD-count)) { + S2N_SIKE_P434_R3_MUL(mc[j], ((const digit_t*)p434p1)[i-j], UV+1, UV[0]); + S2N_SIKE_P434_R3_ADDC(0, UV[0], v, carry, v); + S2N_SIKE_P434_R3_ADDC(carry, UV[1], u, carry, u); + t += carry; + } + } + S2N_SIKE_P434_R3_ADDC(0, v, ma[i], carry, v); + S2N_SIKE_P434_R3_ADDC(carry, u, 0, carry, u); + t += carry; + mc[i-S2N_SIKE_P434_R3_NWORDS_FIELD] = v; + v = u; + u = t; + t = 0; + } + + /* `carry` isn't read after this, but it's still a necessary argument to the macro */ + /* cppcheck-suppress unreadVariable */ + S2N_SIKE_P434_R3_ADDC(0, v, ma[2*S2N_SIKE_P434_R3_NWORDS_FIELD-1], carry, v); + mc[S2N_SIKE_P434_R3_NWORDS_FIELD-1] = v; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.h new file mode 100644 index 0000000000..7844ba0457 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.h @@ -0,0 +1,39 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: modular arithmetic for P434 +*********************************************************************************************/ + +#pragma once + +#include "sikep434r3.h" + +#define mp_sub434_p2 S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p2) +void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c); + +#define mp_sub434_p4 S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p4) +void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c); + +#define fpadd434 S2N_SIKE_P434_R3_NAMESPACE(fpadd434) +void fpadd434(const digit_t* a, const digit_t* b, digit_t* c); + +#define fpsub434 S2N_SIKE_P434_R3_NAMESPACE(fpsub434) +void fpsub434(const digit_t* a, const digit_t* b, digit_t* c); + +#define fpneg434 S2N_SIKE_P434_R3_NAMESPACE(fpneg434) +void fpneg434(digit_t* a); + +#define fpdiv2_434 S2N_SIKE_P434_R3_NAMESPACE(fpdiv2_434) +void fpdiv2_434(const digit_t* a, digit_t* c); + +#define fpcorrection434 S2N_SIKE_P434_R3_NAMESPACE(fpcorrection434) +void fpcorrection434(digit_t* a); + +#define digit_x_digit S2N_SIKE_P434_R3_NAMESPACE(digit_x_digit) +void digit_x_digit(const digit_t a, const digit_t b, digit_t* c); + +#define mp_mul S2N_SIKE_P434_R3_NAMESPACE(mp_mul) +void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); + +#define rdc_mont S2N_SIKE_P434_R3_NAMESPACE(rdc_mont) +void rdc_mont(digit_t* ma, digit_t* mc); diff --git 
a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.S b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.S new file mode 100644 index 0000000000..1814a8b25a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.S @@ -0,0 +1,1054 @@ +//******************************************************************************************* +// Supersingular Isogeny Key Encapsulation Library +// +// Abstract: field arithmetic in x64 assembly for P434 on Linux +//******************************************************************************************* + +/* Requires bmi2 instruction set for mulx. adx instructions are optional, but preferred. */ +.intel_syntax noprefix + +#define S2N_SIKE_P434_R3_NAMESPACE(s) s2n_sike_p434_r3_##s + +// Registers that are used for parameter passing: +#define reg_p1 rdi +#define reg_p2 rsi +#define reg_p3 rdx + +// Define addition instructions +#ifdef S2N_ADX + +#define ADD1 adox +#define ADC1 adox +#define ADD2 adcx +#define ADC2 adcx + +#else + +#define ADD1 add +#define ADC1 adc +#define ADD2 add +#define ADC2 adc + +#endif + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +.text + +#define asm_p434 S2N_SIKE_P434_R3_NAMESPACE(asm_p434) +.align 32 +.type asm_p434, @object +.size asm_p434, 56 +asm_p434: +.quad -1 +.quad -1 +.quad -1 +.quad -161717841442111489 +.quad 8918917783347572387 +.quad 7853257225132122198 +.quad 620258357900100 + + +#define asm_p434x2 S2N_SIKE_P434_R3_NAMESPACE(asm_p434x2) +.align 32 +.type asm_p434x2, @object +.size asm_p434x2, 56 +asm_p434x2: +.quad -2 +.quad -1 +.quad -1 +.quad -323435682884222977 +.quad -608908507014406841 +.quad -2740229623445307220 +.quad 1240516715800200 + + +#define asm_p434x4 S2N_SIKE_P434_R3_NAMESPACE(asm_p434x4) +.align 32 +.type asm_p434x4, @object +.size asm_p434x4, 56 +asm_p434x4: +.quad -4 +.quad -1 +.quad -1 +.quad -646871365768445953 +.quad -1217817014028813681 +.quad -5480459246890614439 +.quad 2481033431600401 + + +#define asm_p434p1 S2N_SIKE_P434_R3_NAMESPACE(asm_p434p1) +.align 32 +.type asm_p434p1, @object +.size asm_p434p1, 56 +asm_p434p1: +.quad 0 +.quad 0 +.quad 0 +.quad -161717841442111488 +.quad 8918917783347572387 +.quad 7853257225132122198 +.quad 620258357900100 + +//*********************************************************************** +// Field addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +#define fpadd434_asm S2N_SIKE_P434_R3_NAMESPACE(fpadd434_asm) +.global fpadd434_asm +fpadd434_asm: + push r12 + push r13 + push r14 + push r15 + push rbx + push rbp + + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc r12, [reg_p2+32] + adc r13, [reg_p2+40] + adc r14, [reg_p2+48] + + mov rbx, [rip+asm_p434x2] + sub r8, rbx + mov rcx, [rip+asm_p434x2+8] + sbb r9, rcx + sbb r10, rcx + mov rdi, [rip+asm_p434x2+24] + sbb r11, rdi + mov rsi, [rip+asm_p434x2+32] + sbb r12, rsi + mov rbp, [rip+asm_p434x2+40] + sbb r13, rbp + mov r15, [rip+asm_p434x2+48] + sbb r14, r15 + sbb rax, 0 + + and rbx, rax + and rcx, rax + and rdi, rax + and rsi, rax + and rbp, rax + and r15, rax + + add r8, rbx + adc r9, rcx + adc r10, rcx + adc r11, rdi + adc r12, rsi + adc r13, rbp + adc r14, r15 + mov [reg_p3], r8 + mov [reg_p3+8], r9 + 
mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + +//*********************************************************************** +// Field subtraction +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] +//*********************************************************************** +#define fpsub434_asm S2N_SIKE_P434_R3_NAMESPACE(fpsub434_asm) +.global fpsub434_asm +fpsub434_asm: + push r12 + push r13 + push r14 + + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + sbb rax, 0 + + mov rcx, [rip+asm_p434x2] + mov rdi, [rip+asm_p434x2+8] + mov rsi, [rip+asm_p434x2+24] + and rcx, rax + and rdi, rax + and rsi, rax + add r8, rcx + adc r9, rdi + adc r10, rdi + adc r11, rsi + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + setc cl + + mov r8, [rip+asm_p434x2+32] + mov rdi, [rip+asm_p434x2+40] + mov rsi, [rip+asm_p434x2+48] + and r8, rax + and rdi, rax + and rsi, rax + bt rcx, 0 + adc r12, r8 + adc r13, rdi + adc r14, rsi + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + + pop r14 + pop r13 + pop r12 + ret + +///////////////////////////////////////////////////////////////// MACRO +.macro SUB434_PX P0 + push r12 + push r13 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov rcx, [reg_p1+48] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb rcx, [reg_p2+48] + + mov rax, [rip+\P0] + mov rdi, [rip+\P0+8] + mov rsi, [rip+\P0+24] + add r8, rax + mov rax, [rip+\P0+32] + adc r9, rdi + adc r10, rdi + adc r11, rsi + mov rdi, [rip+\P0+40] + mov rsi, [rip+\P0+48] + adc r12, rax + adc r13, rdi + adc rcx, rsi + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], rcx + + pop r13 + pop r12 +.endm + +//*********************************************************************** +// Multiprecision subtraction with correction with 2*p434 +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p434 +//*********************************************************************** +#define mp_sub434_p2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p2_asm) +.global mp_sub434_p2_asm +mp_sub434_p2_asm: + SUB434_PX asm_p434x2 + ret + +//*********************************************************************** +// Multiprecision subtraction with correction with 4*p434 +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p434 +//*********************************************************************** +#define mp_sub434_p4_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p4_asm) +.global mp_sub434_p4_asm +mp_sub434_p4_asm: + SUB434_PX asm_p434x4 + ret + +///////////////////////////////////////////////////////////////// MACRO +// Schoolbook integer multiplication +// Inputs: memory pointers M0 and M1 +// Outputs: memory pointer C and regs T1, T3, rax +// Temps: regs T0:T6 +///////////////////////////////////////////////////////////////// +#ifdef S2N_ADX + +.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6 + mov rdx, \M0 + mulx 
\T0, \T1, \M1 // T0:T1 = A0*B0 + mov \C, \T1 // C0_final + mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 + xor rax, rax + adox \T0, \T2 + mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 + adox \T1, \T3 + + mov rdx, 8\M0 + mulx \T3, \T4, \M1 // T3:T4 = A1*B0 + adox \T2, rax + xor rax, rax + mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1 + adox \T4, \T0 + mov 8\C, \T4 // C1_final + adcx \T3, \T6 + mulx \T6, \T0, 16\M1 // T6:T0 = A1*B2 + adox \T3, \T1 + adcx \T5, \T0 + adcx \T6, rax + adox \T5, \T2 + + mov rdx, 16\M0 + mulx \T1, \T0, \M1 // T1:T0 = A2*B0 + adox \T6, rax + xor rax, rax + mulx \T4, \T2, 8\M1 // T4:T2 = A2*B1 + adox \T0, \T3 + mov 16\C, \T0 // C2_final + adcx \T1, \T5 + mulx \T0, \T3, 16\M1 // T0:T3 = A2*B2 + adcx \T4, \T6 + adcx \T0, rax + adox \T1, \T2 + adox \T3, \T4 + adox rax, \T0 +.endm + +///////////////////////////////////////////////////////////////// MACRO +// Schoolbook integer multiplication +// Inputs: memory pointers M0 and M1 +// Outputs: memory pointer C +// Temps: regs T0:T9 +///////////////////////////////////////////////////////////////// +.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 + mov rdx, \M0 + mulx \T0, \T1, \M1 // T0:T1 = A0*B0 + mov \C, \T1 // C0_final + mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 + xor rax, rax + adox \T0, \T2 + mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 + adox \T1, \T3 + mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 + adox \T2, \T4 + + mov rdx, 8\M0 + mulx \T5, \T4, \M1 // T5:T4 = A1*B0 + adox \T3, rax + xor rax, rax + mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 + adox \T4, \T0 + mov 8\C, \T4 // C1_final + adcx \T5, \T7 + mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 + adcx \T6, \T8 + adox \T5, \T1 + mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 + adcx \T7, \T9 + adcx \T8, rax + adox \T6, \T2 + + mov rdx, 16\M0 + mulx \T1, \T0, \M1 // T1:T0 = A2*B0 + adox \T7, \T3 + adox \T8, rax + xor rax, rax + mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 + adox \T0, \T5 + mov 16\C, \T0 // C2_final + adcx \T1, \T3 + mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 + adcx \T2, \T4 + adox \T1, \T6 + mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 + adcx \T3, \T9 + mov rdx, 24\M0 + adcx \T4, rax + + adox \T2, \T7 + adox \T3, \T8 + adox \T4, rax + + mulx \T5, \T0, \M1 // T5:T0 = A3*B0 + xor rax, rax + mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 + adcx \T5, \T7 + adox \T1, \T0 + mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 + adcx \T6, \T8 + adox \T2, \T5 + mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 + adcx \T7, \T9 + adcx \T8, rax + + adox \T3, \T6 + adox \T4, \T7 + adox \T8, rax + mov 24\C, \T1 // C3_final + mov 32\C, \T2 // C4_final + mov 40\C, \T3 // C5_final + mov 48\C, \T4 // C6_final + mov 56\C, \T8 // C7_final +.endm + +#else // S2N_ADX + +.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6 + mov rdx, \M0 + mulx \T0, \T1, \M1 // T0:T1 = A0*B0 + mov \C, \T1 // C0_final + mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 + add \T0, \T2 + mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 + adc \T1, \T3 + + mov rdx, 8\M0 + mulx \T3, \T4, \M1 // T3:T4 = A1*B0 + adc \T2, 0 + mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1 + add \T4, \T0 + mov 8\C, \T4 // C1_final + adc \T3, \T1 + adc \T5, \T2 + mulx \T2, \T1, 16\M1 // T2:T1 = A1*B2 + adc \T2, 0 + + add \T3, \T6 + adc \T5, \T1 + adc \T2, 0 + + mov rdx, 16\M0 + mulx \T1, \T0, \M1 // T1:T0 = A2*B0 + add \T0, \T3 + mov 16\C, \T0 // C2_final + mulx \T4, \T6, 8\M1 // T4:T6 = A2*B1 + adc \T1, \T5 + adc \T2, \T4 + mulx rax, \T3, 16\M1 // rax:T3 = A2*B2 + adc rax, 0 + add \T1, \T6 + adc \T3, \T2 + adc rax, 0 +.endm + +.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 + mov rdx, \M0 + mulx \T0, \T1, \M1 // T0:T1 
= A0*B0 + mov \C, \T1 // C0_final + mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 + add \T0, \T2 + mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 + adc \T1, \T3 + mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 + adc \T2, \T4 + mov rdx, 8\M0 + adc \T3, 0 + + mulx \T5, \T4, \M1 // T5:T4 = A1*B0 + mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 + add \T5, \T7 + mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 + adc \T6, \T8 + mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 + adc \T7, \T9 + adc \T8, 0 + + add \T4, \T0 + mov 8\C, \T4 // C1_final + adc \T5, \T1 + adc \T6, \T2 + adc \T7, \T3 + mov rdx, 16\M0 + adc \T8, 0 + + mulx \T1, \T0, \M1 // T1:T0 = A2*B0 + mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 + add \T1, \T3 + mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 + adc \T2, \T4 + mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 + adc \T3, \T9 + mov rdx, 24\M0 + adc \T4, 0 + + add \T0, \T5 + mov 16\C, \T0 // C2_final + adc \T1, \T6 + adc \T2, \T7 + adc \T3, \T8 + adc \T4, 0 + + mulx \T5, \T0, \M1 // T5:T0 = A3*B0 + mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 + add \T5, \T7 + mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 + adc \T6, \T8 + mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 + adc \T7, \T9 + adc \T8, 0 + + add \T1, \T0 + mov 24\C, \T1 // C3_final + adc \T2, \T5 + mov 32\C, \T2 // C4_final + adc \T3, \T6 + mov 40\C, \T3 // C5_final + adc \T4, \T7 + mov 48\C, \T4 // C6_final + adc \T8, 0 + mov 56\C, \T8 // C7_final +.endm + +#endif // S2N_ADX + +//***************************************************************************** +// 434-bit multiplication using Karatsuba (one level), schoolbook (one level) +//***************************************************************************** +#define mul434_asm S2N_SIKE_P434_R3_NAMESPACE(mul434_asm) +.global mul434_asm +mul434_asm: + push r12 + push r13 + push r14 + push r15 + mov rcx, reg_p3 + + // r8-r11 <- AH + AL, rax <- mask + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + push rbx + push rbp + sub rsp, 96 + add r8, [reg_p1+32] + adc r9, [reg_p1+40] + adc r10, [reg_p1+48] + adc r11, 0 + sbb rax, 0 + mov [rsp], r8 + mov [rsp+8], r9 + mov [rsp+16], r10 + mov [rsp+24], r11 + + // r12-r15 <- BH + BL, rbx <- mask + xor rbx, rbx + mov r12, [reg_p2] + mov r13, [reg_p2+8] + mov r14, [reg_p2+16] + mov r15, [reg_p2+24] + add r12, [reg_p2+32] + adc r13, [reg_p2+40] + adc r14, [reg_p2+48] + adc r15, 0 + sbb rbx, 0 + mov [rsp+32], r12 + mov [rsp+40], r13 + mov [rsp+48], r14 + mov [rsp+56], r15 + + // r12-r15 <- masked (BH + BL) + and r12, rax + and r13, rax + and r14, rax + and r15, rax + + // r8-r11 <- masked (AH + AL) + and r8, rbx + and r9, rbx + and r10, rbx + and r11, rbx + + // r8-r11 <- masked (AH + AL) + masked (AH + AL) + add r8, r12 + adc r9, r13 + adc r10, r14 + adc r11, r15 + mov [rsp+64], r8 + mov [rsp+72], r9 + mov [rsp+80], r10 + mov [rsp+88], r11 + + // [rsp] <- (AH+AL) x (BH+BL), low part + MUL256_SCHOOL [rsp], [rsp+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp + + // [rcx] <- AL x BL + MUL256_SCHOOL [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp // Result C0-C3 + + // [rcx+64], rbx, rbp, rax <- AH x BH + MUL192_SCHOOL [reg_p1+32], [reg_p2+32], [rcx+64], r8, rbx, r10, rbp, r12, r13, r14 + + // r8-r11 <- (AH+AL) x (BH+BL), final step + mov r8, [rsp+64] + mov r9, [rsp+72] + mov r10, [rsp+80] + mov r11, [rsp+88] + mov rdx, [rsp+32] + add r8, rdx + mov rdx, [rsp+40] + adc r9, rdx + mov rdx, [rsp+48] + adc r10, rdx + mov rdx, [rsp+56] + adc r11, rdx + + // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL + mov r12, [rsp] + mov r13, [rsp+8] + mov r14, [rsp+16] + mov r15, 
[rsp+24] + sub r12, [rcx] + sbb r13, [rcx+8] + sbb r14, [rcx+16] + sbb r15, [rcx+24] + sbb r8, [rcx+32] + sbb r9, [rcx+40] + sbb r10, [rcx+48] + sbb r11, [rcx+56] + + // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + sub r12, [rcx+64] + sbb r13, [rcx+72] + sbb r14, [rcx+80] + sbb r15, rbx + sbb r8, rbp + sbb r9, rax + sbb r10, 0 + sbb r11, 0 + + add r12, [rcx+32] + mov [rcx+32], r12 // Result C4-C7 + adc r13, [rcx+40] + mov [rcx+40], r13 + adc r14, [rcx+48] + mov [rcx+48], r14 + adc r15, [rcx+56] + mov [rcx+56], r15 + adc r8, [rcx+64] + mov [rcx+64], r8 // Result C8-C15 + adc r9, [rcx+72] + mov [rcx+72], r9 + adc r10, [rcx+80] + mov [rcx+80], r10 + adc r11, rbx + mov [rcx+88], r11 + adc rbp, 0 + mov [rcx+96], rbp + adc rax, 0 + mov [rcx+104], rax + + add rsp, 96 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + +///////////////////////////////////////////////////////////////// MACRO +// Schoolbook integer multiplication +// Inputs: reg I0 and memory pointer M1 +// Outputs: regs T0:T4 +// Temps: regs T0:T5 +///////////////////////////////////////////////////////////////// +.macro MUL64x256_SCHOOL I0, M1, T0, T1, T2, T3, T4, T5 + mulx \T2, \T4, 8\M1 + xor rax, rax + mulx \T3, \T5, 16\M1 + ADD1 \T1, \T4 // T1 <- C1_final + ADC1 \T2, \T5 // T2 <- C2_final + mulx \T4, \T5, 24\M1 + ADC1 \T3, \T5 // T3 <- C3_final + ADC1 \T4, rax // T4 <- C4_final +.endm + +///////////////////////////////////////////////////////////////// MACRO +// Schoolbook integer multiplication +// Inputs: regs I0 and I1, and memory pointer M1 +// Outputs: regs T0:T5 +// Temps: regs T0:T5 +///////////////////////////////////////////////////////////////// +#ifdef S2N_ADX + +.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5 + mulx \T2, \T4, 8\M1 + xor rax, rax + mulx \T3, \T5, 16\M1 + ADD1 \T1, \T4 + ADC1 \T2, \T5 + mulx \T4, \T5, 24\M1 + ADC1 \T3, \T5 + ADC1 \T4, rax + + xor rax, rax + mov rdx, \I1 + mulx \I1, \T5, \M1 + ADD2 \T1, \T5 // T1 <- C1_final + ADC2 \T2, \I1 + mulx \T5, \I1, 8\M1 + ADC2 \T3, \T5 + ADD1 \T2, \I1 + mulx \T5, \I1, 16\M1 + ADC2 \T4, \T5 + ADC1 \T3, \I1 + mulx \T5, \I1, 24\M1 + ADC2 \T5, rax + ADC1 \T4, \I1 + ADC1 \T5, rax +.endm + +#else // S2N_ADX + +.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5 + mulx \T2, \T4, 8\M1 + mulx \T3, \T5, 16\M1 + add \T1, \T4 + adc \T2, \T5 + mulx \T4, \T5, 24\M1 + adc \T3, \T5 + adc \T4, 0 + + mov rdx, \I1 + mulx \I1, \T5, \M1 + add \T1, \T5 // T1 <- C1_final + adc \T2, \I1 + mulx \T5, \I1, 8\M1 + adc \T3, \T5 + mulx \T5, rax, 16\M1 + adc \T4, \T5 + mulx \T5, rdx, 24\M1 + adc \T5, 0 + add \T2, \I1 + adc \T3, rax + adc \T4, rdx + adc \T5, 0 +.endm + +#endif // S2N_ADX + +//************************************************************************************** +// Montgomery reduction +// Based on method described in Faz-Hernandez et al. 
https://eprint.iacr.org/2017/1015 +// Operation: c [reg_p2] = a [reg_p1] +//************************************************************************************** +#define rdc434_asm S2N_SIKE_P434_R3_NAMESPACE(rdc434_asm) +.global rdc434_asm +rdc434_asm: + push r14 + + // a[0-1] x p434p1_nz --> result: r8:r13 + mov rdx, [reg_p1] + mov r14, [reg_p1+8] + mulx r9, r8, [rip+asm_p434p1+24] // result r8 + push r12 + push r13 + push r15 + push rbp + push rbx + MUL128x256_SCHOOL rdx, r14, [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13 + + mov rdx, [reg_p1+16] + mov rcx, [reg_p1+72] + add r8, [reg_p1+24] + adc r9, [reg_p1+32] + adc r10, [reg_p1+40] + adc r11, [reg_p1+48] + adc r12, [reg_p1+56] + adc r13, [reg_p1+64] + adc rcx, 0 + mulx rbp, rbx, [rip+asm_p434p1+24] // result rbx + mov [reg_p2], r9 + mov [reg_p2+8], r10 + mov [reg_p2+16], r11 + mov [reg_p2+24], r12 + mov [reg_p2+32], r13 + mov r9, [reg_p1+80] + mov r10, [reg_p1+88] + mov r11, [reg_p1+96] + mov rdi, [reg_p1+104] + adc r9, 0 + adc r10, 0 + adc r11, 0 + adc rdi, 0 + + // a[2-3] x p434p1_nz --> result: rbx, rbp, r12:r15 + MUL128x256_SCHOOL rdx, r8, [rip+asm_p434p1+24], rbx, rbp, r12, r13, r14, r15 + + mov rdx, [reg_p2] + add rbx, [reg_p2+8] + adc rbp, [reg_p2+16] + adc r12, [reg_p2+24] + adc r13, [reg_p2+32] + adc r14, rcx + mov rcx, 0 + adc r15, r9 + adc rcx, r10 + mulx r9, r8, [rip+asm_p434p1+24] // result r8 + mov [reg_p2], rbp + mov [reg_p2+8], r12 + mov [reg_p2+16], r13 + adc r11, 0 + adc rdi, 0 + + // a[4-5] x p434p1_nz --> result: r8:r13 + MUL128x256_SCHOOL rdx, rbx, [rip+asm_p434p1+24], r8, r9, r10, rbp, r12, r13 + + mov rdx, [reg_p2] + add r8, [reg_p2+8] + adc r9, [reg_p2+16] + adc r10, r14 + adc rbp, r15 + adc r12, rcx + adc r13, r11 + adc rdi, 0 + mulx r15, r14, [rip+asm_p434p1+24] // result r14 + mov [reg_p2], r8 // Final result c0-c1 + mov [reg_p2+8], r9 + + // a[6-7] x p434p1_nz --> result: r14:r15, r8:r9, r11 + MUL64x256_SCHOOL rdx, [rip+asm_p434p1+24], r14, r15, r8, r9, r11, rcx + + // Final result c2:c6 + add r14, r10 + adc r15, rbp + pop rbx + pop rbp + adc r8, r12 + adc r9, r13 + adc r11, rdi + mov [reg_p2+16], r14 + mov [reg_p2+24], r15 + pop r15 + pop r13 + mov [reg_p2+32], r8 + mov [reg_p2+40], r9 + mov [reg_p2+48], r11 + + pop r12 + pop r14 + ret + +//*********************************************************************** +// 434-bit multiprecision addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +#define mp_add434_asm S2N_SIKE_P434_R3_NAMESPACE(mp_add434_asm) +.global mp_add434_asm +mp_add434_asm: + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + + mov r8, [reg_p1+32] + mov r9, [reg_p1+40] + mov r10, [reg_p1+48] + adc r8, [reg_p2+32] + adc r9, [reg_p2+40] + adc r10, [reg_p2+48] + mov [reg_p3+32], r8 + mov [reg_p3+40], r9 + mov [reg_p3+48], r10 + ret + +//*************************************************************************** +// 2x434-bit multiprecision subtraction/addition +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. 
If c < 0, add p434*2^448 +//*************************************************************************** +#define mp_subadd434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_subadd434x2_asm) +.global mp_subadd434x2_asm +mp_subadd434x2_asm: + push r12 + push r13 + push r14 + push r15 + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + + mov r8, [reg_p1+40] + mov r9, [reg_p1+48] + mov r10, [reg_p1+56] + mov r11, [reg_p1+64] + mov r12, [reg_p1+72] + sbb r8, [reg_p2+40] + sbb r9, [reg_p2+48] + sbb r10, [reg_p2+56] + sbb r11, [reg_p2+64] + sbb r12, [reg_p2+72] + mov [reg_p3+40], r8 + mov [reg_p3+48], r9 + mov [reg_p3+56], r10 + + mov r13, [reg_p1+80] + mov r14, [reg_p1+88] + mov r15, [reg_p1+96] + mov rcx, [reg_p1+104] + sbb r13, [reg_p2+80] + sbb r14, [reg_p2+88] + sbb r15, [reg_p2+96] + sbb rcx, [reg_p2+104] + sbb rax, 0 + + // Add p434 anded with the mask in rax + mov r8, [rip+asm_p434] + mov r9, [rip+asm_p434+24] + mov r10, [rip+asm_p434+32] + mov rdi, [rip+asm_p434+40] + mov rsi, [rip+asm_p434+48] + and r8, rax + and r9, rax + and r10, rax + and rdi, rax + and rsi, rax + mov rax, [reg_p3+56] + add rax, r8 + adc r11, r8 + adc r12, r8 + adc r13, r9 + adc r14, r10 + adc r15, rdi + adc rcx, rsi + + mov [reg_p3+56], rax + mov [reg_p3+64], r11 + mov [reg_p3+72], r12 + mov [reg_p3+80], r13 + mov [reg_p3+88], r14 + mov [reg_p3+96], r15 + mov [reg_p3+104], rcx + pop r15 + pop r14 + pop r13 + pop r12 + ret + +//*********************************************************************** +// Double 2x434-bit multiprecision subtraction +// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2] +//*********************************************************************** +#define mp_dblsub434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_dblsub434x2_asm) +.global mp_dblsub434x2_asm +mp_dblsub434x2_asm: + push r12 + push r13 + push r14 + + mov r8, [reg_p3] + mov r9, [reg_p3+8] + mov r10, [reg_p3+16] + mov r11, [reg_p3+24] + mov r12, [reg_p3+32] + mov r13, [reg_p3+40] + mov r14, [reg_p3+48] + sub r8, [reg_p1] + sbb r9, [reg_p1+8] + sbb r10, [reg_p1+16] + sbb r11, [reg_p1+24] + sbb r12, [reg_p1+32] + sbb r13, [reg_p1+40] + sbb r14, [reg_p1+48] + setc al + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + setc cl + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + + mov r8, [reg_p3+56] + mov r9, [reg_p3+64] + mov r10, [reg_p3+72] + mov r11, [reg_p3+80] + mov r12, [reg_p3+88] + mov r13, [reg_p3+96] + mov r14, [reg_p3+104] + bt rax, 0 + sbb r8, [reg_p1+56] + sbb r9, [reg_p1+64] + sbb r10, [reg_p1+72] + sbb r11, [reg_p1+80] + sbb r12, [reg_p1+88] + sbb r13, [reg_p1+96] + sbb r14, [reg_p1+104] + bt rcx, 0 + sbb r8, [reg_p2+56] + sbb r9, [reg_p2+64] + sbb r10, [reg_p2+72] + sbb r11, [reg_p2+80] + sbb r12, [reg_p2+88] + sbb r13, [reg_p2+96] + sbb r14, [reg_p2+104] + mov [reg_p3+56], r8 + mov [reg_p3+64], r9 + mov [reg_p3+72], r10 + mov [reg_p3+80], r11 + mov [reg_p3+88], r12 + mov [reg_p3+96], r13 + mov [reg_p3+104], r14 + + pop r14 + pop r13 + pop r12 + ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.h 
b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.h new file mode 100644 index 0000000000..1753e25fb4 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.h @@ -0,0 +1,38 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: x86_64 assembly optimized modular arithmetic for P434 +*********************************************************************************************/ + +#pragma once + +#if defined(S2N_SIKE_P434_R3_ASM) + +#define fpadd434_asm S2N_SIKE_P434_R3_NAMESPACE(fpadd434_asm) +void fpadd434_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define fpsub434_asm S2N_SIKE_P434_R3_NAMESPACE(fpsub434_asm) +void fpsub434_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define mul434_asm S2N_SIKE_P434_R3_NAMESPACE(mul434_asm) +void mul434_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define rdc434_asm S2N_SIKE_P434_R3_NAMESPACE(rdc434_asm) +void rdc434_asm(digit_t* ma, digit_t* mc); + +#define mp_add434_asm S2N_SIKE_P434_R3_NAMESPACE(mp_add434_asm) +void mp_add434_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define mp_subadd434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_subadd434x2_asm) +void mp_subadd434x2_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define mp_dblsub434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_dblsub434x2_asm) +void mp_dblsub434x2_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define mp_sub434_p2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p2_asm) +void mp_sub434_p2_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define mp_sub434_p4_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p4_asm) +void mp_sub434_p4_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.c new file mode 100644 index 0000000000..40c61144e4 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.c @@ -0,0 +1,478 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: core functions over GF(p) and GF(p^2) +*********************************************************************************************/ + +#include <string.h> +#include "sikep434r3.h" +#include "sikep434r3_fp.h" +#include "sikep434r3_fpx.h" +#include "pq-crypto/s2n_pq.h" +#include "sikep434r3_fp_x64_asm.h" + +static void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc); +static void to_mont(const felm_t a, felm_t mc); +static void from_mont(const felm_t ma, felm_t c); +static void fpsqr_mont(const felm_t ma, felm_t mc); +static unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); +static void fpinv_chain_mont(felm_t a); +static void fpinv_mont(felm_t a); +static void to_fp2mont(const f2elm_t *a, f2elm_t *mc); +static void from_fp2mont(const f2elm_t *ma, f2elm_t *c); + +/* Encoding digits to bytes according to endianness */ +__inline static void encode_to_bytes(const digit_t* x, unsigned char* enc, int nbytes) +{ + if (is_big_endian()) { + int ndigits = nbytes / sizeof(digit_t); + int rem = nbytes % sizeof(digit_t); + + for (int i = 0; i < ndigits; i++) { + digit_t temp = S2N_SIKE_P434_R3_BSWAP_DIGIT(x[i]); + memcpy(enc + (i * sizeof(digit_t)), (unsigned char *)&temp, sizeof(digit_t)); + } + + if (rem) { + 
digit_t ld = S2N_SIKE_P434_R3_BSWAP_DIGIT(x[ndigits]); + memcpy(enc + ndigits * sizeof(digit_t), (unsigned char *) &ld, rem); + } + } else { + memcpy(enc, (const unsigned char *) x, nbytes); + } +} + +/* Conversion of GF(p^2) element from Montgomery to standard representation, + * and encoding by removing leading 0 bytes */ +void fp2_encode(const f2elm_t *x, unsigned char *enc) +{ + f2elm_t t; + + from_fp2mont(x, &t); + encode_to_bytes(t.e[0], enc, S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2); + encode_to_bytes(t.e[1], enc + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2, S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2); +} + +/* Parse byte sequence back into GF(p^2) element, and conversion to Montgomery representation */ +void fp2_decode(const unsigned char *x, f2elm_t *dec) +{ + decode_to_digits(x, dec->e[0], S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2, S2N_SIKE_P434_R3_NWORDS_FIELD); + decode_to_digits(x + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2, dec->e[1], S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2, S2N_SIKE_P434_R3_NWORDS_FIELD); + to_fp2mont(dec, dec); +} + +/* Multiprecision multiplication, c = a*b mod p. */ +static void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc) +{ + dfelm_t temp = {0}; + + mp_mul(ma, mb, temp, S2N_SIKE_P434_R3_NWORDS_FIELD); + rdc_mont(temp, mc); +} + +/* Conversion to Montgomery representation, + * mc = a*R^2*R^(-1) mod p = a*R mod p, where a in [0, p-1]. + * The Montgomery constant R^2 mod p is the global value "Montgomery_R2". */ +static void to_mont(const felm_t a, felm_t mc) +{ + fpmul_mont(a, (const digit_t*)&Montgomery_R2, mc); +} + +/* Conversion from Montgomery representation to standard representation, + * c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1]. */ +static void from_mont(const felm_t ma, felm_t c) +{ + digit_t one[S2N_SIKE_P434_R3_NWORDS_FIELD] = {0}; + + one[0] = 1; + fpmul_mont(ma, one, c); + fpcorrection434(c); +} + +/* Copy wordsize digits, c = a, where lng(a) = nwords. */ +void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords) +{ + unsigned int i; + + for (i = 0; i < nwords; i++) { + c[i] = a[i]; + } +} + +/* Multiprecision squaring, c = a^2 mod p. */ +static void fpsqr_mont(const felm_t ma, felm_t mc) +{ + dfelm_t temp = {0}; + + mp_mul(ma, ma, temp, S2N_SIKE_P434_R3_NWORDS_FIELD); + rdc_mont(temp, mc); +} + +/* Copy a GF(p^2) element, c = a. */ +void fp2copy(const f2elm_t *a, f2elm_t *c) +{ + fpcopy(a->e[0], c->e[0]); + fpcopy(a->e[1], c->e[1]); +} + +/* GF(p^2) division by two, c = a/2 in GF(p^2). */ +void fp2div2(const f2elm_t *a, f2elm_t *c) +{ + fpdiv2_434(a->e[0], c->e[0]); + fpdiv2_434(a->e[1], c->e[1]); +} + +/* Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit. */ +unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) +{ + unsigned int i, carry = 0; + + for (i = 0; i < nwords; i++) { + S2N_SIKE_P434_R3_ADDC(carry, a[i], b[i], carry, c[i]); + } + + return carry; +} + +/* GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2). 
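+ * Since i^2 = -1, (a0 + a1*i)^2 = (a0^2 - a1^2) + 2*a0*a1*i, and a0^2 - a1^2 is
+ * obtained as the single product (a0+a1)*(a0-a1), so the squaring costs two
+ * field multiplications.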
+ * Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1] + * Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] */ +void fp2sqr_mont(const f2elm_t *a, f2elm_t *c) +{ + felm_t t1, t2, t3; + + mp_addfast(a->e[0], a->e[1], t1); /* t1 = a0+a1 */ + mp_sub434_p4(a->e[0], a->e[1], t2); /* t2 = a0-a1 */ + mp_addfast(a->e[0], a->e[0], t3); /* t3 = 2a0 */ + fpmul_mont(t1, t2, c->e[0]); /* c0 = (a0+a1)(a0-a1) */ + fpmul_mont(t3, a->e[1], c->e[1]); /* c1 = 2a0*a1 */ +} + +/* Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit. */ +static unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) +{ + unsigned int i, borrow = 0; + + for (i = 0; i < nwords; i++) { + S2N_SIKE_P434_R3_SUBC(borrow, a[i], b[i], borrow, c[i]); + } + + return borrow; +} + +/* Multiprecision subtraction followed by addition with p*2^S2N_SIKE_P434_R3_MAXBITS_FIELD, + * c = a-b+(p*2^S2N_SIKE_P434_R3_MAXBITS_FIELD) if a-b < 0, otherwise c=a-b. */ +__inline static void mp_subaddfast(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + mp_subadd434x2_asm(a, b, c); + return; + } +#endif + + felm_t t1; + + digit_t mask = 0 - (digit_t)mp_sub(a, b, c, 2*S2N_SIKE_P434_R3_NWORDS_FIELD); + for (int i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + t1[i] = ((const digit_t *) p434)[i] & mask; + } + mp_addfast((digit_t*)&c[S2N_SIKE_P434_R3_NWORDS_FIELD], t1, (digit_t*)&c[S2N_SIKE_P434_R3_NWORDS_FIELD]); +} + +/* Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*S2N_SIKE_P434_R3_NWORDS_FIELD. */ +__inline static void mp_dblsubfast(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + mp_dblsub434x2_asm(a, b, c); + return; + } +#endif + + mp_sub(c, a, c, 2*S2N_SIKE_P434_R3_NWORDS_FIELD); + mp_sub(c, b, c, 2*S2N_SIKE_P434_R3_NWORDS_FIELD); +} + +/* GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2). + * Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1] + * Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] */ +void fp2mul_mont(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) +{ + felm_t t1, t2; + dfelm_t tt1, tt2, tt3; + + mp_addfast(a->e[0], a->e[1], t1); /* t1 = a0+a1 */ + mp_addfast(b->e[0], b->e[1], t2); /* t2 = b0+b1 */ + mp_mul(a->e[0], b->e[0], tt1, S2N_SIKE_P434_R3_NWORDS_FIELD); /* tt1 = a0*b0 */ + mp_mul(a->e[1], b->e[1], tt2, S2N_SIKE_P434_R3_NWORDS_FIELD); /* tt2 = a1*b1 */ + mp_mul(t1, t2, tt3, S2N_SIKE_P434_R3_NWORDS_FIELD); /* tt3 = (a0+a1)*(b0+b1) */ + mp_dblsubfast(tt1, tt2, tt3); /* tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 */ + mp_subaddfast(tt1, tt2, tt1); /* tt1 = a0*b0 - a1*b1 + p*2^S2N_SIKE_P434_R3_MAXBITS_FIELD if a0*b0 - a1*b1 < 0, else tt1 = a0*b0 - a1*b1 */ + rdc_mont(tt3, c->e[1]); /* c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 */ + rdc_mont(tt1, c->e[0]); /* c[0] = a0*b0 - a1*b1 */ +} + +/* Chain to compute a^(p-3)/4 using Montgomery arithmetic. 
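+ * The fixed addition chain below raises a to the power (p434-3)/4. fpinv_mont()
+ * then squares the result twice and multiplies by a once more, giving
+ * a^(p434-2) = a^(-1) mod p434 by Fermat's little theorem.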
*/ +static void fpinv_chain_mont(felm_t a) +{ + unsigned int i, j; + felm_t t[31], tt; + + /* Precomputed table */ + fpsqr_mont(a, tt); + fpmul_mont(a, tt, t[0]); + for (i = 0; i <= 29; i++) { + fpmul_mont(t[i], tt, t[i + 1]); + } + + fpcopy(a, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[5], tt, tt); + for (i = 0; i < 10; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[14], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[3], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[23], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[13], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[7], tt, tt); + for (i = 0; i < 8; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[12], tt, tt); + for (i = 0; i < 8; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[30], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[1], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[30], tt, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[21], tt, tt); + for (i = 0; i < 9; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 9; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[19], tt, tt); + for (i = 0; i < 9; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[1], tt, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[26], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[16], tt, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[10], tt, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[0], tt, tt); + for (i = 0; i < 9; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[20], tt, tt); + for (i = 0; i < 8; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[9], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[25], tt, tt); + for (i = 0; i < 9; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[30], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[26], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(a, tt, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[28], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[10], tt, tt); + for (i = 0; i < 9; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[22], tt, tt); + for (j = 0; j < 35; j++) { + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[30], tt, tt); + } + fpcopy(tt, a); +} + +/* Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p. */ +static void fpinv_mont(felm_t a) +{ + felm_t tt; + + fpcopy(a, tt); + fpinv_chain_mont(tt); + fpsqr_mont(tt, tt); + fpsqr_mont(tt, tt); + fpmul_mont(a, tt, a); +} + +/* GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2). 
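+ * Uses the conjugate formula: since i^2 = -1, (a0 + i*a1)^(-1) = (a0 - i*a1)/(a0^2 + a1^2).
+ * The norm a0^2 + a1^2 is inverted once in GF(p), a1 is negated, and both
+ * components are then scaled by the inverted norm.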
*/ +void fp2inv_mont(f2elm_t *a) +{ + f2elm_t t1; + + fpsqr_mont(a->e[0], t1.e[0]); /* t10 = a0^2 */ + fpsqr_mont(a->e[1], t1.e[1]); /* t11 = a1^2 */ + fpadd434(t1.e[0], t1.e[1], t1.e[0]); /* t10 = a0^2+a1^2 */ + fpinv_mont(t1.e[0]); /* t10 = (a0^2+a1^2)^-1 */ + fpneg434(a->e[1]); /* a = a0-i*a1 */ + fpmul_mont(a->e[0], t1.e[0], a->e[0]); + fpmul_mont(a->e[1], t1.e[0], a->e[1]); /* a = (a0-i*a1)*(a0^2+a1^2)^-1 */ +} + +/* Conversion of a GF(p^2) element to Montgomery representation, + * mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2). */ +static void to_fp2mont(const f2elm_t *a, f2elm_t *mc) +{ + to_mont(a->e[0], mc->e[0]); + to_mont(a->e[1], mc->e[1]); +} + +/* Conversion of a GF(p^2) element from Montgomery representation to standard representation, + * c_i = ma_i*R^(-1) = a_i in GF(p^2). */ +static void from_fp2mont(const f2elm_t *ma, f2elm_t *c) +{ + from_mont(ma->e[0], c->e[0]); + from_mont(ma->e[1], c->e[1]); +} + +/* Multiprecision right shift by one. */ +void mp_shiftr1(digit_t* x, const unsigned int nwords) +{ + unsigned int i; + + for (i = 0; i < nwords-1; i++) { + S2N_SIKE_P434_R3_SHIFTR(x[i+1], x[i], 1, x[i], S2N_SIKE_P434_R3_RADIX); + } + x[nwords-1] >>= 1; +} + +void decode_to_digits(const unsigned char* x, digit_t* dec, int nbytes, int ndigits) +{ + dec[ndigits - 1] = 0; + memcpy((unsigned char*)dec, x, nbytes); + + if (is_big_endian()) { + for (int i = 0; i < ndigits; i++) { + dec[i] = S2N_SIKE_P434_R3_BSWAP_DIGIT(dec[i]); + } + } +} + +void fpcopy(const felm_t a, felm_t c) +{ + unsigned int i; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + c[i] = a[i]; + } +} + +void fpzero(felm_t a) +{ + unsigned int i; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + a[i] = 0; + } +} + +void fp2add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) +{ + fpadd434(a->e[0], b->e[0], c->e[0]); + fpadd434(a->e[1], b->e[1], c->e[1]); +} + +void fp2sub(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) +{ + fpsub434(a->e[0], b->e[0], c->e[0]); + fpsub434(a->e[1], b->e[1], c->e[1]); +} + +void mp_addfast(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + mp_add434_asm(a, b, c); + return; + } +#endif + + mp_add(a, b, c, S2N_SIKE_P434_R3_NWORDS_FIELD); +} + +void mp2_add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) +{ + mp_addfast(a->e[0], b->e[0], c->e[0]); + mp_addfast(a->e[1], b->e[1], c->e[1]); +} + +void mp2_sub_p2(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) +{ + mp_sub434_p2(a->e[0], b->e[0], c->e[0]); + mp_sub434_p2(a->e[1], b->e[1], c->e[1]); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.h new file mode 100644 index 0000000000..bce1849ce1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.h @@ -0,0 +1,65 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: core functions over GF(p) and GF(p^2) +*********************************************************************************************/ + +#pragma once + +#include <string.h> +#include "sikep434r3.h" +#include "sikep434r3_fp.h" + +#define fp2_encode S2N_SIKE_P434_R3_NAMESPACE(fp2_encode) +void fp2_encode(const f2elm_t *x, unsigned char *enc); + +#define fp2_decode S2N_SIKE_P434_R3_NAMESPACE(fp2_decode) +void fp2_decode(const unsigned char *x, f2elm_t *dec); + +#define copy_words 
S2N_SIKE_P434_R3_NAMESPACE(copy_words) +void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords); + +#define fp2copy S2N_SIKE_P434_R3_NAMESPACE(fp2copy) +void fp2copy(const f2elm_t *a, f2elm_t *c); + +#define fp2div2 S2N_SIKE_P434_R3_NAMESPACE(fp2div2) +void fp2div2(const f2elm_t *a, f2elm_t *c); + +#define mp_add S2N_SIKE_P434_R3_NAMESPACE(mp_add) +unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); + +#define fp2sqr_mont S2N_SIKE_P434_R3_NAMESPACE(fp2sqr_mont) +void fp2sqr_mont(const f2elm_t *a, f2elm_t *c); + +#define fp2mul_mont S2N_SIKE_P434_R3_NAMESPACE(fp2mul_mont) +void fp2mul_mont(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); + +#define fp2inv_mont S2N_SIKE_P434_R3_NAMESPACE(fp2inv_mont) +void fp2inv_mont(f2elm_t *a); + +#define mp_shiftr1 S2N_SIKE_P434_R3_NAMESPACE(mp_shiftr1) +void mp_shiftr1(digit_t* x, const unsigned int nwords); + +#define decode_to_digits S2N_SIKE_P434_R3_NAMESPACE(decode_to_digits) +void decode_to_digits(const unsigned char* x, digit_t* dec, int nbytes, int ndigits); + +#define fpcopy S2N_SIKE_P434_R3_NAMESPACE(fpcopy) +void fpcopy(const felm_t a, felm_t c); + +#define fpzero S2N_SIKE_P434_R3_NAMESPACE(fpzero) +void fpzero(felm_t a); + +#define fp2add S2N_SIKE_P434_R3_NAMESPACE(fp2add) +void fp2add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); + +#define fp2sub S2N_SIKE_P434_R3_NAMESPACE(fp2sub) +void fp2sub(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); + +#define mp_addfast S2N_SIKE_P434_R3_NAMESPACE(mp_addfast) +void mp_addfast(const digit_t* a, const digit_t* b, digit_t* c); + +#define mp2_add S2N_SIKE_P434_R3_NAMESPACE(mp2_add) +void mp2_add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); + +#define mp2_sub_p2 S2N_SIKE_P434_R3_NAMESPACE(mp2_sub_p2) +void mp2_sub_p2(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_kem.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_kem.c new file mode 100644 index 0000000000..b32add7723 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_kem.c @@ -0,0 +1,112 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: supersingular isogeny key encapsulation (SIKE) protocol +*********************************************************************************************/ + +#include <string.h> +#include "sikep434r3.h" +#include "sikep434r3_fips202.h" +#include "utils/s2n_safety.h" +#include "tls/s2n_kem.h" +#include "pq-crypto/s2n_pq.h" +#include "pq-crypto/s2n_pq_random.h" +#include "sikep434r3_fpx.h" +#include "sikep434r3_api.h" + +/* SIKE's key generation + * Outputs: secret key sk (S2N_SIKE_P434_R3_SECRET_KEY_BYTES = S2N_SIKE_P434_R3_MSG_BYTES + S2N_SIKE_P434_R3_SECRETKEY_B_BYTES + S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES bytes) + * public key pk (S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES bytes) */ +int s2n_sike_p434_r3_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + + /* Generate lower portion of secret key sk <- s||SK */ + POSIX_GUARD_RESULT(s2n_get_random_bytes(sk, S2N_SIKE_P434_R3_MSG_BYTES)); + POSIX_GUARD(random_mod_order_B(sk + S2N_SIKE_P434_R3_MSG_BYTES)); + + /* Generate public key pk */ + EphemeralKeyGeneration_B(sk + S2N_SIKE_P434_R3_MSG_BYTES, pk); + + /* Append public key pk to secret key sk */ + memcpy(&sk[S2N_SIKE_P434_R3_MSG_BYTES + S2N_SIKE_P434_R3_SECRETKEY_B_BYTES], pk, 
S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES); + + return S2N_SUCCESS; +} + +/* SIKE's encapsulation + * Input: public key pk (S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES bytes) + * Outputs: shared secret ss (S2N_SIKE_P434_R3_SHARED_SECRET_BYTES bytes) + * ciphertext message ct (S2N_SIKE_P434_R3_CIPHERTEXT_BYTES = S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES + S2N_SIKE_P434_R3_MSG_BYTES bytes) */ +int s2n_sike_p434_r3_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + + unsigned char ephemeralsk[S2N_SIKE_P434_R3_SECRETKEY_A_BYTES]; + unsigned char jinvariant[S2N_SIKE_P434_R3_FP2_ENCODED_BYTES]; + unsigned char h[S2N_SIKE_P434_R3_MSG_BYTES]; + unsigned char temp[S2N_SIKE_P434_R3_CIPHERTEXT_BYTES+S2N_SIKE_P434_R3_MSG_BYTES]; + + /* Generate ephemeralsk <- G(m||pk) mod oA */ + POSIX_GUARD_RESULT(s2n_get_random_bytes(temp, S2N_SIKE_P434_R3_MSG_BYTES)); + memcpy(&temp[S2N_SIKE_P434_R3_MSG_BYTES], pk, S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES); + shake256(ephemeralsk, S2N_SIKE_P434_R3_SECRETKEY_A_BYTES, temp, S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES+S2N_SIKE_P434_R3_MSG_BYTES); + ephemeralsk[S2N_SIKE_P434_R3_SECRETKEY_A_BYTES - 1] &= S2N_SIKE_P434_R3_MASK_ALICE; + + /* Encrypt */ + EphemeralKeyGeneration_A(ephemeralsk, ct); + EphemeralSecretAgreement_A(ephemeralsk, pk, jinvariant); + shake256(h, S2N_SIKE_P434_R3_MSG_BYTES, jinvariant, S2N_SIKE_P434_R3_FP2_ENCODED_BYTES); + for (int i = 0; i < S2N_SIKE_P434_R3_MSG_BYTES; i++) { + ct[i + S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES] = temp[i] ^ h[i]; + } + + /* Generate shared secret ss <- H(m||ct) */ + memcpy(&temp[S2N_SIKE_P434_R3_MSG_BYTES], ct, S2N_SIKE_P434_R3_CIPHERTEXT_BYTES); + shake256(ss, S2N_SIKE_P434_R3_SHARED_SECRET_BYTES, temp, S2N_SIKE_P434_R3_CIPHERTEXT_BYTES+S2N_SIKE_P434_R3_MSG_BYTES); + + return S2N_SUCCESS; +} + +/* SIKE's decapsulation + * Input: secret key sk (S2N_SIKE_P434_R3_SECRET_KEY_BYTES = S2N_SIKE_P434_R3_MSG_BYTES + S2N_SIKE_P434_R3_SECRETKEY_B_BYTES + S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES bytes) + * ciphertext message ct (S2N_SIKE_P434_R3_CIPHERTEXT_BYTES = S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES + S2N_SIKE_P434_R3_MSG_BYTES bytes) + * Outputs: shared secret ss (S2N_SIKE_P434_R3_SHARED_SECRET_BYTES bytes) */ +int s2n_sike_p434_r3_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + + unsigned char ephemeralsk_[S2N_SIKE_P434_R3_SECRETKEY_A_BYTES]; + unsigned char jinvariant_[S2N_SIKE_P434_R3_FP2_ENCODED_BYTES]; + unsigned char h_[S2N_SIKE_P434_R3_MSG_BYTES]; + unsigned char c0_[S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES]; + unsigned char temp[S2N_SIKE_P434_R3_CIPHERTEXT_BYTES+S2N_SIKE_P434_R3_MSG_BYTES]; + + /* Decrypt */ + EphemeralSecretAgreement_B(sk + S2N_SIKE_P434_R3_MSG_BYTES, ct, jinvariant_); + shake256(h_, S2N_SIKE_P434_R3_MSG_BYTES, jinvariant_, S2N_SIKE_P434_R3_FP2_ENCODED_BYTES); + for (int i = 0; i < S2N_SIKE_P434_R3_MSG_BYTES; i++) { + temp[i] = ct[i + S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES] ^ h_[i]; + } + + /* Generate ephemeralsk_ <- G(m||pk) mod oA */ + memcpy(&temp[S2N_SIKE_P434_R3_MSG_BYTES], &sk[S2N_SIKE_P434_R3_MSG_BYTES + S2N_SIKE_P434_R3_SECRETKEY_B_BYTES], S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES); + shake256(ephemeralsk_, S2N_SIKE_P434_R3_SECRETKEY_A_BYTES, temp, S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES+S2N_SIKE_P434_R3_MSG_BYTES); + ephemeralsk_[S2N_SIKE_P434_R3_SECRETKEY_A_BYTES - 1] &= S2N_SIKE_P434_R3_MASK_ALICE; + + /* Generate shared secret ss <- H(m||ct), or output ss <- H(s||ct) in case of ct 
verification failure */ + EphemeralKeyGeneration_A(ephemeralsk_, c0_); + + /* Verify ciphertext. + * If c0_ and ct are NOT equal, decaps failed and we overwrite the shared secret + * with pseudorandom noise (ss = H(s||ct)) by performing the copy (dont_copy = false). + * + * If c0_ and ct are equal, then decaps succeeded and we skip the overwrite and output + * the actual shared secret: ss = H(m||ct) (dont_copy = true). */ + bool dont_copy = s2n_constant_time_equals(c0_, ct, S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES); + POSIX_GUARD(s2n_constant_time_copy_or_dont(temp, sk, S2N_SIKE_P434_R3_MSG_BYTES, dont_copy)); + memcpy(&temp[S2N_SIKE_P434_R3_MSG_BYTES], ct, S2N_SIKE_P434_R3_CIPHERTEXT_BYTES); + shake256(ss, S2N_SIKE_P434_R3_SHARED_SECRET_BYTES, temp, S2N_SIKE_P434_R3_CIPHERTEXT_BYTES+S2N_SIKE_P434_R3_MSG_BYTES); + + return S2N_SUCCESS; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_sidh.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_sidh.c new file mode 100644 index 0000000000..f570e27e32 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_sidh.c @@ -0,0 +1,310 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH) +*********************************************************************************************/ + +#include "sikep434r3.h" +#include "pq-crypto/s2n_pq_random.h" +#include "utils/s2n_safety.h" +#include "sikep434r3_fpx.h" +#include "sikep434r3_ec_isogeny.h" +#include "sikep434r3_api.h" + +/* Initialization of basis points */ +static void init_basis(const digit_t *gen, f2elm_t *XP, f2elm_t *XQ, f2elm_t *XR) +{ + fpcopy(gen, XP->e[0]); + fpcopy(gen + S2N_SIKE_P434_R3_NWORDS_FIELD, XP->e[1]); + fpcopy(gen + 2*S2N_SIKE_P434_R3_NWORDS_FIELD, XQ->e[0]); + fpcopy(gen + 3*S2N_SIKE_P434_R3_NWORDS_FIELD, XQ->e[1]); + fpcopy(gen + 4*S2N_SIKE_P434_R3_NWORDS_FIELD, XR->e[0]); + fpcopy(gen + 5*S2N_SIKE_P434_R3_NWORDS_FIELD, XR->e[1]); +} + +/* Generation of Bob's secret key + * Outputs random value in [0, 2^Floor(Log(2, oB)) - 1] */ +int random_mod_order_B(unsigned char* random_digits) +{ + POSIX_GUARD_RESULT(s2n_get_random_bytes(random_digits, S2N_SIKE_P434_R3_SECRETKEY_B_BYTES)); + random_digits[S2N_SIKE_P434_R3_SECRETKEY_B_BYTES-1] &= S2N_SIKE_P434_R3_MASK_BOB; /* Masking last byte */ + + return 0; +} + +/* Alice's ephemeral public key generation + * Input: a private key PrivateKeyA in the range [0, 2^eA - 1]. + * Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded + * by removing leading 0 bytes. 
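+ * Concretely, the basis points loaded from B_gen (phiP, phiQ, phiR) are pushed
+ * through Alice's chain of 4-isogenies and the x-coordinates of their images are
+ * serialized with fp2_encode().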
+int EphemeralKeyGeneration_A(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA)
+{
+    point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE];
+    f2elm_t _XPA, _XQA, _XRA, coeff[3], _A24plus = {0}, _C24 = {0}, _A = {0};
+    f2elm_t *XPA=&_XPA, *XQA=&_XQA, *XRA=&_XRA, *A24plus=&_A24plus, *C24=&_C24, *A=&_A;
+    unsigned int i, row, m, tree_index = 0, pts_index[S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE], npts = 0, ii = 0;
+    digit_t SecretKeyA[S2N_SIKE_P434_R3_NWORDS_ORDER] = {0};
+
+    /* Initialize basis points */
+    init_basis((const digit_t*)A_gen, XPA, XQA, XRA);
+    init_basis((const digit_t*)B_gen, &phiP->X, &phiQ->X, &phiR->X);
+    fpcopy((const digit_t*)&Montgomery_one, (phiP->Z.e)[0]);
+    fpcopy((const digit_t*)&Montgomery_one, (phiQ->Z.e)[0]);
+    fpcopy((const digit_t*)&Montgomery_one, (phiR->Z.e)[0]);
+
+    /* Initialize constants: A24plus = A+2C, C24 = 4C, where A=6, C=1 */
+    fpcopy((const digit_t*)&Montgomery_one, A24plus->e[0]);
+    mp2_add(A24plus, A24plus, A24plus);
+    mp2_add(A24plus, A24plus, C24);
+    mp2_add(A24plus, C24, A);
+    mp2_add(C24, C24, A24plus);
+
+    /* Retrieve kernel point */
+    decode_to_digits(PrivateKeyA, SecretKeyA, S2N_SIKE_P434_R3_SECRETKEY_A_BYTES, S2N_SIKE_P434_R3_NWORDS_ORDER);
+    LADDER3PT(XPA, XQA, XRA, SecretKeyA, S2N_SIKE_P434_R3_ALICE, R, A);
+
+    /* Traverse tree */
+    tree_index = 0;
+    for (row = 1; row < S2N_SIKE_P434_R3_MAX_ALICE; row++) {
+        while (tree_index < S2N_SIKE_P434_R3_MAX_ALICE-row) {
+            fp2copy(&R->X, &pts[npts]->X);
+            fp2copy(&R->Z, &pts[npts]->Z);
+            pts_index[npts++] = tree_index;
+            m = strat_Alice[ii++];
+            xDBLe(R, R, A24plus, C24, (int)(2*m));
+            tree_index += m;
+        }
+        get_4_isog(R, A24plus, C24, coeff);
+
+        for (i = 0; i < npts; i++) {
+            eval_4_isog(pts[i], coeff);
+        }
+        eval_4_isog(phiP, coeff);
+        eval_4_isog(phiQ, coeff);
+        eval_4_isog(phiR, coeff);
+
+        fp2copy(&pts[npts-1]->X, &R->X);
+        fp2copy(&pts[npts-1]->Z, &R->Z);
+        tree_index = pts_index[npts-1];
+        npts -= 1;
+    }
+
+    get_4_isog(R, A24plus, C24, coeff);
+    eval_4_isog(phiP, coeff);
+    eval_4_isog(phiQ, coeff);
+    eval_4_isog(phiR, coeff);
+
+    inv_3_way(&phiP->Z, &phiQ->Z, &phiR->Z);
+    fp2mul_mont(&phiP->X, &phiP->Z, &phiP->X);
+    fp2mul_mont(&phiQ->X, &phiQ->Z, &phiQ->X);
+    fp2mul_mont(&phiR->X, &phiR->Z, &phiR->X);
+
+    /* Format public key */
+    fp2_encode(&phiP->X, PublicKeyA);
+    fp2_encode(&phiQ->X, PublicKeyA + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+    fp2_encode(&phiR->X, PublicKeyA + 2*S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+
+    return 0;
+}
+
+/* Bob's ephemeral public key generation
+ * Input:  a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1].
+ * Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded
+ *         by removing leading 0 bytes. */
+int EphemeralKeyGeneration_B(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB)
+{
+    point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB];
+    f2elm_t _XPB, _XQB, _XRB, coeff[3], _A24plus = {0}, _A24minus = {0}, _A = {0};
+    f2elm_t *XPB=&_XPB, *XQB=&_XQB, *XRB=&_XRB, *A24plus=&_A24plus, *A24minus=&_A24minus, *A=&_A;
+
+    unsigned int i, row, m, tree_index = 0, pts_index[S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB], npts = 0, ii = 0;
+    digit_t SecretKeyB[S2N_SIKE_P434_R3_NWORDS_ORDER] = {0};
+
+    /* Initialize basis points */
+    init_basis((const digit_t*)B_gen, XPB, XQB, XRB);
+    init_basis((const digit_t*)A_gen, &phiP->X, &phiQ->X, &phiR->X);
+    fpcopy((const digit_t*)&Montgomery_one, (phiP->Z.e)[0]);
+    fpcopy((const digit_t*)&Montgomery_one, (phiQ->Z.e)[0]);
+    fpcopy((const digit_t*)&Montgomery_one, (phiR->Z.e)[0]);
+
+    /* Initialize constants: A24minus = A-2C, A24plus = A+2C, where A=6, C=1 */
+    fpcopy((const digit_t*)&Montgomery_one, A24plus->e[0]);
+    mp2_add(A24plus, A24plus, A24plus);
+    mp2_add(A24plus, A24plus, A24minus);
+    mp2_add(A24plus, A24minus, A);
+    mp2_add(A24minus, A24minus, A24plus);
+
+    /* Retrieve kernel point */
+    decode_to_digits(PrivateKeyB, SecretKeyB, S2N_SIKE_P434_R3_SECRETKEY_B_BYTES, S2N_SIKE_P434_R3_NWORDS_ORDER);
+    LADDER3PT(XPB, XQB, XRB, SecretKeyB, S2N_SIKE_P434_R3_BOB, R, A);
+
+    /* Traverse tree */
+    tree_index = 0;
+    for (row = 1; row < S2N_SIKE_P434_R3_MAX_BOB; row++) {
+        while (tree_index < S2N_SIKE_P434_R3_MAX_BOB-row) {
+            fp2copy(&R->X, &pts[npts]->X);
+            fp2copy(&R->Z, &pts[npts]->Z);
+            pts_index[npts++] = tree_index;
+            m = strat_Bob[ii++];
+            xTPLe(R, R, A24minus, A24plus, (int)m);
+            tree_index += m;
+        }
+        get_3_isog(R, A24minus, A24plus, coeff);
+
+        for (i = 0; i < npts; i++) {
+            eval_3_isog(pts[i], coeff);
+        }
+        eval_3_isog(phiP, coeff);
+        eval_3_isog(phiQ, coeff);
+        eval_3_isog(phiR, coeff);
+
+        fp2copy(&pts[npts-1]->X, &R->X);
+        fp2copy(&pts[npts-1]->Z, &R->Z);
+        tree_index = pts_index[npts-1];
+        npts -= 1;
+    }
+
+    get_3_isog(R, A24minus, A24plus, coeff);
+    eval_3_isog(phiP, coeff);
+    eval_3_isog(phiQ, coeff);
+    eval_3_isog(phiR, coeff);
+
+    inv_3_way(&phiP->Z, &phiQ->Z, &phiR->Z);
+    fp2mul_mont(&phiP->X, &phiP->Z, &phiP->X);
+    fp2mul_mont(&phiQ->X, &phiQ->Z, &phiQ->X);
+    fp2mul_mont(&phiR->X, &phiR->Z, &phiR->X);
+
+    /* Format public key */
+    fp2_encode(&phiP->X, PublicKeyB);
+    fp2_encode(&phiQ->X, PublicKeyB + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+    fp2_encode(&phiR->X, PublicKeyB + 2*S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+
+    return 0;
+}
+
+/* Alice's ephemeral shared secret computation
+ * It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB
+ * Inputs: Alice's PrivateKeyA is an integer in the range [0, oA-1].
+ *         Bob's PublicKeyB consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes.
+ * Output: a shared secret SharedSecretA that consists of one element in GF(p^2) encoded
+ *         by removing leading 0 bytes. */
+int EphemeralSecretAgreement_A(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB,
+                               unsigned char* SharedSecretA)
+{
+    point_proj_t R, pts[S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE];
+    f2elm_t coeff[3], PKB[3], _jinv;
+    f2elm_t _A24plus = {0}, _C24 = {0}, _A = {0};
+    f2elm_t *jinv=&_jinv, *A24plus=&_A24plus, *C24=&_C24, *A=&_A;
+    unsigned int i, row, m, tree_index = 0, pts_index[S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE], npts = 0, ii = 0;
+    digit_t SecretKeyA[S2N_SIKE_P434_R3_NWORDS_ORDER] = {0};
+
+    /* Initialize images of Bob's basis */
+    fp2_decode(PublicKeyB, &PKB[0]);
+    fp2_decode(PublicKeyB + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES, &PKB[1]);
+    fp2_decode(PublicKeyB + 2*S2N_SIKE_P434_R3_FP2_ENCODED_BYTES, &PKB[2]);
+
+    /* Initialize constants: A24plus = A+2C, C24 = 4C, where C=1 */
+    get_A(&PKB[0], &PKB[1], &PKB[2], A);
+    mp_add((const digit_t*)&Montgomery_one, (const digit_t*)&Montgomery_one, C24->e[0], S2N_SIKE_P434_R3_NWORDS_FIELD);
+    mp2_add(A, C24, A24plus);
+    mp_add(C24->e[0], C24->e[0], C24->e[0], S2N_SIKE_P434_R3_NWORDS_FIELD);
+
+    /* Retrieve kernel point */
+    decode_to_digits(PrivateKeyA, SecretKeyA, S2N_SIKE_P434_R3_SECRETKEY_A_BYTES, S2N_SIKE_P434_R3_NWORDS_ORDER);
+    LADDER3PT(&PKB[0], &PKB[1], &PKB[2], SecretKeyA, S2N_SIKE_P434_R3_ALICE, R, A);
+
+    /* Traverse tree */
+    tree_index = 0;
+    for (row = 1; row < S2N_SIKE_P434_R3_MAX_ALICE; row++) {
+        while (tree_index < S2N_SIKE_P434_R3_MAX_ALICE-row) {
+            fp2copy(&R->X, &pts[npts]->X);
+            fp2copy(&R->Z, &pts[npts]->Z);
+            pts_index[npts++] = tree_index;
+            m = strat_Alice[ii++];
+            xDBLe(R, R, A24plus, C24, (int)(2*m));
+            tree_index += m;
+        }
+        get_4_isog(R, A24plus, C24, coeff);
+
+        for (i = 0; i < npts; i++) {
+            eval_4_isog(pts[i], coeff);
+        }
+
+        fp2copy(&pts[npts-1]->X, &R->X);
+        fp2copy(&pts[npts-1]->Z, &R->Z);
+        tree_index = pts_index[npts-1];
+        npts -= 1;
+    }
+
+    get_4_isog(R, A24plus, C24, coeff);
+    mp2_add(A24plus, A24plus, A24plus);
+    fp2sub(A24plus, C24, A24plus);
+    fp2add(A24plus, A24plus, A24plus);
+    j_inv(A24plus, C24, jinv);
+    fp2_encode(jinv, SharedSecretA);    /* Format shared secret */
+
+    return 0;
+}
+
+/* Bob's ephemeral shared secret computation
+ * It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA
+ * Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,oB)) - 1].
+ *         Alice's PublicKeyA consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes.
+ * Output: a shared secret SharedSecretB that consists of one element in GF(p^2) encoded
+ *         by removing leading 0 bytes. */
+int EphemeralSecretAgreement_B(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA,
+                               unsigned char* SharedSecretB)
+{
+    point_proj_t R, pts[S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB];
+    f2elm_t coeff[3], PKB[3], _jinv;
+    f2elm_t _A24plus = {0}, _A24minus = {0}, _A = {0};
+    f2elm_t *jinv=&_jinv, *A24plus=&_A24plus, *A24minus=&_A24minus, *A=&_A;
+    unsigned int i, row, m, tree_index = 0, pts_index[S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB], npts = 0, ii = 0;
+    digit_t SecretKeyB[S2N_SIKE_P434_R3_NWORDS_ORDER] = {0};
+
+    /* Initialize images of Alice's basis */
+    fp2_decode(PublicKeyA, &PKB[0]);
+    fp2_decode(PublicKeyA + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES, &PKB[1]);
+    fp2_decode(PublicKeyA + 2*S2N_SIKE_P434_R3_FP2_ENCODED_BYTES, &PKB[2]);
+
+    /* Initialize constants: A24plus = A+2C, A24minus = A-2C, where C=1 */
+    get_A(&PKB[0], &PKB[1], &PKB[2], A);
+    mp_add((const digit_t*)&Montgomery_one, (const digit_t*)&Montgomery_one, A24minus->e[0], S2N_SIKE_P434_R3_NWORDS_FIELD);
+    mp2_add(A, A24minus, A24plus);
+    mp2_sub_p2(A, A24minus, A24minus);
+
+    /* Retrieve kernel point */
+    decode_to_digits(PrivateKeyB, SecretKeyB, S2N_SIKE_P434_R3_SECRETKEY_B_BYTES, S2N_SIKE_P434_R3_NWORDS_ORDER);
+    LADDER3PT(&PKB[0], &PKB[1], &PKB[2], SecretKeyB, S2N_SIKE_P434_R3_BOB, R, A);
+
+    /* Traverse tree */
+    tree_index = 0;
+    for (row = 1; row < S2N_SIKE_P434_R3_MAX_BOB; row++) {
+        while (tree_index < S2N_SIKE_P434_R3_MAX_BOB-row) {
+            fp2copy(&R->X, &pts[npts]->X);
+            fp2copy(&R->Z, &pts[npts]->Z);
+            pts_index[npts++] = tree_index;
+            m = strat_Bob[ii++];
+            xTPLe(R, R, A24minus, A24plus, (int)m);
+            tree_index += m;
+        }
+        get_3_isog(R, A24minus, A24plus, coeff);
+
+        for (i = 0; i < npts; i++) {
+            eval_3_isog(pts[i], coeff);
+        }
+
+        fp2copy(&pts[npts-1]->X, &R->X);
+        fp2copy(&pts[npts-1]->Z, &R->Z);
+        tree_index = pts_index[npts-1];
+        npts -= 1;
+    }
+
+    get_3_isog(R, A24minus, A24plus, coeff);
+    fp2add(A24plus, A24minus, A);
+    fp2add(A, A, A);
+    fp2sub(A24plus, A24minus, A24plus);
+    j_inv(A, A24plus, jinv);
+    fp2_encode(jinv, SharedSecretB);    /* Format shared secret */
+
+    return 0;
+}
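
Editor's note (not part of the diff): the enc/dec entry points above follow the usual KEM calling convention, so a round trip can be sketched as below. This is a minimal sketch, assuming the matching keypair entry point is named s2n_sike_p434_r3_crypto_kem_keypair(pk, sk), that the prototypes and S2N_SIKE_P434_R3_*_BYTES sizes are visible via sikep434r3_api.h, and that PQ support is compiled in; it is illustrative only, not code from this commit.

/* Hypothetical KEM round trip using the functions added in this diff.
 * s2n_sike_p434_r3_crypto_kem_keypair is an assumed name; the size
 * constants and the enc/dec prototypes appear in the diff above. */
#include <string.h>
#include "sikep434r3_api.h"

int sike_round_trip_demo(void)
{
    unsigned char pk[S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES];
    unsigned char sk[S2N_SIKE_P434_R3_SECRET_KEY_BYTES];
    unsigned char ct[S2N_SIKE_P434_R3_CIPHERTEXT_BYTES];
    unsigned char ss_enc[S2N_SIKE_P434_R3_SHARED_SECRET_BYTES];
    unsigned char ss_dec[S2N_SIKE_P434_R3_SHARED_SECRET_BYTES];

    /* 1. Generate a key pair (assumed entry point). */
    if (s2n_sike_p434_r3_crypto_kem_keypair(pk, sk) != 0) { return -1; }

    /* 2. Encapsulate against the public key: produces ct and the sender's ss. */
    if (s2n_sike_p434_r3_crypto_kem_enc(ct, ss_enc, pk) != 0) { return -1; }

    /* 3. Decapsulate with the secret key: recovers the same ss on success. */
    if (s2n_sike_p434_r3_crypto_kem_dec(ss_dec, ct, sk) != 0) { return -1; }

    /* Demo-only comparison; production callers should compare secrets in constant time. */
    return (memcmp(ss_enc, ss_dec, sizeof(ss_enc)) == 0) ? 0 : -1;
}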
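
Editor's note (not part of the diff): decapsulation above implements the Fujisaki-Okamoto implicit rejection via s2n_constant_time_equals and s2n_constant_time_copy_or_dont, so that a bad ciphertext yields ss = H(s||ct) without a secret-dependent branch. The following standalone sketch illustrates the "copy or don't" pattern generically; it is not s2n's implementation, and copy_or_dont_sketch is a hypothetical helper named here only for illustration.

/* Sketch of a branch-free conditional overwrite: when dont_copy is false every
 * byte of dst is replaced by src, when dont_copy is true dst is left unchanged,
 * and the same instructions execute in both cases. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static void copy_or_dont_sketch(uint8_t *dst, const uint8_t *src, size_t len, bool dont_copy)
{
    uint8_t keep = (uint8_t)dont_copy;      /* 1 = keep dst, 0 = overwrite with src */
    uint8_t mask = (uint8_t)(keep - 1u);    /* 0x00 when keeping, 0xFF when overwriting */
    for (size_t i = 0; i < len; i++) {
        dst[i] = (uint8_t)((dst[i] & (uint8_t)~mask) | (src[i] & mask));
    }
}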